From 9e7a475948515b92b27041bb1cea13116272a706 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 9 Jul 2023 14:29:45 +0200 Subject: [PATCH 01/88] Start sketching out the scaled dot-product attention custom op Currently this is not a HLSCustomOp, but a QONNX CustomOp. Implemented are first operator attributes, ONNX graph/model construction and a rather improvised python mode node execution for debugging. --- src/finn/custom_op/fpgadataflow/__init__.py | 3 + src/finn/custom_op/fpgadataflow/attention.py | 355 ++++++++++++++++++ .../test_fpgadataflow_attention.py | 104 +++++ 3 files changed, 462 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/attention.py create mode 100644 tests/fpgadataflow/test_fpgadataflow_attention.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index aed2ab7fe1..9624710dca 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -55,6 +55,8 @@ from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention + custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get @@ -81,3 +83,4 @@ custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour +custom_op["ScaledDotProductAttention"] = ScaledDotProductAttention diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py new file mode 100644 index 0000000000..4e3c7386a2 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -0,0 +1,355 @@ +# Python warning subsystem +import warnings +# Numpy math and arrays +import numpy as np +# Derive custom operators form the FINN base custom op +# from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +# Temporarily derive custom operators from QONNX base custom op +# TODO: Remove once switching to HLSCustomOp +from qonnx.custom_op.base import CustomOp + + +# Scaled Dot-Product Attention Custom Operator +# Note: Single head attention +class ScaledDotProductAttention(CustomOp): + # Returns a dict of permitted attributes for the node + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = {} # super().get_nodeattr_types() + # Update attributes dictionary for new custom operator + attrs.update({ + # Shared query and key and value embedding dimension + "qk_dim": ("i", True, 0), + "v_dim": ("i", True, 0), + # Shared key and value and query sequence length + "kv_len": ("i", True, 0), + "q_len": ("i", True, 0), + # Datatypes of inputs and outputs + "q_dtype": ("s", True, ""), + "k_dtype": ("s", True, ""), + "v_dtype": ("s", True, ""), + "o_dtype": ("s", True, ""), + # Mode used for providing the attention mask + # There can be no mask, a mask sent as the fourth input or a causal + # attention mask which is generated by the operator itself. 
+ "mask_mode": ("s", True, "none", {"none", "input", "causal"}), + # Datatype of the attention mask (if there is a mask) + "mask_dtype": ("s", False, ""), + # Input (SIMD) and output (PE) parallelism + "SIMD": ("i", True, 0), + "PE": ("i", True, 0), + # Execution mode of the operator + # TODO: Remove once switching to HLSCustomOp + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + }) + # Return updated attribute dictionary + return attrs + + # Validates the attention shape constraints on queries, keys and values + @staticmethod + def assert_shape_constraints(q, k, v, mask=None, pe=None, simd=None): + # Queries and keys must match in embedding (second) dimension + assert q.shape[1] == k.shape[1], \ + "Queries and Keys must have matching embedding dimension" + # Keys and values must have matching sequence length (first) dimension + # Note: Lifting the restriction of q matching as well allows for cross + # attention using the same operator. + assert k.shape[0] == v.shape[0], \ + "Keys and Values must have matching sequence length" + + # If the mask is provided, it must have a shape matching the query and + # key product shape, i.e. the shape of the attention matrix + if mask is not None and not isinstance(mask, str): + # Compare mask shape against attention matrix shape + assert mask.shape == (q.shape[0], k.shape[0]), \ + "Mask shape must match the shape of the attention matrix" + + # If specified, validate PE shape constraints as well + if pe is not None: + # PE operates along the sequence length dimension of the keys + assert k.shape[0] % pe == 0, \ + "Key sequence length must be divisible by PE" + # PE operates along the embedding dimension of the values + assert v.shape[1] % pe == 0, \ + "Value embedding dimension must be divisible by PE" + + # If specified, validate SIMD shape constraints as well + if simd is not None: + # SIMD operates along the shared embedding dimension of the queries + # and the keys + assert q.shape[1] % simd == 0, \ + "Query and Key embedding dimension must be divisible by SIMD" + # SIMD operates along the sequence length dimension of the values + assert v.shape[0] % simd == 0, \ + "Value sequence length must be divisible by SIMD" + + # Generates a dummy node matching the shapes of the input numpy arrays + @staticmethod + def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): + # Utility types and function for creating onnx nodes and graphs + from onnx import TensorProto, helper + # Utility for creating and wrapping qonnx graphs and models + from qonnx.util.basic import qonnx_make_model + from qonnx.core.modelwrapper import ModelWrapper + + # Convert unspecified mask to 'none' mode + mask = 'none' if mask is None else mask + # Validate all shape constraints first + ScaledDotProductAttention.assert_shape_constraints( + q, k, v, mask, pe, simd + ) + + # Start building the node as a dictionary of attributes + node_kwargs = { + # Refer to this operator type by its name + "op_type": "ScaledDotProductAttention", + # Execution will try to look up the implementation in the package + # referred to by the domain + # Note: the op_type should be registered as a custom op within the + # domain package + "domain": "finn.custom_op.fpgadataflow", + # Execution backend + # Note: Required attribute inherited from HLSCustomOp + "backend": "fpgadataflow", + # Configuration of input parallelism + "SIMD": simd, + # Configuration of output parallelism + "PE": pe + } + + # Infer the output shape from the input shapes + o_shape = (q.shape[0], v.shape[1]) 
+ + # Create onnx value info of all inputs and outputs assuming float + # datatypes + q_info = helper.make_tensor_value_info("q", TensorProto.FLOAT, q.shape) + k_info = helper.make_tensor_value_info("k", TensorProto.FLOAT, k.shape) + v_info = helper.make_tensor_value_info("v", TensorProto.FLOAT, v.shape) + o_info = helper.make_tensor_value_info("o", TensorProto.FLOAT, o_shape) + + # Collect input and output nodes in order + inputs, outputs = [q_info, k_info, v_info], [o_info] + + # Collect all inputs/outputs to the operator node + io_kwargs = { + "inputs": ["q", "k", "v"], "outputs": ["o"], "mask_mode": "none" + } + + # Start building the shape attributes + shape_kwargs = { + # Shared embedding dimension of the queries and keys and embedding + # dimension of the values + "qk_dim": q.shape[1], "v_dim": v.shape[1], + # Shared sequence length of keys and values and sequence length of + # the queries + "kv_len": k.shape[0], "q_len": q.shape[0], + } + + # Start building the datatype attributes + dtype_kwargs = { + # Datatypes of the query, key, value inputs and the output + "q_dtype": "FLOAT32", "k_dtype": "FLOAT32", + "v_dtype": "FLOAT32", "o_dtype": "FLOAT32", + } + + # If the optional mask is specified as an input + if isinstance(mask, np.ndarray) or mask == "input": + # Add the mask to the input node names + io_kwargs["inputs"].append("mask") + # Configure masking mode via io_kwargs as well + io_kwargs["mask_mode"] = "input" + # Always infer the mask shape + mask_shape = (q.shape[0], k.shape[0]) + # Create value info of the mask input + mask_info = helper.make_tensor_value_info( + "mask", TensorProto.FLOAT, mask_shape + ) + # Append the mask input as fourth input node + inputs.append(mask_info) + # Add the mask default datatype to the datatype attributes + dtype_kwargs["mask_dtype"] = "FLOAT32" + + # If a causal mask is to be generated during execution + if mask == "causal": + # Configure masking mode via io_kwargs as well + io_kwargs["mask_mode"] = "causal" + # Add the mask default datatype to the datatype attributes + dtype_kwargs["mask_dtype"] = "FLOAT32" + + # The optional dtypes keyword arguments must describe a subset of the + # model inputs and outputs + assert set(dtypes) <= {*dtype_kwargs, "mask_dtype"}, \ + "Specified datatype of unknown input or output" + + # Update the datatype attributes according to the keyword arguments + dtype_kwargs.update({ + key: value.name for key, value in dtypes.items() + }) + + # Create an onnx graph node by unpacking all prepared keyword arguments + node = helper.make_node( + **node_kwargs, **io_kwargs, **shape_kwargs, **dtype_kwargs + ) + # Create a graph out of the operator node and the input/output nodes + graph = helper.make_graph( + [node], inputs=inputs, outputs=outputs, name='attention_graph' + ) + # Wrap the graph in a qonnx model wrapper + model = ModelWrapper(qonnx_make_model( + graph, producer_name='attention-model' + )) + + # Add datatype annotations to all input tensors + for tensor_name in io_kwargs["inputs"]: + # Only annotate if a datatype is specified + if f'{tensor_name}_dtype' in dtypes: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, dtypes[f'{tensor_name}_dtype'] + ) + + # Add datatype annotations to all output tensors + for tensor_name in io_kwargs["outputs"]: + # Only annotate if a datatype is specified + if f'{tensor_name}_dtype' in dtypes: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, dtypes[f'{tensor_name}_dtype'] + ) + + # Return the constructed qonnx model 
wrapper + return model + + # Returns an ONNX node that has the same shape inference behavior + def make_shape_compatible_op(self, model): + # Infer the output shape from the input shapes + o_shape = (self.get_nodeattr("q_len"), self.get_nodeattr("v_dim")) + # Constant operation producing output of given shape + return super().make_const_shape_op(o_shape) + + # Infers the output data types and updates the input datatypes of the node + def infer_node_datatype(self, model): + # ONNX graph node of the operator + node = self.onnx_node + + # Get input datatypes from model for query, key, value nodes in order + q_dtype = model.get_tensor_datatype(node.input[0]) + k_dtype = model.get_tensor_datatype(node.input[1]) + v_dtype = model.get_tensor_datatype(node.input[2]) + + # Test for changing query input datatype + if q_dtype != self.get_nodeattr("q_dtype"): + # Issue a warning message + warnings.warn("q_dtype changing for %s: %s -> %s " % ( + node.name, + str(self.get_nodeattr("q_dtype")), + str(q_dtype), + )) + # Test for changing key input datatype + if k_dtype != self.get_nodeattr("k_dtype"): + # Issue a warning message + warnings.warn("k_dtype changing for %s: %s -> %s " % ( + node.name, + str(self.get_nodeattr("k_dtype")), + str(k_dtype), + )) + # Test for changing value input datatype + if v_dtype != self.get_nodeattr("v_dtype"): + # Issue a warning message + warnings.warn("v_dtype changing for %s: %s -> %s " % ( + node.name, + str(self.get_nodeattr("v_dtype")), + str(v_dtype), + )) + + # Update the node datatype attributes + self.set_nodeattr("q_dtype", q_dtype.name) + self.set_nodeattr("k_dtype", k_dtype.name) + self.set_nodeattr("v_dtype", v_dtype.name) + + # Attention mask might be provided as an input as well + if self.get_nodeattr("mask_mode") == "input": + # Get the datatype attribute of the attention mask + # Note: Optional mask will be provided as the fourth input + mask_dtype = model.get_tensor_datatype(node.input[3]) + # Test for changing mask input datatype + if mask_dtype != self.get_nodeattr("mask_dtype"): + # Issue a warning message + warnings.warn("mask_dtype changing for %s: %s -> %s " % ( + node.name, + str(self.get_nodeattr("mask_dtype")), + str(mask_dtype), + )) + # Update the node datatype attribute of the attention mask + self.set_nodeattr("mask_dtype", mask_dtype.namke) + + # Set the model output datatype + model.set_tensor_datatype(node.output[0], self.get_nodeattr('o_dtype')) + + # Executes the node + def execute_node(self, context, graph): + # Get the mode to use for execution + mode = self.get_nodeattr("exec_mode") + + # Support python execution mode for now + # TODO: Remove python mode once switching to HLSCustomOp + if mode == "python": + # Numpy compatible softmax implementation + from scipy.special import softmax + # Generate random input data for testing + from qonnx.util.basic import gen_finn_dt_tensor, DataType + + # Read input tensors of the query, key and value inputs from context + q = context[self.onnx_node.input[0]] + k = context[self.onnx_node.input[1]] + v = context[self.onnx_node.input[2]] + # Get the shared embedding dimension of queries and keys + d = self.get_nodeattr('qk_dim') + # Start with zero mask + mask = 0 + # The actual attention mask may be provided as the fourth input + if self.get_nodeattr("mask_mode") == "input": + # Get the mask tensor from the execution context + mask = context[self.onnx_node.input[3]] + # Another option is to generate a causal attention mask on the fly + elif self.get_nodeattr("mask_mode") == "causal": + # Get the 
datatype of the attention mask + mask_dtype = DataType[self.get_nodeattr("mask_dtype")] + # Start with an all zero attention mask + mask = 0 * gen_finn_dt_tensor( + mask_dtype, (q.shape[0], k.shape[0]) + ) + # Generate upper triangular causal attention mask + mask[np.triu_indices_from(mask, 1)] = - np.inf + # Compute the attention matrix between queries and keys + attention = softmax(q @ k.T * (d ** -0.5) + mask, axis=-1) + # Compute product of attention weights and value input + o = attention @ v + # Get the name of the output + o_name = self.onnx_node.output[0] + # Save the output tensor to the execution context + context[o_name] = o + # CPP Simulation of the HLS operator + elif mode == "cppsim": + # TODO: Implement cppsim mode + raise NotImplementedError( + "exec_mode cppsim is not implemented yet!" + ) + # RTL Simulation of the HLS operator + elif mode == "rtlsim": + # TODO: Implement rtlsim mode + raise NotImplementedError( + "exec_mode rtlsim is not implemented yet!" + ) + # All other modes are unsupported + else: + raise Exception( + """ + Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim") + """.format(mode) + ) + + # Optional node verification + def verify_node(self): + pass diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py new file mode 100644 index 0000000000..0107beea5c --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -0,0 +1,104 @@ +# Testing framework +import pytest + +# Use numpy for python execution / computing the ground truth expected values +import numpy as np +# Numpy compatible implementation of the softmax operation +from scipy.special import softmax + +# Generate random input data for testing +from qonnx.util.basic import gen_finn_dt_tensor, DataType +# Execute onnx model graphs +from qonnx.core.onnx_exec import execute_onnx +# Attention operator to test +from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention +# Graphs transformation setting the execution mode attribute +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + + +# Size of query and key embedding dimension +@pytest.mark.parametrize("qk_dim", [64]) +# Size of value embedding dimension +@pytest.mark.parametrize("v_dim", [64]) +# Length of key and value sequences +@pytest.mark.parametrize("kv_len", [256]) +# Length of query sequence +@pytest.mark.parametrize("q_len", [256]) +# Different modes to provide a mask +@pytest.mark.parametrize("mask", ["none", "input", "causal"]) +# Output parallelism +@pytest.mark.parametrize("pe", [1]) +# Input parallelism +@pytest.mark.parametrize("simd", [1]) +# Datatypes of queries, keys and values, mask and output +@pytest.mark.parametrize("q_dtype", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("k_dtype", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("v_dtype", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("mask_dtype", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("o_dtype", [DataType["FLOAT32"]]) +# Tests python implementation of single scaled dot-product attention head +def test_attention_python( + qk_dim, v_dim, kv_len, q_len, mask, pe, simd, q_dtype, k_dtype, v_dtype, + mask_dtype, o_dtype +): + # Generate random input data + q = gen_finn_dt_tensor(q_dtype, (q_len, qk_dim)) + k = gen_finn_dt_tensor(k_dtype, (kv_len, qk_dim)) + v = gen_finn_dt_tensor(v_dtype, (kv_len, v_dim)) + + dtypes = { + # Datatypes of the query, key, value inputs and the output + 
"q_dtype": q_dtype, "k_dtype": k_dtype, + "v_dtype": v_dtype, "o_dtype": o_dtype, + } + + # Generate the operator matching the configuration + model = ScaledDotProductAttention.make_modelwrapper_like( + q, k, v, mask, pe, simd, **dtypes, mask_dtype=mask_dtype + ) + + # Generate random input mask if the operator expects the mask as fourth + # input + if mask == "input": + mask = gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + # If a causal attention mask is requested, generate upper triangular matrix + elif mask == "causal": + # Start zero initialized mask + mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + # Fill upper triangular causal attention mask + mask[np.triu_indices_from(mask, 1)] = - np.inf + # No mask input requested + elif mask == "none": + # No mask is equivalent to a zero mask + mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + + # Prepare execution context + context = { + "q": q, "k": k, "v": v, "mask": mask + } + # Set model execution mode to python (numpy execution) + model = model.transform(SetExecMode("python")) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["o"] + + # Compute the attention matrix between queries and keys + attention = softmax(q @ k.T * (qk_dim ** -0.5) + mask, axis=-1) + # Compute product of attention weights and value input + o_expected = attention @ v + + # Test whether the expectation and the onnx model output match + assert (o_produced == o_expected).all(), "python exec failed" # noqa + + +# This is a fpgadataflow type of test +@pytest.mark.fpgadataflow +# Tests cpp simulation of single scaled dot-product attention head +def test_fpgadataflow_attention_cppsim(): + pass + + +# This is a fpgadataflow type of test +@pytest.mark.fpgadataflow +# Tests rtl simulation of single scaled dot-product attention head +def test_fpgadataflow_attention_rtlsim(): + pass From 7f9733272a5e3f1b3995b7bb4706778e10b9ad35 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 10 Jul 2023 09:51:18 +0200 Subject: [PATCH 02/88] [Attention] Add __init__ method to custom op --- src/finn/custom_op/fpgadataflow/attention.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 4e3c7386a2..ef6e5ca3dc 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -12,6 +12,11 @@ # Scaled Dot-Product Attention Custom Operator # Note: Single head attention class ScaledDotProductAttention(CustomOp): + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + # Returns a dict of permitted attributes for the node def get_nodeattr_types(self): # Start from parent operator class attributes From e77ad2be69865c2b6c0f7a066506524ed1128247 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 10 Jul 2023 13:57:18 +0200 Subject: [PATCH 03/88] [Attention] Add datatype and shape queries to custom op --- src/finn/custom_op/fpgadataflow/attention.py | 93 +++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index ef6e5ca3dc..75d6fd1c36 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -8,6 +8,9 @@ # TODO: Remove once 
switching to HLSCustomOp from qonnx.custom_op.base import CustomOp +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType + # Scaled Dot-Product Attention Custom Operator # Note: Single head attention @@ -302,7 +305,7 @@ def execute_node(self, context, graph): # Numpy compatible softmax implementation from scipy.special import softmax # Generate random input data for testing - from qonnx.util.basic import gen_finn_dt_tensor, DataType + from qonnx.util.basic import gen_finn_dt_tensor # Read input tensors of the query, key and value inputs from context q = context[self.onnx_node.input[0]] @@ -358,3 +361,91 @@ def execute_node(self, context, graph): # Optional node verification def verify_node(self): pass + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # Ordered list of names of allowed inputs + inputs = ["q", "k", "v"] + # If the attention mask is provided as input, it has a type as well + if self.get_nodeattr("mask_mode") == "input": + # The mask type is an attribute itself + inputs += ["mask"] + # Look up datatype name in attributes and convert to DataType + return DataType[self.get_nodeattr(f"{inputs[ind]}_dtype")] + + # Gets the datatype of the output (at index ind, but there is just one) + def get_output_datatype(self, ind=0): + # Ordered list of names of allowed outputs + outputs = ["o"] + # Look up datatype name in attributes and convert to DataType + return DataType[self.get_nodeattr(f"{outputs[ind]}_dtype")] + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # List shapes of inputs in order + inputs_shapes = [ + # Query input sequence + (self.get_nodeattr("q_len"), self.get_nodeattr("qk_dim")), + # Key input sequence + (self.get_nodeattr("kv_len"), self.get_nodeattr("kv_dim")), + # Value input sequence + (self.get_nodeattr("kv_len"), self.get_nodeattr("v_dim")), + ] + # If the attention mask is provided as input, it has a shape as well + if self.get_nodeattr("mask_mode") == "input": + # Mask shape is inferred from query and key sequence lengths + inputs_shapes += [ + (self.get_nodeattr("q_len"), self.get_nodeattr("kv_len")) + ] + # Get the shape by indexing into the ordered list of all inputs + return inputs_shapes[ind] + + # Gets the shape of the output at index ind (there is just one) without + # folding + def get_normal_output_shape(self, ind=0): # noqa, there is just one output + # The output shape is inferred from the length of the query sequence and + # the embedding dimension of the values + return tuple((self.get_nodeattr("q_len"), self.get_nodeattr("v_dim"))) + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # Get the unfolded size of the input + t, d = self.get_normal_input_shape(ind) + # Get the amount of input (SIMD) and output (PE) parallelism + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") # TODO: What about this? 
+ + # The query (first) and key (second) inputs are treated the same and + # merely differ in buffering requirements + if ind == 0 or ind == 1: + # Fold the input along the embedding dimension + sf = d // simd + # New shape with SIMD elements as the last dimension + return tuple((t, sf, simd)) + # For the value (third) inputs the axes flip and simd/pe change roles + if ind == 2: + # Fold the input along the sequence length dimension + sf = t // simd + # New shape with SIMD elements as the last dimension + return tuple((sf, d, simd)) + # If the mask is provided as input, it is folded as well + if ind == 3 and self.get_nodeattr("mask_mode") == "input": + # The mask is folded along the second dimension which is actually a + # sequence length as well. It might be confusing to call it d here. + sf = d // simd + # New shape with SIMD elements as the last dimension + return tuple((t, sf, simd)) + + # If this point is reached, something went wrong + raise Exception(f"Requested shape of invalid input index {ind}") + + # Gets the shape of the output at index ind (there is just one) with folding + def get_folded_output_shape(self, ind=0): # noqa, there is just one output + # Get the unfolded size of the output + t, d = self.get_normal_output_shape(ind) + # Get the amount of output (PE) parallelism + pe = self.get_nodeattr("PE") + # The output is folded along the embedding dimension, neuron-fold + nf = d // pe + # New shape with PE elements as the last dimension + return tuple((t, nf, pe)) From c95b397b69d96ca67a7cd87eeb9d640bf11f0e3f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 10 Jul 2023 14:32:04 +0200 Subject: [PATCH 04/88] [Attention] Add stream/bit-width queries to custom op --- src/finn/custom_op/fpgadataflow/attention.py | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 75d6fd1c36..2178ca7c15 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -449,3 +449,34 @@ def get_folded_output_shape(self, ind=0): # noqa, there is just one output nf = d // pe # New shape with PE elements as the last dimension return tuple((t, nf, pe)) + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Width of a stream receiving SIMD inputs in parallel + return self.get_nodeattr("SIMD") * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Width of a stream producing PE outputs in parallel + return self.get_nodeattr("PE") * o_bits + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + i_bits_max = max((self.get_instream_width(ind) for ind in range(3))) + # Find the widths of the widest output + o_bits_max = max((self.get_instream_width(ind) for ind in range(1))) + # Assume no bits to represent the mask, if there is no mask + m_bits = 0 + # A mask received as input or produced as causal on the fly has a + # bit-width as well + if self.get_nodeattr("mask_mode") in {"input", "causal"}: + # Get width of the mask datatype + m_bits = DataType[self.get_nodeattr("mask_dtype")].bitwidth() + # TODO: Are there more intermediates which need 
to be considered? + # Find maximum of all maximal bit-widths + return max([i_bits_max, o_bits_max, m_bits]) From 4a0e98e8749b47c19c15d4e6fc3a5b577b6a3fe9 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 4 Aug 2023 19:29:06 +0200 Subject: [PATCH 05/88] [Attention] Add refactored node attributes matching HLS op template --- src/finn/custom_op/fpgadataflow/attention.py | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 2178ca7c15..5afdf8dd33 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -53,6 +53,68 @@ def get_nodeattr_types(self): # Return updated attribute dictionary return attrs + # WIP: Refactor the node attributes matching the HLS operator which is WIP + # in another repository right now. + def _get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = {} # super().get_nodeattr_types() + # Update attributes dictionary for new custom operator + attrs.update({ + # Embedding dimension of queries and keys + "QKDim": ("i", True, 0), + # Length of the query sequence + "QLen": ("i", True, 0), + # Embedding dimension of the values + "VDim": ("i", True, 0), + # Length of the key and value sequence + "KVLen": ("i", True, 0), + + # Folding along the embedding dimensions + "EmbFold": ("i", True, 0), + # Folding along the sequence dimensions + "SeqFold": ("i", True, 0), + + # Datatype of query matrix elements + "QType": ("s", True, ""), + # Datatype of key matrix elements + "KType": ("s", True, ""), + # Datatype of value matrix elements + "VType": ("s", True, ""), + # Datatype of mask matrix elements + "MType": ("s", False, "INT0"), + # Datatype of attention weights elements + "AType": ("s", True, ""), + # Datatype of output elements + "OType": ("s", True, ""), + + # Datatype of accumulator elements of the Query x Key multiplication + "AccQKMatMul": ("s", False, ""), + # Datatype of output elements of the Query x Key multiplication + "OutQKMatMul": ("s", False, ""), + # Activation function type of the Query x Key multiplication + "ActQKMatMul": ("s", False, ""), + + # Datatype of accumulator elements of the Attention x Value + # multiplication + "AccAVMatMul": ("s", False, ""), + # Datatype of output elements of the Attention x Value + # multiplication + "OutAVMatMul": ("s", False, ""), + # Activation function type of the Attention x Value multiplication + "ActAVMatMul": ("s", False, ""), + + # Activation function type of the softmax normalization of the + # attention weights + "ActASoftmax": ("s", False, ""), + + # Mode used for providing the attention mask: There can be no mask, + # a mask sent as the fourth input or a causal attention mask which + # is generated by the operator itself. 
+ "mask_mode": ("s", True, "none", {"none", "input", "causal"}), + }) + # Return updated attribute dictionary + return attrs + # Validates the attention shape constraints on queries, keys and values @staticmethod def assert_shape_constraints(q, k, v, mask=None, pe=None, simd=None): From c3ea73e85a9c0f324127a2954b511e435d9a2d25 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 7 Aug 2023 10:49:09 +0200 Subject: [PATCH 06/88] [Attention] Adapt the custom op to the new folding concept --- src/finn/custom_op/fpgadataflow/attention.py | 285 ++++++++---------- .../test_fpgadataflow_attention.py | 54 ++-- 2 files changed, 147 insertions(+), 192 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 5afdf8dd33..5460f1e567 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -20,42 +20,9 @@ def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base super().__init__(onnx_node, **kwargs) - # Returns a dict of permitted attributes for the node - def get_nodeattr_types(self): - # Start from parent operator class attributes - attrs = {} # super().get_nodeattr_types() - # Update attributes dictionary for new custom operator - attrs.update({ - # Shared query and key and value embedding dimension - "qk_dim": ("i", True, 0), - "v_dim": ("i", True, 0), - # Shared key and value and query sequence length - "kv_len": ("i", True, 0), - "q_len": ("i", True, 0), - # Datatypes of inputs and outputs - "q_dtype": ("s", True, ""), - "k_dtype": ("s", True, ""), - "v_dtype": ("s", True, ""), - "o_dtype": ("s", True, ""), - # Mode used for providing the attention mask - # There can be no mask, a mask sent as the fourth input or a causal - # attention mask which is generated by the operator itself. - "mask_mode": ("s", True, "none", {"none", "input", "causal"}), - # Datatype of the attention mask (if there is a mask) - "mask_dtype": ("s", False, ""), - # Input (SIMD) and output (PE) parallelism - "SIMD": ("i", True, 0), - "PE": ("i", True, 0), - # Execution mode of the operator - # TODO: Remove once switching to HLSCustomOp - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), - }) - # Return updated attribute dictionary - return attrs - # WIP: Refactor the node attributes matching the HLS operator which is WIP # in another repository right now. - def _get_nodeattr_types(self): + def get_nodeattr_types(self): # Start from parent operator class attributes attrs = {} # super().get_nodeattr_types() # Update attributes dictionary for new custom operator @@ -111,51 +78,46 @@ def _get_nodeattr_types(self): # a mask sent as the fourth input or a causal attention mask which # is generated by the operator itself. 
"mask_mode": ("s", True, "none", {"none", "input", "causal"}), + + # Execution mode of the operator + # TODO: Remove once switching to HLSCustomOp + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), }) # Return updated attribute dictionary return attrs - # Validates the attention shape constraints on queries, keys and values - @staticmethod - def assert_shape_constraints(q, k, v, mask=None, pe=None, simd=None): - # Queries and keys must match in embedding (second) dimension - assert q.shape[1] == k.shape[1], \ - "Queries and Keys must have matching embedding dimension" - # Keys and values must have matching sequence length (first) dimension - # Note: Lifting the restriction of q matching as well allows for cross - # attention using the same operator. - assert k.shape[0] == v.shape[0], \ - "Keys and Values must have matching sequence length" - - # If the mask is provided, it must have a shape matching the query and - # key product shape, i.e. the shape of the attention matrix - if mask is not None and not isinstance(mask, str): - # Compare mask shape against attention matrix shape - assert mask.shape == (q.shape[0], k.shape[0]), \ - "Mask shape must match the shape of the attention matrix" - - # If specified, validate PE shape constraints as well - if pe is not None: - # PE operates along the sequence length dimension of the keys - assert k.shape[0] % pe == 0, \ - "Key sequence length must be divisible by PE" - # PE operates along the embedding dimension of the values - assert v.shape[1] % pe == 0, \ - "Value embedding dimension must be divisible by PE" - - # If specified, validate SIMD shape constraints as well - if simd is not None: - # SIMD operates along the shared embedding dimension of the queries - # and the keys - assert q.shape[1] % simd == 0, \ - "Query and Key embedding dimension must be divisible by SIMD" - # SIMD operates along the sequence length dimension of the values - assert v.shape[0] % simd == 0, \ - "Value sequence length must be divisible by SIMD" + # Shape configuration of the operator + @property + def shapes(self): + # Note: This matches the order of definition above and the order of the + # HLS lib template as well + return (self.get_nodeattr("QKDim"), self.get_nodeattr("QLen"), + self.get_nodeattr("VDim"), self.get_nodeattr("KVLen")) + + # Folding configuration of the operator + @property + def folds(self): + # Note: This matches the order of definition above and the order of the + # HLS lib template as well + return self.get_nodeattr("EmbFold"), self.get_nodeattr("SeqFold") + + # Tests whether the given folding is a valid configuration with respect to + # the shape configuration + @property + def is_valid_folding(self): + # Get and unpack the shape attributes (except the q matrix length, which + # is never folded) + qkdim, _, vdim, kvlen = self.shapes + # Get and unpack the folding attributes + embfold, seqfold = self.folds + # All shapes must be multiples of their corresponding fold + return not ((qkdim % embfold) or (vdim % embfold) or (kvlen % seqfold)) # Generates a dummy node matching the shapes of the input numpy arrays @staticmethod - def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): + def make_modelwrapper_like( + q, k, v, mask=None, embfold=1, seqfold=1, **dtypes + ): # Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper # Utility for creating and wrapping qonnx graphs and models @@ -164,10 +126,6 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, 
**dtypes): # Convert unspecified mask to 'none' mode mask = 'none' if mask is None else mask - # Validate all shape constraints first - ScaledDotProductAttention.assert_shape_constraints( - q, k, v, mask, pe, simd - ) # Start building the node as a dictionary of attributes node_kwargs = { @@ -175,16 +133,13 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): "op_type": "ScaledDotProductAttention", # Execution will try to look up the implementation in the package # referred to by the domain - # Note: the op_type should be registered as a custom op within the - # domain package "domain": "finn.custom_op.fpgadataflow", - # Execution backend - # Note: Required attribute inherited from HLSCustomOp + # Execution backend: Required attribute inherited from HLSCustomOp "backend": "fpgadataflow", - # Configuration of input parallelism - "SIMD": simd, - # Configuration of output parallelism - "PE": pe + # Folding along the embedding dimensions + "EmbFold": embfold, + # Folding along the sequence dimensions + "SeqFold": seqfold } # Infer the output shape from the input shapes @@ -192,34 +147,34 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): # Create onnx value info of all inputs and outputs assuming float # datatypes - q_info = helper.make_tensor_value_info("q", TensorProto.FLOAT, q.shape) - k_info = helper.make_tensor_value_info("k", TensorProto.FLOAT, k.shape) - v_info = helper.make_tensor_value_info("v", TensorProto.FLOAT, v.shape) - o_info = helper.make_tensor_value_info("o", TensorProto.FLOAT, o_shape) + q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, q.shape) + k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, k.shape) + v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, v.shape) + o_info = helper.make_tensor_value_info("O", TensorProto.FLOAT, o_shape) # Collect input and output nodes in order inputs, outputs = [q_info, k_info, v_info], [o_info] # Collect all inputs/outputs to the operator node io_kwargs = { - "inputs": ["q", "k", "v"], "outputs": ["o"], "mask_mode": "none" + "inputs": ["Q", "K", "V"], "outputs": ["O"], "mask_mode": "none" } # Start building the shape attributes shape_kwargs = { # Shared embedding dimension of the queries and keys and embedding # dimension of the values - "qk_dim": q.shape[1], "v_dim": v.shape[1], + "QKDim": q.shape[1], "VDim": v.shape[1], # Shared sequence length of keys and values and sequence length of # the queries - "kv_len": k.shape[0], "q_len": q.shape[0], + "KVLen": k.shape[0], "QLen": q.shape[0], } # Start building the datatype attributes dtype_kwargs = { # Datatypes of the query, key, value inputs and the output - "q_dtype": "FLOAT32", "k_dtype": "FLOAT32", - "v_dtype": "FLOAT32", "o_dtype": "FLOAT32", + "QType": "FLOAT32", "KType": "FLOAT32", + "VType": "FLOAT32", "OType": "FLOAT32", } # If the optional mask is specified as an input @@ -237,18 +192,18 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): # Append the mask input as fourth input node inputs.append(mask_info) # Add the mask default datatype to the datatype attributes - dtype_kwargs["mask_dtype"] = "FLOAT32" + dtype_kwargs["MType"] = "FLOAT32" # If a causal mask is to be generated during execution if mask == "causal": # Configure masking mode via io_kwargs as well io_kwargs["mask_mode"] = "causal" # Add the mask default datatype to the datatype attributes - dtype_kwargs["mask_dtype"] = "FLOAT32" + dtype_kwargs["MType"] = "FLOAT32" # The optional dtypes keyword arguments must 
describe a subset of the # model inputs and outputs - assert set(dtypes) <= {*dtype_kwargs, "mask_dtype"}, \ + assert set(dtypes) <= {*dtype_kwargs, "MType"}, \ "Specified datatype of unknown input or output" # Update the datatype attributes according to the keyword arguments @@ -272,19 +227,19 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): # Add datatype annotations to all input tensors for tensor_name in io_kwargs["inputs"]: # Only annotate if a datatype is specified - if f'{tensor_name}_dtype' in dtypes: + if f'{tensor_name}Type' in dtypes: # Update the datatype annotation model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}_dtype'] + tensor_name, dtypes[f'{tensor_name}Type'] ) # Add datatype annotations to all output tensors for tensor_name in io_kwargs["outputs"]: # Only annotate if a datatype is specified - if f'{tensor_name}_dtype' in dtypes: + if f'{tensor_name}Type' in dtypes: # Update the datatype annotation model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}_dtype'] + tensor_name, dtypes[f'{tensor_name}Type'] ) # Return the constructed qonnx model wrapper @@ -293,7 +248,7 @@ def make_modelwrapper_like(q, k, v, mask=None, pe=1, simd=1, **dtypes): # Returns an ONNX node that has the same shape inference behavior def make_shape_compatible_op(self, model): # Infer the output shape from the input shapes - o_shape = (self.get_nodeattr("q_len"), self.get_nodeattr("v_dim")) + o_shape = (self.get_nodeattr("QLen"), self.get_nodeattr("VDim")) # Constant operation producing output of given shape return super().make_const_shape_op(o_shape) @@ -308,34 +263,34 @@ def infer_node_datatype(self, model): v_dtype = model.get_tensor_datatype(node.input[2]) # Test for changing query input datatype - if q_dtype != self.get_nodeattr("q_dtype"): + if q_dtype != self.get_nodeattr("QType"): # Issue a warning message - warnings.warn("q_dtype changing for %s: %s -> %s " % ( + warnings.warn("QType changing for %s: %s -> %s " % ( node.name, - str(self.get_nodeattr("q_dtype")), + str(self.get_nodeattr("QType")), str(q_dtype), )) # Test for changing key input datatype - if k_dtype != self.get_nodeattr("k_dtype"): + if k_dtype != self.get_nodeattr("KType"): # Issue a warning message - warnings.warn("k_dtype changing for %s: %s -> %s " % ( + warnings.warn("KType changing for %s: %s -> %s " % ( node.name, - str(self.get_nodeattr("k_dtype")), + str(self.get_nodeattr("KType")), str(k_dtype), )) # Test for changing value input datatype - if v_dtype != self.get_nodeattr("v_dtype"): + if v_dtype != self.get_nodeattr("VType"): # Issue a warning message - warnings.warn("v_dtype changing for %s: %s -> %s " % ( + warnings.warn("VType changing for %s: %s -> %s " % ( node.name, - str(self.get_nodeattr("v_dtype")), + str(self.get_nodeattr("VType")), str(v_dtype), )) # Update the node datatype attributes - self.set_nodeattr("q_dtype", q_dtype.name) - self.set_nodeattr("k_dtype", k_dtype.name) - self.set_nodeattr("v_dtype", v_dtype.name) + self.set_nodeattr("QType", q_dtype.name) + self.set_nodeattr("KType", k_dtype.name) + self.set_nodeattr("VType", v_dtype.name) # Attention mask might be provided as an input as well if self.get_nodeattr("mask_mode") == "input": @@ -343,21 +298,24 @@ def infer_node_datatype(self, model): # Note: Optional mask will be provided as the fourth input mask_dtype = model.get_tensor_datatype(node.input[3]) # Test for changing mask input datatype - if mask_dtype != self.get_nodeattr("mask_dtype"): + if mask_dtype != self.get_nodeattr("MType"): # 
Issue a warning message - warnings.warn("mask_dtype changing for %s: %s -> %s " % ( + warnings.warn("MType changing for %s: %s -> %s " % ( node.name, - str(self.get_nodeattr("mask_dtype")), + str(self.get_nodeattr("MType")), str(mask_dtype), )) # Update the node datatype attribute of the attention mask - self.set_nodeattr("mask_dtype", mask_dtype.namke) + self.set_nodeattr("MType", mask_dtype.namke) # Set the model output datatype - model.set_tensor_datatype(node.output[0], self.get_nodeattr('o_dtype')) + model.set_tensor_datatype(node.output[0], self.get_nodeattr('OType')) # Executes the node def execute_node(self, context, graph): + # The folding configuration must be valid + assert self.is_valid_folding, "Invalid Folding" + # Get the mode to use for execution mode = self.get_nodeattr("exec_mode") @@ -374,7 +332,7 @@ def execute_node(self, context, graph): k = context[self.onnx_node.input[1]] v = context[self.onnx_node.input[2]] # Get the shared embedding dimension of queries and keys - d = self.get_nodeattr('qk_dim') + d = self.get_nodeattr('QKDim') # Start with zero mask mask = 0 # The actual attention mask may be provided as the fourth input @@ -384,7 +342,7 @@ def execute_node(self, context, graph): # Another option is to generate a causal attention mask on the fly elif self.get_nodeattr("mask_mode") == "causal": # Get the datatype of the attention mask - mask_dtype = DataType[self.get_nodeattr("mask_dtype")] + mask_dtype = DataType[self.get_nodeattr("MType")] # Start with an all zero attention mask mask = 0 * gen_finn_dt_tensor( mask_dtype, (q.shape[0], k.shape[0]) @@ -427,37 +385,37 @@ def verify_node(self): # Gets the datatype of input at index ind def get_input_datatype(self, ind=0): # Ordered list of names of allowed inputs - inputs = ["q", "k", "v"] + inputs = ["Q", "K", "V"] # If the attention mask is provided as input, it has a type as well if self.get_nodeattr("mask_mode") == "input": # The mask type is an attribute itself inputs += ["mask"] # Look up datatype name in attributes and convert to DataType - return DataType[self.get_nodeattr(f"{inputs[ind]}_dtype")] + return DataType[self.get_nodeattr(f"{inputs[ind]}Type")] # Gets the datatype of the output (at index ind, but there is just one) def get_output_datatype(self, ind=0): # Ordered list of names of allowed outputs - outputs = ["o"] + outputs = ["O"] # Look up datatype name in attributes and convert to DataType - return DataType[self.get_nodeattr(f"{outputs[ind]}_dtype")] + return DataType[self.get_nodeattr(f"{outputs[ind]}Type")] # Gets the shape of the input at index ind without folding def get_normal_input_shape(self, ind=0): # List shapes of inputs in order inputs_shapes = [ # Query input sequence - (self.get_nodeattr("q_len"), self.get_nodeattr("qk_dim")), + (self.get_nodeattr("QLen"), self.get_nodeattr("QKDim")), # Key input sequence - (self.get_nodeattr("kv_len"), self.get_nodeattr("kv_dim")), + (self.get_nodeattr("KVLen"), self.get_nodeattr("QKDim")), # Value input sequence - (self.get_nodeattr("kv_len"), self.get_nodeattr("v_dim")), + (self.get_nodeattr("KVLen"), self.get_nodeattr("VDim")), ] # If the attention mask is provided as input, it has a shape as well if self.get_nodeattr("mask_mode") == "input": # Mask shape is inferred from query and key sequence lengths inputs_shapes += [ - (self.get_nodeattr("q_len"), self.get_nodeattr("kv_len")) + (self.get_nodeattr("QLen"), self.get_nodeattr("KVLen")) ] # Get the shape by indexing into the ordered list of all inputs return inputs_shapes[ind] @@ -467,36 +425,28 @@ 
def get_normal_input_shape(self, ind=0): def get_normal_output_shape(self, ind=0): # noqa, there is just one output # The output shape is inferred from the length of the query sequence and # the embedding dimension of the values - return tuple((self.get_nodeattr("q_len"), self.get_nodeattr("v_dim"))) + return tuple((self.get_nodeattr("QLen"), self.get_nodeattr("VDim"))) # Gets the shape of the input at index ind with folding def get_folded_input_shape(self, ind=0): # Get the unfolded size of the input - t, d = self.get_normal_input_shape(ind) - # Get the amount of input (SIMD) and output (PE) parallelism - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") # TODO: What about this? - - # The query (first) and key (second) inputs are treated the same and - # merely differ in buffering requirements - if ind == 0 or ind == 1: - # Fold the input along the embedding dimension - sf = d // simd - # New shape with SIMD elements as the last dimension - return tuple((t, sf, simd)) - # For the value (third) inputs the axes flip and simd/pe change roles - if ind == 2: - # Fold the input along the sequence length dimension - sf = t // simd - # New shape with SIMD elements as the last dimension - return tuple((sf, d, simd)) - # If the mask is provided as input, it is folded as well + ilen, idim = self.get_normal_input_shape(ind) + # Get the folding configuration specifying the amount of parallelism + embfold, seqfold = self.folds + + # Queries, keys and values are all folded similarly along the embedding + # dimension + if ind in (0, 1, 2): + # Note: Embedding dimension is always assumed to be the second + # dimension, any transpose is handled implicitly by the operator + return ilen, embfold, idim // embfold + + # If the mask is provided as input, it is folded along the second + # sequence dimension if ind == 3 and self.get_nodeattr("mask_mode") == "input": - # The mask is folded along the second dimension which is actually a - # sequence length as well. It might be confusing to call it d here. 
- sf = d // simd - # New shape with SIMD elements as the last dimension - return tuple((t, sf, simd)) + # Note: Both dimensions are sequence dimension, the second + # corresponds to the KVLen + return ilen, seqfold, idim // seqfold # If this point is reached, something went wrong raise Exception(f"Requested shape of invalid input index {ind}") @@ -504,27 +454,32 @@ def get_folded_input_shape(self, ind=0): # Gets the shape of the output at index ind (there is just one) with folding def get_folded_output_shape(self, ind=0): # noqa, there is just one output # Get the unfolded size of the output - t, d = self.get_normal_output_shape(ind) - # Get the amount of output (PE) parallelism - pe = self.get_nodeattr("PE") - # The output is folded along the embedding dimension, neuron-fold - nf = d // pe - # New shape with PE elements as the last dimension - return tuple((t, nf, pe)) + olen, odim = self.get_normal_output_shape(ind) + # Get the folding configuration specifying the amount of parallelism + embfold, seqfold = self.folds + # The output is always folded along the embedding dimension, which is + # assumed to be the second dimension + return olen, embfold, odim // embfold # Widths of the input data stream of the input at index ind def get_instream_width(self, ind=0): # Get the number of bits used to represent the input i_bits = self.get_input_datatype(ind).bitwidth() - # Width of a stream receiving SIMD inputs in parallel - return self.get_nodeattr("SIMD") * i_bits + # Parallelism is the number of elements in the last dimension of the + # folded input + _, _, elems = self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits # Widths of the output data stream of the output at index ind def get_outstream_width(self, ind=0): # Get the number of bits used to represent the output o_bits = self.get_output_datatype(ind).bitwidth() - # Width of a stream producing PE outputs in parallel - return self.get_nodeattr("PE") * o_bits + # Parallelism is the number of elements in the last dimension of the + # folded output + _, _, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits # Maximum width of any ap_int used in this operator def get_ap_int_max_w(self): @@ -538,7 +493,7 @@ def get_ap_int_max_w(self): # bit-width as well if self.get_nodeattr("mask_mode") in {"input", "causal"}: # Get width of the mask datatype - m_bits = DataType[self.get_nodeattr("mask_dtype")].bitwidth() + m_bits = DataType[self.get_nodeattr("MType")].bitwidth() # TODO: Are there more intermediates which need to be considered? 
# Find maximum of all maximal bit-widths return max([i_bits_max, o_bits_max, m_bits]) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 0107beea5c..51f9f89ca9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -17,72 +17,72 @@ # Size of query and key embedding dimension -@pytest.mark.parametrize("qk_dim", [64]) +@pytest.mark.parametrize("QKDim", [64]) # Size of value embedding dimension -@pytest.mark.parametrize("v_dim", [64]) +@pytest.mark.parametrize("VDim", [64]) # Length of key and value sequences -@pytest.mark.parametrize("kv_len", [256]) +@pytest.mark.parametrize("KVLen", [256]) # Length of query sequence -@pytest.mark.parametrize("q_len", [256]) +@pytest.mark.parametrize("QLen", [256]) # Different modes to provide a mask @pytest.mark.parametrize("mask", ["none", "input", "causal"]) -# Output parallelism -@pytest.mark.parametrize("pe", [1]) -# Input parallelism -@pytest.mark.parametrize("simd", [1]) +# Folding along the embedding dimensions +@pytest.mark.parametrize("EmbFold", [64, 32]) +# Folding along the sequence dimensions +@pytest.mark.parametrize("SeqFold", [256, 128]) # Datatypes of queries, keys and values, mask and output -@pytest.mark.parametrize("q_dtype", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("k_dtype", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("v_dtype", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("mask_dtype", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("o_dtype", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("QType", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("KType", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("VType", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("MType", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("OType", [DataType["FLOAT32"]]) # Tests python implementation of single scaled dot-product attention head def test_attention_python( - qk_dim, v_dim, kv_len, q_len, mask, pe, simd, q_dtype, k_dtype, v_dtype, - mask_dtype, o_dtype + QKDim, VDim, KVLen, QLen, mask, EmbFold, SeqFold, QType, KType, VType, + MType, OType ): # Generate random input data - q = gen_finn_dt_tensor(q_dtype, (q_len, qk_dim)) - k = gen_finn_dt_tensor(k_dtype, (kv_len, qk_dim)) - v = gen_finn_dt_tensor(v_dtype, (kv_len, v_dim)) + q = gen_finn_dt_tensor(QType, (QLen, QKDim)) + k = gen_finn_dt_tensor(KType, (KVLen, QKDim)) + v = gen_finn_dt_tensor(VType, (KVLen, VDim)) dtypes = { # Datatypes of the query, key, value inputs and the output - "q_dtype": q_dtype, "k_dtype": k_dtype, - "v_dtype": v_dtype, "o_dtype": o_dtype, + "QType": QType, "KType": KType, + "VType": VType, "OType": OType, } # Generate the operator matching the configuration model = ScaledDotProductAttention.make_modelwrapper_like( - q, k, v, mask, pe, simd, **dtypes, mask_dtype=mask_dtype + q, k, v, mask, EmbFold, SeqFold, **dtypes, MType=MType ) # Generate random input mask if the operator expects the mask as fourth # input if mask == "input": - mask = gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + mask = gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) # If a causal attention mask is requested, generate upper triangular matrix elif mask == "causal": # Start zero initialized mask - mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) # Fill upper triangular causal attention mask mask[np.triu_indices_from(mask, 1)] = - np.inf # No mask 
input requested elif mask == "none": # No mask is equivalent to a zero mask - mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (q_len, kv_len)) + mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) # Prepare execution context context = { - "q": q, "k": k, "v": v, "mask": mask + "Q": q, "K": k, "V": v, "mask": mask } # Set model execution mode to python (numpy execution) model = model.transform(SetExecMode("python")) # Execute the onnx model to collect the result - o_produced = execute_onnx(model, context)["o"] + o_produced = execute_onnx(model, context)["O"] # Compute the attention matrix between queries and keys - attention = softmax(q @ k.T * (qk_dim ** -0.5) + mask, axis=-1) + attention = softmax(q @ k.T * (QKDim ** -0.5) + mask, axis=-1) # Compute product of attention weights and value input o_expected = attention @ v From 602f1cad8fef79b6de9f426ef8de4deb4c30cecf Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 7 Aug 2023 15:05:57 +0200 Subject: [PATCH 07/88] [Attention] Fix get_ap_int_max_w output and mask stream width --- src/finn/custom_op/fpgadataflow/attention.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 5460f1e567..d52e8bf884 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -486,14 +486,18 @@ def get_ap_int_max_w(self): # Find the widths of the widest input i_bits_max = max((self.get_instream_width(ind) for ind in range(3))) # Find the widths of the widest output - o_bits_max = max((self.get_instream_width(ind) for ind in range(1))) + o_bits_max = max((self.get_outstream_width(ind) for ind in range(1))) # Assume no bits to represent the mask, if there is no mask m_bits = 0 # A mask received as input or produced as causal on the fly has a # bit-width as well if self.get_nodeattr("mask_mode") in {"input", "causal"}: + # Parallelism is the number of elements in the last dimension of the + # folded output + _, _, elems = self.get_folded_output_shape(ind=3) # Get width of the mask datatype - m_bits = DataType[self.get_nodeattr("MType")].bitwidth() - # TODO: Are there more intermediates which need to be considered? + m_bits = elems * DataType[self.get_nodeattr("MType")].bitwidth() + # TODO: Are there more intermediates which need to be considered? Yes, + # attention weights and MatMul accumulators and outputs. 
# Find maximum of all maximal bit-widths return max([i_bits_max, o_bits_max, m_bits]) From ad17b1b59cf07046cc8a192d7a2c5750f549adc8 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 7 Aug 2023 15:54:05 +0200 Subject: [PATCH 08/88] [Attention] Start filling some of the HLSCustomOp abstract methods --- src/finn/custom_op/fpgadataflow/attention.py | 34 ++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index d52e8bf884..3eaea417f3 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -3,7 +3,7 @@ # Numpy math and arrays import numpy as np # Derive custom operators form the FINN base custom op -# from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # Temporarily derive custom operators from QONNX base custom op # TODO: Remove once switching to HLSCustomOp from qonnx.custom_op.base import CustomOp @@ -14,7 +14,7 @@ # Scaled Dot-Product Attention Custom Operator # Note: Single head attention -class ScaledDotProductAttention(CustomOp): +class ScaledDotProductAttention(HLSCustomOp): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -501,3 +501,33 @@ def get_ap_int_max_w(self): # attention weights and MatMul accumulators and outputs. # Find maximum of all maximal bit-widths return max([i_bits_max, o_bits_max, m_bits]) + + # Defines C++ type aliases, global constants and macros + def defines(self, var): + # Get and unpack the shape attributes (except the q matrix length, which + # is never folded) + qkdim, qlen, vdim, kvlen = self.shapes + # Get and unpack the folding attributes + embfold, seqfold = self.folds + # Insert constants and typer aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Shapes of attention inputs: query, key and value + f"static constexpr std::size_t QKDim = {qkdim};", + f"static constexpr std::size_t QLen = {qlen};", + f"static constexpr std::size_t VDim = {vdim};", + f"static constexpr std::size_t KVLen = {kvlen};", + # Folding configuration + f"static constexpr std::size_t EmbFold = {embfold};", + f"static constexpr std::size_t SeqFold = {seqfold};", + ] + + # Generates C++ blackboxfunction for IP generation + def blackboxfunction(self): + # Insert function head describing the top level interface of the + # attention operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # Note: Assumes stream type aliases to be set in defines + """ + void {}(QStream &q, KStream &k, VStream &v, OStream &out) + """.format(self.onnx_node.name) + ] From 0de1bcec8bd5751a85b02dc446183fd56e0f3366 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 13:49:47 +0200 Subject: [PATCH 09/88] [Attention] Fill out includes and defines for C++ code generation --- src/finn/custom_op/fpgadataflow/attention.py | 162 +++++++++++++++--- .../test_fpgadataflow_attention.py | 24 ++- 2 files changed, 157 insertions(+), 29 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 3eaea417f3..8840bd4cc6 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -24,7 +24,7 @@ def __init__(self, onnx_node, **kwargs): # in another repository right now. 
def get_nodeattr_types(self): # Start from parent operator class attributes - attrs = {} # super().get_nodeattr_types() + attrs = super().get_nodeattr_types() # Update attributes dictionary for new custom operator attrs.update({ # Embedding dimension of queries and keys @@ -50,29 +50,29 @@ def get_nodeattr_types(self): # Datatype of mask matrix elements "MType": ("s", False, "INT0"), # Datatype of attention weights elements - "AType": ("s", True, ""), + "AType": ("s", False, "UINT32"), # Datatype of output elements "OType": ("s", True, ""), # Datatype of accumulator elements of the Query x Key multiplication - "AccQKMatMul": ("s", False, ""), + "AccQKMatMul": ("s", False, "UINT32"), # Datatype of output elements of the Query x Key multiplication - "OutQKMatMul": ("s", False, ""), + "OutQKMatMul": ("s", False, "UINT32"), # Activation function type of the Query x Key multiplication - "ActQKMatMul": ("s", False, ""), + "ActQKMatMul": ("s", False, "PassThroughActivation"), # Datatype of accumulator elements of the Attention x Value # multiplication - "AccAVMatMul": ("s", False, ""), + "AccAVMatMul": ("s", False, "UINT32"), # Datatype of output elements of the Attention x Value # multiplication - "OutAVMatMul": ("s", False, ""), + "OutAVMatMul": ("s", False, "UINT32"), # Activation function type of the Attention x Value multiplication - "ActAVMatMul": ("s", False, ""), + "ActAVMatMul": ("s", False, "PassThroughActivation"), # Activation function type of the softmax normalization of the # attention weights - "ActASoftmax": ("s", False, ""), + "ActASoftmax": ("s", False, "PassThroughActivation"), # Mode used for providing the attention mask: There can be no mask, # a mask sent as the fourth input or a causal attention mask which @@ -502,26 +502,135 @@ def get_ap_int_max_w(self): # Find maximum of all maximal bit-widths return max([i_bits_max, o_bits_max, m_bits]) - # Defines C++ type aliases, global constants and macros + # Gets the number of expected output values, i.e. how many times read() + # could/should be called on the output stream of this operator + def get_number_output_values(self): + # Elements over all but the last dimension of the output folded along + # the embedding dimension + return np.prod(self.get_folded_output_shape()[:-1]) + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # FINN HLSLIB activation functions: e.g. 
PassThroughActivation + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + # Attention operator HLS code + self.code_gen_dict["$GLOBALS$"] += ['#include "attention.hpp"'] + + # Generates C++ code of type alias, global constant and macro definitions def defines(self, var): - # Get and unpack the shape attributes (except the q matrix length, which - # is never folded) - qkdim, qlen, vdim, kvlen = self.shapes - # Get and unpack the folding attributes - embfold, seqfold = self.folds + # Generate shape definitions from attributes to C++ constant definitions + def shapedefs(*names): + # C++ qualified type to be used for shape constants + shape = "static constexpr std::size_t" + # Generate a C++ constant definition for each of the attributes + # given by argument list names + return ( + f"{shape} {name} = {self.get_nodeattr(name)};" for name in names + ) + + # Generate datatype definitions mapping from QONNX DataType to HLS type + def typedefs(*names): + # Gets the HLS type string for the datatype specified by the named + # attribute + def hls_type(name): + # Looks up the datatype specified for the attribute and + # translates from QONNX to HLS type + return DataType[self.get_nodeattr(name)].get_hls_datatype_str() + + # Generate a C++ type alias definition for each of the attributes + # given by argument list names + return (f"using {name} = {hls_type(name)};" for name in names) + # Insert constants and typer aliases into the dictionary self.code_gen_dict["$DEFINES$"] = [ - # Shapes of attention inputs: query, key and value - f"static constexpr std::size_t QKDim = {qkdim};", - f"static constexpr std::size_t QLen = {qlen};", - f"static constexpr std::size_t VDim = {vdim};", - f"static constexpr std::size_t KVLen = {kvlen};", - # Folding configuration - f"static constexpr std::size_t EmbFold = {embfold};", - f"static constexpr std::size_t SeqFold = {seqfold};", + # Shape constant definitions of attention inputs (query, key and + # value) and folding configuration + *shapedefs( + "QKDim", + "QLen", + "VDim", + "KVLen", + "EmbFold", + "SeqFold" + ), + # Type alias definitions for all input, output and intermediate + # datatypes + *typedefs( + "QType", + "KType", + "VType", + "MType", + "AType", + "OType" + ), + # Type alias definitions for the matmul accumulators and output + # datatypes + *typedefs( + "AccQKMatMul", + "OutQKMatMul", + "AccAVMatMul", + "OutAVMatMul" + ), + # Type alias definitions for the activation functions + f"using ActQKMatMul = {self.get_nodeattr('ActQKMatMul')};", + f"using ActAVMatMul = {self.get_nodeattr('ActAVMatMul')};", + f"using ActASoftmax = {self.get_nodeattr('ActASoftmax')};", + # Type alias of the properly configured attention operator class + f"using Attention = ScaledDotProductAttention<", + f" QKDim,", + f" QLen,", + f" VDim,", + f" KVLen,", + f" EmbFold,", + f" SeqFold,", + f" QType,", + f" KType,", + f" VType,", + f" MType,", + f" AType,", + f" OType,", + f" AccQKMatMul,", + f" OutQKMatMul,", + f" ActQKMatMul,", + f" AccAVMatMul,", + f" OutAVMatMul,", + f" ActAVMatMul,", + f" ActASoftmax", + f">;", + # Short type aliases of attention input and output streams + f"using QStream = Attention::QStream;", + f"using KStream = Attention::KStream;", + f"using VStream = Attention::VStream;", + f"using OStream = Attention::OStream;", ] - # Generates C++ blackboxfunction for IP generation + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + pass + + # Generates C++ code for declaring 
all streams involved in C++ simulation + # for testing + def strm_decl(self): + pass + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + pass + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + pass + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + pass + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis def blackboxfunction(self): # Insert function head describing the top level interface of the # attention operator @@ -531,3 +640,8 @@ def blackboxfunction(self): void {}(QStream &q, KStream &k, VStream &v, OStream &out) """.format(self.onnx_node.name) ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + pass diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 51f9f89ca9..a25f6caf6b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -12,8 +12,10 @@ from qonnx.core.onnx_exec import execute_onnx # Attention operator to test from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention +from qonnx.custom_op.registry import getCustomOp # Graphs transformation setting the execution mode attribute from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim # Size of query and key embedding dimension @@ -31,11 +33,11 @@ # Folding along the sequence dimensions @pytest.mark.parametrize("SeqFold", [256, 128]) # Datatypes of queries, keys and values, mask and output -@pytest.mark.parametrize("QType", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("KType", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("VType", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("MType", [DataType["FLOAT32"]]) -@pytest.mark.parametrize("OType", [DataType["FLOAT32"]]) +@pytest.mark.parametrize("QType", [DataType["UINT16"]]) +@pytest.mark.parametrize("KType", [DataType["UINT16"]]) +@pytest.mark.parametrize("VType", [DataType["UINT16"]]) +@pytest.mark.parametrize("MType", [DataType["UINT16"]]) +@pytest.mark.parametrize("OType", [DataType["UINT16"]]) # Tests python implementation of single scaled dot-product attention head def test_attention_python( QKDim, VDim, KVLen, QLen, mask, EmbFold, SeqFold, QType, KType, VType, @@ -78,6 +80,18 @@ def test_attention_python( } # Set model execution mode to python (numpy execution) model = model.transform(SetExecMode("python")) + + # Add the path to store C++ simulation source code generated by the + # PrepareCppSim transform + for node in model.graph.node: + # lookup op_type in registry of CustomOps + inst = getCustomOp(node) + print(inst) + inst.set_nodeattr("code_gen_dir_cppsim", "test_code_gen_attention/") + + # Generates the C++ source to be compiled as C++ simulation + model.transform(PrepareCppSim()) + # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["O"] From de9dc73e629a4570648bdb5efefbbe885fbef6e3 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 14:33:17 +0200 Subject: [PATCH 10/88] [Attention] Add IP generation C++ source generation step to test --- 
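Aside on how the pieces above fit together: the $DEFINES$, $READNPYDATA$, $STREAMDECLARATIONS$, $DOCOMPUTE$, $DATAOUTSTREAM$, $BLACKBOXFUNCTION$ and $PRAGMAS$ entries collected in self.code_gen_dict are not emitted directly; the HLSCustomOp machinery substitutes them into C++ template files when transformations such as PrepareCppSim and PrepareIP run. The snippet below is only a rough sketch of that substitution idea, with a made-up miniature template and made-up dictionary contents; the real templates live in FINN's template module and contain more placeholders.

# Rough sketch of the placeholder substitution performed by the code
# generation machinery. The miniature template and the dictionary contents
# are made up for illustration and do not match FINN's real templates.
cpp_template = """
$DEFINES$

int main() {
    $STREAMDECLARATIONS$
    $READNPYDATA$
    $DOCOMPUTE$
    $DATAOUTSTREAM$
    return 0;
}
"""

# Stand-in for the dictionary filled by defines(), strm_decl(),
# read_npy_data(), docompute() and dataoutstrm()
code_gen_dict = {
    "$DEFINES$": ["// shape constants and type aliases"],
    "$STREAMDECLARATIONS$": ["QStream q;", "KStream k;", "VStream v;", "OStream out;"],
    "$READNPYDATA$": ["// npy2apintstream(...) calls filling q, k and v"],
    "$DOCOMPUTE$": ["Attention attention(q, k, v);"],
    "$DATAOUTSTREAM$": ["// apintstream2npy(...) call draining out"],
}

# Each placeholder is replaced by the joined list of generated code lines
for placeholder, lines in code_gen_dict.items():
    cpp_template = cpp_template.replace(placeholder, "\n    ".join(lines))

print(cpp_template)

Seen this way, each of the following patches simply fills in one more placeholder of the generated C++ simulation or IP-generation source.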
src/finn/custom_op/fpgadataflow/attention.py | 6 ++--- .../test_fpgadataflow_attention.py | 23 ++++++++----------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 8840bd4cc6..16f0a7e2ee 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -636,9 +636,9 @@ def blackboxfunction(self): # attention operator self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ # Note: Assumes stream type aliases to be set in defines - """ - void {}(QStream &q, KStream &k, VStream &v, OStream &out) - """.format(self.onnx_node.name) + f"void {self.onnx_node.name} (", + f" QStream &q, KStream &k, VStream &v, OStream &out", + f")", ] # Generates C++ pragmas to be inserted into the main function of the C++ diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index a25f6caf6b..a839689ea4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -3,6 +3,7 @@ # Use numpy for python execution / computing the ground truth expected values import numpy as np +from qonnx.transformation.general import GiveUniqueNodeNames # Numpy compatible implementation of the softmax operation from scipy.special import softmax @@ -16,7 +17,8 @@ # Graphs transformation setting the execution mode attribute from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim - +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP # Size of query and key embedding dimension @pytest.mark.parametrize("QKDim", [64]) @@ -27,11 +29,11 @@ # Length of query sequence @pytest.mark.parametrize("QLen", [256]) # Different modes to provide a mask -@pytest.mark.parametrize("mask", ["none", "input", "causal"]) +@pytest.mark.parametrize("mask", ["none"]) # Folding along the embedding dimensions -@pytest.mark.parametrize("EmbFold", [64, 32]) +@pytest.mark.parametrize("EmbFold", [64]) # Folding along the sequence dimensions -@pytest.mark.parametrize("SeqFold", [256, 128]) +@pytest.mark.parametrize("SeqFold", [256]) # Datatypes of queries, keys and values, mask and output @pytest.mark.parametrize("QType", [DataType["UINT16"]]) @pytest.mark.parametrize("KType", [DataType["UINT16"]]) @@ -81,16 +83,11 @@ def test_attention_python( # Set model execution mode to python (numpy execution) model = model.transform(SetExecMode("python")) - # Add the path to store C++ simulation source code generated by the - # PrepareCppSim transform - for node in model.graph.node: - # lookup op_type in registry of CustomOps - inst = getCustomOp(node) - print(inst) - inst.set_nodeattr("code_gen_dir_cppsim", "test_code_gen_attention/") - # Generates the C++ source to be compiled as C++ simulation - model.transform(PrepareCppSim()) + model = model.transform(PrepareCppSim()) + # Prepares IP-generation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["O"] From f21a47ca52109000b872ef415fb0c5f9805923e5 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 14:47:53 +0200 Subject: [PATCH 11/88] [Attention] Add some interface pragmas for C++ code generation --- 
src/finn/custom_op/fpgadataflow/attention.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 16f0a7e2ee..75d64d4c4f 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -644,4 +644,19 @@ def blackboxfunction(self): # Generates C++ pragmas to be inserted into the main function of the C++ # simulation and the ipgen-blackboxfunction as well def pragmas(self): - pass + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the query input stream with an axi stream interface + "#pragma HLS INTERFACE axis port=q", + # Connect the key input stream with an axi stream interface + "#pragma HLS INTERFACE axis port=k", + # Connect the value input stream with an axi stream interface + "#pragma HLS INTERFACE axis port=v", + # Connect the output stream with an axi stream interface + "#pragma HLS INTERFACE axis port=out", + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) From 8e94cfe710dee79e95007a8abbb037c26612b81e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 14:55:46 +0200 Subject: [PATCH 12/88] [Attention] Add stream declarations for C++ simulation code generation --- src/finn/custom_op/fpgadataflow/attention.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 75d64d4c4f..9ed7b6eecd 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -613,7 +613,11 @@ def read_npy_data(self): # Generates C++ code for declaring all streams involved in C++ simulation # for testing def strm_decl(self): - pass + # Declare input (query, key, value) and output streams + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + 'QStream q;', 'KStream k;', 'VStream v;', 'OStream out;' + ] # Generates C++ code for calling the computation part of the operator def docompute(self): From 03ddfb2156f79c9fc9f0e82ca403d875c463d500 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 15:02:28 +0200 Subject: [PATCH 13/88] [Attention] Add attention function body to C++ code generation --- src/finn/custom_op/fpgadataflow/attention.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 9ed7b6eecd..e9a99bacae 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -621,7 +621,19 @@ def strm_decl(self): # Generates C++ code for calling the computation part of the operator def docompute(self): - pass + # Write the body of the attention top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Instantiate the attention operator and connect to the streams + # Note: Assumes "Attention" to be aliased appropriate configuration + # in defines with. + "Attention attention(q, k, v);", + # Transfer from input to output stream + # TODO: Ge rid of this once switching to function-call style for the + # attention operator. 
+ "for(std::size_t i = 0; i < QLen * EmbFold; ++i) {", + " out.write(attention.out.read());", + "}", + ] # Generates C++ code for reading the output stream and converting back to # numpy format for testing in C** simulation From 295ab25ad3396b0b71d6b60f80e6036c35b1514f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 8 Aug 2023 17:39:17 +0200 Subject: [PATCH 14/88] [Attention] Add C++ simulation code feeding the input streams from files --- src/finn/custom_op/fpgadataflow/attention.py | 54 +++++++++++++++++++- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index e9a99bacae..ac23914c97 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -603,12 +603,60 @@ def hls_type(name): f"using KStream = Attention::KStream;", f"using VStream = Attention::VStream;", f"using OStream = Attention::OStream;", + f"using MStream = Attention::MStream;", ] # Generates C++ code for reading data from .npy (numpy format) for testing # in C++ simulation def read_npy_data(self): - pass + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Deduce the datatype of elements packed into the query input stream + # TODO: Maybe these type-deductions can be removed by changing the + # order of the template arguments of the npy2apintstream, such + # that type-deduction is handled there? + f"using QPacked = decltype(QStream.read());", + # Generate function call reding from file into the input stream + # Note: Inputs are always represented as numpy floats + f"npy2apintstream(", + f" {code_gen_dir}/q.npy, q, false", + ");", + + # Deduce the datatype of elements packed into the key input stream + f"using KPacked = decltype(KStream.read());", + # Generate function call reding from file into the input stream + # Note: Inputs are always represented as numpy floats + f"npy2apintstream(", + f" {code_gen_dir}/k.npy, k, false", + ");", + + # Deduce the datatype of elements packed into the value input stream + f"using VPacked = decltype(VStream.read());", + # Generate function call reding from file into the input stream + # Note: Inputs are always represented as numpy floats + f"npy2apintstream(", + f" {code_gen_dir}/v.npy, v, false", + ");", + ] + + # If the mask is provided as an input, it needs to be read as well + if self.get_nodeattr("mask_mode") == "input": + # Generate function call for reading the mask file into the input + # stream + self.code_gen_dict["$READNPYDATA$"] += [ + # Deduce the datatype of elements packed into the mask input + # stream + f"using MPacked = decltype(MStream.read());", + # Generate function call reding from file into the input stream + # Note: Inputs are always represented as numpy floats + f"npy2apintstream(", + f" {code_gen_dir}/m.npy, m, false", + ");", + ] # Generates C++ code for declaring all streams involved in C++ simulation # for testing @@ -643,7 +691,9 @@ def dataoutstrm(self): # Generates C++ code for saving the output of C++ simulation to a file in # numpy format def save_as_npy(self): - pass + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? 
+ self.code_gen_dict["$SAVEASCNPY$"] = [] # Generates essentially the head of the C++ function from which the IP block # will be generated during ipgen, i.e. actual synthesis From b6a26e1e4f4d2f4fef3758039755f2f7402cc1c6 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 9 Aug 2023 09:56:46 +0200 Subject: [PATCH 15/88] [Attention] Add C++ simulation code saving the output stream to file --- src/finn/custom_op/fpgadataflow/attention.py | 35 ++++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index ac23914c97..877da4ff05 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -620,7 +620,7 @@ def read_npy_data(self): # order of the template arguments of the npy2apintstream, such # that type-deduction is handled there? f"using QPacked = decltype(QStream.read());", - # Generate function call reding from file into the input stream + # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f"npy2apintstream(", f" {code_gen_dir}/q.npy, q, false", @@ -628,7 +628,7 @@ def read_npy_data(self): # Deduce the datatype of elements packed into the key input stream f"using KPacked = decltype(KStream.read());", - # Generate function call reding from file into the input stream + # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f"npy2apintstream(", f" {code_gen_dir}/k.npy, k, false", @@ -636,7 +636,7 @@ def read_npy_data(self): # Deduce the datatype of elements packed into the value input stream f"using VPacked = decltype(VStream.read());", - # Generate function call reding from file into the input stream + # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f"npy2apintstream(", f" {code_gen_dir}/v.npy, v, false", @@ -651,7 +651,7 @@ def read_npy_data(self): # Deduce the datatype of elements packed into the mask input # stream f"using MPacked = decltype(MStream.read());", - # Generate function call reding from file into the input stream + # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f"npy2apintstream(", f" {code_gen_dir}/m.npy, m, false", @@ -676,8 +676,8 @@ def docompute(self): # in defines with. "Attention attention(q, k, v);", # Transfer from input to output stream - # TODO: Ge rid of this once switching to function-call style for the - # attention operator. + # TODO: Get rid of this once switching to function-call style for + # the attention operator. 
"for(std::size_t i = 0; i < QLen * EmbFold; ++i) {", " out.write(attention.out.read());", "}", @@ -686,7 +686,28 @@ def docompute(self): # Generates C++ code for reading the output stream and converting back to # numpy format for testing in C** simulation def dataoutstrm(self): - pass + # Output data will be stored in numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Deduce the datatype of elements packed into the output stream + f'using OPacked = decltype(OStream.read());', + # Generate function call reading from stream into the output file + # Note: Outputs are always represented as numpy floats + f'apintstream2npy(', + f' out, {shape}, "{code_gen_dir}/out.npy", false', + ');', + ] # Generates C++ code for saving the output of C++ simulation to a file in # numpy format From 5d800e7fee8489eee9982fab163214d3b953e3cb Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 9 Aug 2023 09:59:51 +0200 Subject: [PATCH 16/88] [Attention] Add missing "" to generated C++ strings --- src/finn/custom_op/fpgadataflow/attention.py | 32 ++++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 877da4ff05..c607c50593 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -619,28 +619,28 @@ def read_npy_data(self): # TODO: Maybe these type-deductions can be removed by changing the # order of the template arguments of the npy2apintstream, such # that type-deduction is handled there? 
- f"using QPacked = decltype(QStream.read());", + f'using QPacked = decltype(QStream.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats - f"npy2apintstream(", - f" {code_gen_dir}/q.npy, q, false", - ");", + f'npy2apintstream(', + f' "{code_gen_dir}/q.npy", q, false', + ');', # Deduce the datatype of elements packed into the key input stream - f"using KPacked = decltype(KStream.read());", + f'using KPacked = decltype(KStream.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats - f"npy2apintstream(", - f" {code_gen_dir}/k.npy, k, false", - ");", + f'npy2apintstream(', + f' "{code_gen_dir}/k.npy", k, false', + ');', # Deduce the datatype of elements packed into the value input stream - f"using VPacked = decltype(VStream.read());", + f'using VPacked = decltype(VStream.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats - f"npy2apintstream(", - f" {code_gen_dir}/v.npy, v, false", - ");", + f'npy2apintstream(', + f' "{code_gen_dir}/v.npy", v, false', + ');', ] # If the mask is provided as an input, it needs to be read as well @@ -650,12 +650,12 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] += [ # Deduce the datatype of elements packed into the mask input # stream - f"using MPacked = decltype(MStream.read());", + f'using MPacked = decltype(MStream.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats - f"npy2apintstream(", - f" {code_gen_dir}/m.npy, m, false", - ");", + f'npy2apintstream(', + f' "{code_gen_dir}/m.npy", m, false', + ');', ] # Generates C++ code for declaring all streams involved in C++ simulation From 906a8c53ef8dd1a8ade07523694d544aa56eeb77 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 9 Aug 2023 10:59:47 +0200 Subject: [PATCH 17/88] [Attention] Add missing bit width cases to get_ap_int_max_w --- src/finn/custom_op/fpgadataflow/attention.py | 70 +++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index c607c50593..cdaa597d7b 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -59,7 +59,7 @@ def get_nodeattr_types(self): # Datatype of output elements of the Query x Key multiplication "OutQKMatMul": ("s", False, "UINT32"), # Activation function type of the Query x Key multiplication - "ActQKMatMul": ("s", False, "PassThroughActivation"), + "ActQKMatMul": ("s", False, "PassThroughActivation"), # Datatype of accumulator elements of the Attention x Value # multiplication @@ -68,11 +68,11 @@ def get_nodeattr_types(self): # multiplication "OutAVMatMul": ("s", False, "UINT32"), # Activation function type of the Attention x Value multiplication - "ActAVMatMul": ("s", False, "PassThroughActivation"), + "ActAVMatMul": ("s", False, "PassThroughActivation"), # Activation function type of the softmax normalization of the # attention weights - "ActASoftmax": ("s", False, "PassThroughActivation"), + "ActASoftmax": ("s", False, "PassThroughActivation"), # Mode used for providing the attention mask: There can be no mask, # a mask sent as the fourth input or a causal attention mask which @@ -427,6 +427,12 @@ def get_normal_output_shape(self, ind=0): # noqa, there is just one output 
# the embedding dimension of the values return tuple((self.get_nodeattr("QLen"), self.get_nodeattr("VDim"))) + # Gets the shape of the attention weights at index ind (there is just one) + # without folding + def get_normal_attention_shape(self, ind=0): # noqa, there is just one + # The attention weights have shape covering both sequence dimensions + return tuple((self.get_nodeattr("QLen"), self.get_nodeattr("KVLen"))) + # Gets the shape of the input at index ind with folding def get_folded_input_shape(self, ind=0): # Get the unfolded size of the input @@ -461,6 +467,17 @@ def get_folded_output_shape(self, ind=0): # noqa, there is just one output # assumed to be the second dimension return olen, embfold, odim // embfold + # Gets the shape of the attention weights at index ind (there is just one) + # with folding + def get_folded_attention_shape(self, ind=0): # noqa, there is just one + # Get the unfolded size of the attention weights + alen, adim = self.get_normal_attention_shape(ind) + # Get the folding configuration specifying the amount of parallelism + embfold, seqfold = self.folds + # The attention weights are always folded along the sequence dimension, + # which is assumed to be the second dimension + return alen, seqfold, adim // seqfold + # Widths of the input data stream of the input at index ind def get_instream_width(self, ind=0): # Get the number of bits used to represent the input @@ -493,14 +510,49 @@ def get_ap_int_max_w(self): # bit-width as well if self.get_nodeattr("mask_mode") in {"input", "causal"}: # Parallelism is the number of elements in the last dimension of the - # folded output - _, _, elems = self.get_folded_output_shape(ind=3) + # folded mask input + _, _, elems = self.get_folded_input_shape(ind=3) # Get width of the mask datatype m_bits = elems * DataType[self.get_nodeattr("MType")].bitwidth() - # TODO: Are there more intermediates which need to be considered? Yes, - # attention weights and MatMul accumulators and outputs. 
- # Find maximum of all maximal bit-widths - return max([i_bits_max, o_bits_max, m_bits]) + + # Elements per folded key input (second input) + _, _, i_elems = self.get_folded_input_shape(ind=1) + # Elements per folded value input (third input), same as the number of + # output elements + _, _, o_elems = self.get_folded_input_shape(ind=2) + + # Parallelism is the number of elements in the last dimension of the + # folded attention weights + _, _, s_elems = self.get_folded_attention_shape() + # Number of bits used for the attention weights stream + a_bits = s_elems * DataType[self.get_nodeattr("AType")].bitwidth() + + # Maximum bits per tile of the key and value matrix streams + tile_bits_max = max([ + i_elems * s_elems * DataType[self.get_nodeattr("KType")].bitwidth(), + o_elems * s_elems * DataType[self.get_nodeattr("VType")].bitwidth(), + ]) + # Maximum bits per matmul accumulators + acc_bits_max = max([ + # These are not streamed, thus single element width is counted + DataType[self.get_nodeattr("AccQKMatMul")].bitwidth(), + DataType[self.get_nodeattr("AccAVMatMul")].bitwidth(), + ]) + # Maximum bits per matmul outputs + out_bits_max = max([ + # These are the stream widths, which are always >= than individual + # elements + s_elems * DataType[self.get_nodeattr("OutQKMatMul")].bitwidth(), + o_elems * DataType[self.get_nodeattr("OutAVMatMul")].bitwidth(), + ]) + # Aggregate the maximum bit width in both matmul operators over all + # inputs, intermediates and outputs + matmul_bits_max = max([ + tile_bits_max, acc_bits_max, out_bits_max + ]) + + # Find maximum of all (maximal) bit-widths + return max([i_bits_max, o_bits_max, m_bits, a_bits, matmul_bits_max]) # Gets the number of expected output values, i.e. how many times read() # could/should be called on the output stream of this operator From acaa9b2d272257052e85addeafde55991d73f4c3 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 9 Aug 2023 11:09:06 +0200 Subject: [PATCH 18/88] Some clean up and "# noqa" to calm the IDE --- src/finn/custom_op/fpgadataflow/attention.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index cdaa597d7b..bc44072e33 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -4,12 +4,9 @@ import numpy as np # Derive custom operators form the FINN base custom op from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -# Temporarily derive custom operators from QONNX base custom op -# TODO: Remove once switching to HLSCustomOp -from qonnx.custom_op.base import CustomOp - # QONNX/FINN datatypes -from qonnx.core.datatype import DataType +from qonnx.core.datatype import DataType # noqa qonnx dependency is specified +# in setup.cfg as well as in fetch-repos.sh # Scaled Dot-Product Attention Custom Operator @@ -121,8 +118,9 @@ def make_modelwrapper_like( # Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper # Utility for creating and wrapping qonnx graphs and models - from qonnx.util.basic import qonnx_make_model - from qonnx.core.modelwrapper import ModelWrapper + from qonnx.util.basic import qonnx_make_model # noqa qonnx dependency + # is specified in setup.cfg as well as in fetch-repos.sh + from qonnx.core.modelwrapper import ModelWrapper # noqa # Convert unspecified mask to 'none' mode mask = 'none' if mask is None else mask @@ -325,7 +323,8 @@ def execute_node(self, 
context, graph): # Numpy compatible softmax implementation from scipy.special import softmax # Generate random input data for testing - from qonnx.util.basic import gen_finn_dt_tensor + from qonnx.util.basic import gen_finn_dt_tensor # noqa qonnx + # dependency is specified in setup.cfg as well as in fetch-repos.sh # Read input tensors of the query, key and value inputs from context q = context[self.onnx_node.input[0]] @@ -594,7 +593,7 @@ def hls_type(name): # given by argument list names return (f"using {name} = {hls_type(name)};" for name in names) - # Insert constants and typer aliases into the dictionary + # Insert constants and type aliases into the dictionary self.code_gen_dict["$DEFINES$"] = [ # Shape constant definitions of attention inputs (query, key and # value) and folding configuration From b41575dc9a36bdd23619baadbdded929d29a5c45 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 10 Aug 2023 12:22:23 +0200 Subject: [PATCH 19/88] [Attention] Get C++ simulation to compile and prepare inputs --- src/finn/custom_op/fpgadataflow/attention.py | 67 ++++++++++++++++--- src/finn/custom_op/fpgadataflow/hlsbackend.py | 3 + .../test_fpgadataflow_attention.py | 8 +-- 3 files changed, 64 insertions(+), 14 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index bc44072e33..19249be9b1 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -1,3 +1,5 @@ +# Operating system stuff, e.g. paths +import os # Python warning subsystem import warnings # Numpy math and arrays @@ -6,6 +8,8 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # QONNX/FINN datatypes from qonnx.core.datatype import DataType # noqa qonnx dependency is specified + + # in setup.cfg as well as in fetch-repos.sh @@ -78,6 +82,8 @@ def get_nodeattr_types(self): # Execution mode of the operator # TODO: Remove once switching to HLSCustomOp + # TODO: Not possible right now, python mode is still required by + # dummy unit test "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), }) # Return updated attribute dictionary @@ -314,9 +320,38 @@ def execute_node(self, context, graph): # The folding configuration must be valid assert self.is_valid_folding, "Invalid Folding" - # Get the mode to use for execution + # The execution mode is configured via a node attribute of the + # HLSCustomOp base. This is a string, either "cppsim" or "rtlsim". mode = self.get_nodeattr("exec_mode") + # Input data is stored in numpy files in the code generation dictionary + # TODO: Refactor this, there is too much duplication of mode checking + if mode == "cppsim" or mode == "python": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """ + Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim") + """.format(mode) + ) + + # Enumerate and name the node inputs. The mask is an optional fourth + # input. As "zip" runs over the shortest of its arguments, there will be + # no mask file generated if there is no fourth node input. + for ind, (name, context_name) in enumerate( + # TODO: Maybe configure the naming and order of inputs somewhere? 
+ zip(["q", "k", "v", "m"], self.onnx_node.input)): + # Read the input from the execution context and reshape to match the + # expected folding + x = context[context_name].reshape(self.get_folded_input_shape(ind)) + # TODO: Why do the HLSCustomOp and MatrixVectorActivation make a + # copy here? + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"{name}.npy"), x) + # Support python execution mode for now # TODO: Remove python mode once switching to HLSCustomOp if mode == "python": @@ -358,9 +393,14 @@ def execute_node(self, context, graph): context[o_name] = o # CPP Simulation of the HLS operator elif mode == "cppsim": - # TODO: Implement cppsim mode - raise NotImplementedError( - "exec_mode cppsim is not implemented yet!" + # Execute the precompiled C++ simulation program + # Note: Reusing the HLSCustomOp base implementation is probably fine + super().exec_precompiled_singlenode_model() + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out.npy")) + # Reshape the folded input and insert into the execution context + context[self.onnx_node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) ) # RTL Simulation of the HLS operator elif mode == "rtlsim": @@ -670,7 +710,7 @@ def read_npy_data(self): # TODO: Maybe these type-deductions can be removed by changing the # order of the template arguments of the npy2apintstream, such # that type-deduction is handled there? - f'using QPacked = decltype(QStream.read());', + f'using QPacked = decltype(QStream{{}}.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', @@ -678,7 +718,7 @@ def read_npy_data(self): ');', # Deduce the datatype of elements packed into the key input stream - f'using KPacked = decltype(KStream.read());', + f'using KPacked = decltype(KStream{{}}.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', @@ -686,7 +726,7 @@ def read_npy_data(self): ');', # Deduce the datatype of elements packed into the value input stream - f'using VPacked = decltype(VStream.read());', + f'using VPacked = decltype(VStream{{}}.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', @@ -701,7 +741,7 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] += [ # Deduce the datatype of elements packed into the mask input # stream - f'using MPacked = decltype(MStream.read());', + f'using MPacked = decltype(MStream{{}}.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', @@ -717,7 +757,14 @@ def strm_decl(self): # Note: Assumes stream type aliases to be set in defines 'QStream q;', 'KStream k;', 'VStream v;', 'OStream out;' ] - + # If the mask is provided as an input, it needs a stream declaration as + # well + if self.get_nodeattr("mask_mode") == "input": + # Append the mask stream to the declaration list + self.code_gen_dict["$STREAMDECLARATIONS$"] += [ + # Note: Assumes stream type aliases to be set in defines + 'MStream m;', + ] # Generates C++ code for calling the computation part of the operator def docompute(self): # Write the body of the attention top-level function @@ -752,7 +799,7 @@ def dataoutstrm(self): # output file 
self.code_gen_dict["$DATAOUTSTREAM$"] = [ # Deduce the datatype of elements packed into the output stream - f'using OPacked = decltype(OStream.read());', + f'using OPacked = decltype(OStream{{}}.read());', # Generate function call reading from stream into the output file # Note: Outputs are always represented as numpy floats f'apintstream2npy(', diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..1434e31132 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -234,6 +234,9 @@ def compile_singlenode_code(self): builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") builder.append_includes("-I$FINN_ROOT/deps/cnpy/") builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + # TODO: Is it ok to add this here? Add some specialization to the + # attention operator? Eventually integrate this into the finn-hlslib? + builder.append_includes("-I$FINN_ROOT/deps/attention-hlslib") builder.append_includes("-I$FINN_ROOT/custom_hls") builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) builder.append_includes("--std=c++14") diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index a839689ea4..8c4048d3ca 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -17,7 +17,7 @@ # Graphs transformation setting the execution mode attribute from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP # Size of query and key embedding dimension @@ -39,7 +39,7 @@ @pytest.mark.parametrize("KType", [DataType["UINT16"]]) @pytest.mark.parametrize("VType", [DataType["UINT16"]]) @pytest.mark.parametrize("MType", [DataType["UINT16"]]) -@pytest.mark.parametrize("OType", [DataType["UINT16"]]) +@pytest.mark.parametrize("OType", [DataType["UINT32"]]) # Tests python implementation of single scaled dot-product attention head def test_attention_python( QKDim, VDim, KVLen, QLen, mask, EmbFold, SeqFold, QType, KType, VType, @@ -82,11 +82,11 @@ def test_attention_python( } # Set model execution mode to python (numpy execution) model = model.transform(SetExecMode("python")) - # Generates the C++ source to be compiled as C++ simulation + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) # Prepares IP-generation - model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) # Execute the onnx model to collect the result From a718bf63f9a4168b2bae7f407443096d95521bc1 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 10 Aug 2023 15:16:22 +0200 Subject: [PATCH 20/88] [Attention] Move dummy model wrapper construction out of custom op --- src/finn/custom_op/fpgadataflow/attention.py | 138 +-------------- .../test_fpgadataflow_attention.py | 163 ++++++++++++++++-- 2 files changed, 151 insertions(+), 150 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 19249be9b1..f5727348fe 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ 
b/src/finn/custom_op/fpgadataflow/attention.py @@ -8,8 +8,6 @@ from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # QONNX/FINN datatypes from qonnx.core.datatype import DataType # noqa qonnx dependency is specified - - # in setup.cfg as well as in fetch-repos.sh @@ -116,139 +114,6 @@ def is_valid_folding(self): # All shapes must be multiples of their corresponding fold return not ((qkdim % embfold) or (vdim % embfold) or (kvlen % seqfold)) - # Generates a dummy node matching the shapes of the input numpy arrays - @staticmethod - def make_modelwrapper_like( - q, k, v, mask=None, embfold=1, seqfold=1, **dtypes - ): - # Utility types and function for creating onnx nodes and graphs - from onnx import TensorProto, helper - # Utility for creating and wrapping qonnx graphs and models - from qonnx.util.basic import qonnx_make_model # noqa qonnx dependency - # is specified in setup.cfg as well as in fetch-repos.sh - from qonnx.core.modelwrapper import ModelWrapper # noqa - - # Convert unspecified mask to 'none' mode - mask = 'none' if mask is None else mask - - # Start building the node as a dictionary of attributes - node_kwargs = { - # Refer to this operator type by its name - "op_type": "ScaledDotProductAttention", - # Execution will try to look up the implementation in the package - # referred to by the domain - "domain": "finn.custom_op.fpgadataflow", - # Execution backend: Required attribute inherited from HLSCustomOp - "backend": "fpgadataflow", - # Folding along the embedding dimensions - "EmbFold": embfold, - # Folding along the sequence dimensions - "SeqFold": seqfold - } - - # Infer the output shape from the input shapes - o_shape = (q.shape[0], v.shape[1]) - - # Create onnx value info of all inputs and outputs assuming float - # datatypes - q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, q.shape) - k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, k.shape) - v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, v.shape) - o_info = helper.make_tensor_value_info("O", TensorProto.FLOAT, o_shape) - - # Collect input and output nodes in order - inputs, outputs = [q_info, k_info, v_info], [o_info] - - # Collect all inputs/outputs to the operator node - io_kwargs = { - "inputs": ["Q", "K", "V"], "outputs": ["O"], "mask_mode": "none" - } - - # Start building the shape attributes - shape_kwargs = { - # Shared embedding dimension of the queries and keys and embedding - # dimension of the values - "QKDim": q.shape[1], "VDim": v.shape[1], - # Shared sequence length of keys and values and sequence length of - # the queries - "KVLen": k.shape[0], "QLen": q.shape[0], - } - - # Start building the datatype attributes - dtype_kwargs = { - # Datatypes of the query, key, value inputs and the output - "QType": "FLOAT32", "KType": "FLOAT32", - "VType": "FLOAT32", "OType": "FLOAT32", - } - - # If the optional mask is specified as an input - if isinstance(mask, np.ndarray) or mask == "input": - # Add the mask to the input node names - io_kwargs["inputs"].append("mask") - # Configure masking mode via io_kwargs as well - io_kwargs["mask_mode"] = "input" - # Always infer the mask shape - mask_shape = (q.shape[0], k.shape[0]) - # Create value info of the mask input - mask_info = helper.make_tensor_value_info( - "mask", TensorProto.FLOAT, mask_shape - ) - # Append the mask input as fourth input node - inputs.append(mask_info) - # Add the mask default datatype to the datatype attributes - dtype_kwargs["MType"] = "FLOAT32" - - # If a causal mask is to be generated 
during execution - if mask == "causal": - # Configure masking mode via io_kwargs as well - io_kwargs["mask_mode"] = "causal" - # Add the mask default datatype to the datatype attributes - dtype_kwargs["MType"] = "FLOAT32" - - # The optional dtypes keyword arguments must describe a subset of the - # model inputs and outputs - assert set(dtypes) <= {*dtype_kwargs, "MType"}, \ - "Specified datatype of unknown input or output" - - # Update the datatype attributes according to the keyword arguments - dtype_kwargs.update({ - key: value.name for key, value in dtypes.items() - }) - - # Create an onnx graph node by unpacking all prepared keyword arguments - node = helper.make_node( - **node_kwargs, **io_kwargs, **shape_kwargs, **dtype_kwargs - ) - # Create a graph out of the operator node and the input/output nodes - graph = helper.make_graph( - [node], inputs=inputs, outputs=outputs, name='attention_graph' - ) - # Wrap the graph in a qonnx model wrapper - model = ModelWrapper(qonnx_make_model( - graph, producer_name='attention-model' - )) - - # Add datatype annotations to all input tensors - for tensor_name in io_kwargs["inputs"]: - # Only annotate if a datatype is specified - if f'{tensor_name}Type' in dtypes: - # Update the datatype annotation - model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}Type'] - ) - - # Add datatype annotations to all output tensors - for tensor_name in io_kwargs["outputs"]: - # Only annotate if a datatype is specified - if f'{tensor_name}Type' in dtypes: - # Update the datatype annotation - model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}Type'] - ) - - # Return the constructed qonnx model wrapper - return model - # Returns an ONNX node that has the same shape inference behavior def make_shape_compatible_op(self, model): # Infer the output shape from the input shapes @@ -398,7 +263,7 @@ def execute_node(self, context, graph): super().exec_precompiled_singlenode_model() # Load the output numpy file generated by the C++ simulation out = np.load(os.path.join(code_gen_dir, f"out.npy")) - # Reshape the folded input and insert into the execution context + # Reshape the folded output and insert into the execution context context[self.onnx_node.output[0]] = out.reshape( self.get_normal_output_shape(ind=0) ) @@ -765,6 +630,7 @@ def strm_decl(self): # Note: Assumes stream type aliases to be set in defines 'MStream m;', ] + # Generates C++ code for calling the computation part of the operator def docompute(self): # Write the body of the attention top-level function diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 8c4048d3ca..83dd296286 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -1,25 +1,160 @@ # Testing framework -import pytest +import pytest # noqa pytest dependecy is listed in setup.cfg -# Use numpy for python execution / computing the ground truth expected values -import numpy as np -from qonnx.transformation.general import GiveUniqueNodeNames -# Numpy compatible implementation of the softmax operation -from scipy.special import softmax +# Utility types and function for creating onnx nodes and graphs +from onnx import TensorProto, helper -# Generate random input data for testing -from qonnx.util.basic import gen_finn_dt_tensor, DataType +# QONNX utility for generating random input data for testing and for creating +# models +from qonnx.util.basic import ( # noqa qonnx dependency is specified in + # 
setup.cfg as well as in fetch-repos.sh + gen_finn_dt_tensor, DataType, qonnx_make_model +) +# Wrapper around ONNX model with some graph manipulation utility +from qonnx.core.modelwrapper import ModelWrapper # noqa # Execute onnx model graphs -from qonnx.core.onnx_exec import execute_onnx -# Attention operator to test -from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention -from qonnx.custom_op.registry import getCustomOp -# Graphs transformation setting the execution mode attribute +from qonnx.core.onnx_exec import execute_onnx # noqa +# Graph transformation giving unique names to each node in a QONNX model graph +from qonnx.transformation.general import GiveUniqueNodeNames # noqa + +# FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +# Use numpy for python execution / computing the ground truth expected values +import numpy as np +# Numpy compatible implementation of the softmax operation +from scipy.special import softmax + + +# Generates a QONNX ModelWrapper for testing scaled dot-product attention +def make_single_sdp_modelwrapper_like( + q, k, v, mask=None, embfold=1, seqfold=1, **dtypes +): + # Convert unspecified mask to 'none' mode + mask = 'none' if mask is None else mask + + # Start building the node as a dictionary of attributes + node_kwargs = { + # Refer to this operator type by its name + "op_type": "ScaledDotProductAttention", + # Execution will try to look up the implementation in the package + # referred to by the domain + "domain": "finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + "backend": "fpgadataflow", + # Folding along the embedding dimensions + "EmbFold": embfold, + # Folding along the sequence dimensions + "SeqFold": seqfold + } + + # Infer the output shape from the input shapes + o_shape = (q.shape[0], v.shape[1]) + + # Create onnx value info of all inputs and outputs assuming float + # datatypes + q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, q.shape) + k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, k.shape) + v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, v.shape) + o_info = helper.make_tensor_value_info("O", TensorProto.FLOAT, o_shape) + + # Collect input and output nodes in order + inputs, outputs = [q_info, k_info, v_info], [o_info] + + # Collect all inputs/outputs to the operator node + io_kwargs = { + "inputs": ["Q", "K", "V"], "outputs": ["O"], "mask_mode": "none" + } + + # Start building the shape attributes + shape_kwargs = { + # Shared embedding dimension of the queries and keys and embedding + # dimension of the values + "QKDim": q.shape[1], "VDim": v.shape[1], + # Shared sequence length of keys and values and sequence length of + # the queries + "KVLen": k.shape[0], "QLen": q.shape[0], + } + + # Start building the datatype attributes + dtype_kwargs = { + # Datatypes of the query, key, value inputs and the output + "QType": "FLOAT32", "KType": "FLOAT32", + "VType": "FLOAT32", "OType": "FLOAT32", + } + + # If the optional mask is specified as an input + if isinstance(mask, np.ndarray) or mask == "input": + # Add the mask to the input node names + io_kwargs["inputs"].append("mask") + # Configure masking mode via io_kwargs as well + 
io_kwargs["mask_mode"] = "input" + # Always infer the mask shape + mask_shape = (q.shape[0], k.shape[0]) + # Create value info of the mask input + mask_info = helper.make_tensor_value_info( + "mask", TensorProto.FLOAT, mask_shape + ) + # Append the mask input as fourth input node + inputs.append(mask_info) + # Add the mask default datatype to the datatype attributes + dtype_kwargs["MType"] = "FLOAT32" + + # If a causal mask is to be generated during execution + if mask == "causal": + # Configure masking mode via io_kwargs as well + io_kwargs["mask_mode"] = "causal" + # Add the mask default datatype to the datatype attributes + dtype_kwargs["MType"] = "FLOAT32" + + # The optional dtypes keyword arguments must describe a subset of the + # model inputs and outputs + assert set(dtypes) <= {*dtype_kwargs, "MType"}, \ + "Specified datatype of unknown input or output" + + # Update the datatype attributes according to the keyword arguments + dtype_kwargs.update({ + key: value.name for key, value in dtypes.items() + }) + + # Create an onnx graph node by unpacking all prepared keyword arguments + node = helper.make_node( + **node_kwargs, **io_kwargs, **shape_kwargs, **dtype_kwargs + ) + # Create a graph out of the operator node and the input/output nodes + graph = helper.make_graph( + [node], inputs=inputs, outputs=outputs, name='attention_graph' + ) + # Wrap the graph in a qonnx model wrapper + model = ModelWrapper(qonnx_make_model( + graph, producer_name='attention-model' + )) + + # Add datatype annotations to all input tensors + for tensor_name in io_kwargs["inputs"]: + # Only annotate if a datatype is specified + if f'{tensor_name}Type' in dtypes: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, dtypes[f'{tensor_name}Type'] + ) + + # Add datatype annotations to all output tensors + for tensor_name in io_kwargs["outputs"]: + # Only annotate if a datatype is specified + if f'{tensor_name}Type' in dtypes: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, dtypes[f'{tensor_name}Type'] + ) + + # Return the constructed qonnx model wrapper + return model + + # Size of query and key embedding dimension @pytest.mark.parametrize("QKDim", [64]) # Size of value embedding dimension @@ -57,7 +192,7 @@ def test_attention_python( } # Generate the operator matching the configuration - model = ScaledDotProductAttention.make_modelwrapper_like( + model = make_single_sdp_modelwrapper_like( q, k, v, mask, EmbFold, SeqFold, **dtypes, MType=MType ) From 189a4158d73877387f64ce74da187f59141f5c8d Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 14 Aug 2023 17:18:08 +0200 Subject: [PATCH 21/88] [Attention] Refactor the cppsim unit test using thresholds in python sim This causes the C++ simulation to fail as multithreshold activations are not implemented on the HLS side yet. 
--- src/finn/custom_op/fpgadataflow/attention.py | 28 +- .../test_fpgadataflow_attention.py | 517 +++++++++++------- 2 files changed, 348 insertions(+), 197 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index f5727348fe..7f1056334d 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -458,6 +458,30 @@ def get_ap_int_max_w(self): # Find maximum of all (maximal) bit-widths return max([i_bits_max, o_bits_max, m_bits, a_bits, matmul_bits_max]) + # Minimize the accumulator bit width + def minimize_accumulator_width(self, model): # noqa: model is unused + # Ge the query, key, value and attention weights type + QType = DataType[self.get_nodeattr("QType")] # noqa + KType = DataType[self.get_nodeattr("KType")] # noqa + VType = DataType[self.get_nodeattr("VType")] # noqa + AType = DataType[self.get_nodeattr("AType")] # noqa + # Minimal and maximal possible results of query-key multiplication + qk_min = self.get_nodeattr("QKDim") * QType.min() * KType.min() + qk_max = self.get_nodeattr("QKDim") * QType.max() * KType.max() + # Minimal and maximal possible results of attention-value multiplication + av_min = self.get_nodeattr("VDim") * AType.min() * VType.min() + av_max = self.get_nodeattr("VDim") * AType.max() * VType.max() + # Update the accumulator types to fit the min-max range + # TODO: Is this correct? + _qk_max = max(-qk_min, 1 + qk_max) + acc_bit_width = np.log2(_qk_max) + 1 + acc_bit_width = int(np.ceil(acc_bit_width)) + self.set_nodeattr("AccQKMatMul", f"UINT{acc_bit_width}") + _av_max = max(-av_min, 1 + av_max) + acc_bit_width = np.log2(_av_max) + 1 + acc_bit_width = int(np.ceil(acc_bit_width)) + self.set_nodeattr("AccAVMatMul", f"UINT{acc_bit_width}") + # Gets the number of expected output values, i.e. how many times read() # could/should be called on the output stream of this operator def get_number_output_values(self): @@ -545,12 +569,12 @@ def hls_type(name): f" VType,", f" MType,", f" AType,", - f" OType,", + f" OType,", # Note: OType and last MatMul out must match f" AccQKMatMul,", f" OutQKMatMul,", f" ActQKMatMul,", f" AccAVMatMul,", - f" OutAVMatMul,", + f" OType,", # Note: OType and last MatMul out must match f" ActAVMatMul,", f" ActASoftmax", f">;", diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 83dd296286..75ffc74313 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -1,246 +1,373 @@ # Testing framework import pytest # noqa pytest dependecy is listed in setup.cfg +# Automatically generate init, repr, ... 
for classes containing a lot of +# attributes +from dataclasses import dataclass + +# Use numpy for python execution / computing the ground truth expected values +import numpy as np +# Numpy compatible implementation of the softmax operation +from scipy.special import softmax # Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper # QONNX utility for generating random input data for testing and for creating # models -from qonnx.util.basic import ( # noqa qonnx dependency is specified in - # setup.cfg as well as in fetch-repos.sh - gen_finn_dt_tensor, DataType, qonnx_make_model -) +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model # noqa qonnx +# dependency is specified in setup.cfg as well as in fetch-repos.sh +# QONNX datatypes +from qonnx.core.datatype import DataType, IntType, BaseDataType # noqa # Wrapper around ONNX model with some graph manipulation utility from qonnx.core.modelwrapper import ModelWrapper # noqa # Execute onnx model graphs from qonnx.core.onnx_exec import execute_onnx # noqa # Graph transformation giving unique names to each node in a QONNX model graph from qonnx.transformation.general import GiveUniqueNodeNames # noqa +# Multithreshold activations +from qonnx.custom_op.general.multithreshold import multithreshold # noqa # FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP - -# Use numpy for python execution / computing the ground truth expected values -import numpy as np -# Numpy compatible implementation of the softmax operation -from scipy.special import softmax -# Generates a QONNX ModelWrapper for testing scaled dot-product attention -def make_single_sdp_modelwrapper_like( - q, k, v, mask=None, embfold=1, seqfold=1, **dtypes -): - # Convert unspecified mask to 'none' mode - mask = 'none' if mask is None else mask - - # Start building the node as a dictionary of attributes - node_kwargs = { - # Refer to this operator type by its name - "op_type": "ScaledDotProductAttention", - # Execution will try to look up the implementation in the package - # referred to by the domain - "domain": "finn.custom_op.fpgadataflow", - # Execution backend: Required attribute inherited from HLSCustomOp - "backend": "fpgadataflow", - # Folding along the embedding dimensions - "EmbFold": embfold, - # Folding along the sequence dimensions - "SeqFold": seqfold - } - - # Infer the output shape from the input shapes - o_shape = (q.shape[0], v.shape[1]) - - # Create onnx value info of all inputs and outputs assuming float - # datatypes - q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, q.shape) - k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, k.shape) - v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, v.shape) - o_info = helper.make_tensor_value_info("O", TensorProto.FLOAT, o_shape) - - # Collect input and output nodes in order - inputs, outputs = [q_info, k_info, v_info], [o_info] - - # Collect all inputs/outputs to the operator node - io_kwargs = { - "inputs": ["Q", "K", "V"], "outputs": ["O"], "mask_mode": "none" - } - - # Start building the shape attributes - shape_kwargs = { - # Shared embedding dimension of the queries and keys and embedding - # dimension of the values - "QKDim": 
q.shape[1], "VDim": v.shape[1], - # Shared sequence length of keys and values and sequence length of - # the queries - "KVLen": k.shape[0], "QLen": q.shape[0], - } - - # Start building the datatype attributes - dtype_kwargs = { - # Datatypes of the query, key, value inputs and the output - "QType": "FLOAT32", "KType": "FLOAT32", - "VType": "FLOAT32", "OType": "FLOAT32", - } +# Python/Numpy model of the scaled dot-product attention operator as it is (will +# be...) implemented in the attention-hlslib +@dataclass +class MockScaledDotProductAttention: + # Embedding dimension of queries and keys + QKDim: int + # Length of the query sequence + QLen: int + # Embedding dimension of the values + VDim: int + # Length of the key and value sequence + KVLen: int + + # Folding along the embedding dimensions + EmbFold: int + # Folding along the sequence dimensions + SeqFold: int + + # Datatype of query matrix elements + QType: IntType + # Datatype of key matrix elements + KType: IntType + # Datatype of value matrix elements + VType: IntType + # Datatype of mask matrix elements + MType: IntType + # Datatype of attention weights elements + AType: IntType + # Datatype of output elements + OType: IntType + + # Datatype of accumulator elements of the Query x Key multiplication + AccQKMatMul: IntType = DataType["UINT4"] + # Datatype of output elements of the Query x Key multiplication + OutQKMatMul: IntType = DataType["UINT4"] + # Activation function type of the Query x Key multiplication + ActQKMatMul: str = "PassThroughActivation" + + # Datatype of accumulator elements of the Attention x Value + # multiplication + AccAVMatMul: IntType = DataType["UINT4"] + # Datatype of output elements of the Attention x Value + # multiplication + OutAVMatMul: IntType = DataType["UINT4"] + # Activation function type of the Attention x Value multiplication + ActAVMatMul: str = "PassThroughActivation" + + # Activation function type of the softmax normalization of the + # attention weights + ActASoftmax: str = "PassThroughActivation" + + # Initializes those parameters which depend on the initial configuration, + # which is set by the generated __init__ + def __post_init__(self): + # The last matmul output type must match with the specified output type + assert self.OType == self.OutAVMatMul + + # Converts QONNX datatypes to their name (as a string) + def maybe_name(value): + # All QONNX datatypes are instances of the BaseDataType + if isinstance(value, BaseDataType): + # Convert to the name by referring to the datatypes name + # attribute + return value.name + # Everything else is just assumed to be in the right format + return value + + # Convert all node attributes which are registered so far to a + # dictionary matching the CustomOp format, where DataTypes are converted + # to string representations of their names + self.node_attrs = { + key: maybe_name(value) for key, value in self.__dict__.items() + } + + # Dummy float type to use the threshold generator with flot inputs + @dataclass + class DummyFloat32: + # Minimum and maximum of the represented float range + _min: float + _max: float + + # Getter for minimum of the represented range + def min(self): + return self._min + + # Getter for maximum of the represented range + def max(self): + return self._max + + # Generates thresholds representing a quantized identity function + # mapping input datatype (idt) to output datatype (odt) + def make_identity_thresholds(idt, odt, repeat=1): + # The number of thresholds is determined by the range of the output + # datatype + steps 
= odt.get_num_possible_values() - 1 + # The scale, or step size, is determined by the ratio between input + # and output range + scale = (idt.max() - idt.min()) / (odt.max() - odt.min()) + # Generate step thresholds covering the input range and repeat for + # multiple matrix rows/cols + return np.array( + repeat * [[scale * i + idt.min() for i in range(steps)]] + ).astype(dtype=np.float32) + + # Generate identity function thresholds mapping the query-key matmul + # accumulator type to the specified output type + self.qk_thresholds = np.round(make_identity_thresholds( + # Note: Repeat for all KVLen cols of the attention weights + self.AccQKMatMul, self.OutQKMatMul, self.KVLen + )) + + # Generate identity function thresholds mapping the float attention + # weights to the specified integer type + self.a_thresholds = make_identity_thresholds( + # Note: Repeat for all KVLen cols of the attention weights + DummyFloat32(0.0, 1.0), self.AType, self.KVLen + ) - # If the optional mask is specified as an input - if isinstance(mask, np.ndarray) or mask == "input": - # Add the mask to the input node names - io_kwargs["inputs"].append("mask") - # Configure masking mode via io_kwargs as well - io_kwargs["mask_mode"] = "input" - # Always infer the mask shape - mask_shape = (q.shape[0], k.shape[0]) - # Create value info of the mask input - mask_info = helper.make_tensor_value_info( - "mask", TensorProto.FLOAT, mask_shape + # Generate identity function thresholds mapping the attention-value + # matmul accumulator type to the specified output type + self.av_thresholds = np.round(make_identity_thresholds( + # Note: Repeat for all VDim cols of the output + self.AccAVMatMul, self.OutAVMatMul, self.VDim + )) + + # Computes the query-key matmul with activation function simulating + # quantization via thresholding + def qk_matmul(self, query, key): + return multithreshold(query @ key.T, self.qk_thresholds) + + # Computes the softmax normalization of attention weights with activation + # function simulating quantization via thresholding + def softmax(self, attention): + # TODO: Correctly model OUR softmax implementation, especially its + # overflow handling + return multithreshold(softmax(attention, axis=1), self.a_thresholds) + + # Computes the attention-value matmul with activation function simulating + # quantization via thresholding + def av_matmul(self, attention, value): + return multithreshold(attention @ value, self.av_thresholds) + + # Computes scaled dot-product attention + def __call__(self, query, key, value): + return self.av_matmul(self.softmax(self.qk_matmul(query, key)), value) + + # Generates random sample inputs + def make_rand_input(self): + # Sample random query, key and value matrices with types and shapes + # configured as attributes + query = gen_finn_dt_tensor(self.QType, (self.QLen, self.QKDim)) + key = gen_finn_dt_tensor(self.KType, (self.KVLen, self.QKDim)) + value = gen_finn_dt_tensor(self.VType, (self.KVLen, self.VDim)) + # Return query, key, value tensors with integers represented as floats + return query, key, value + + # Creates a QONNX ModelWrapper matching the attention configuration + def make_modelwrapper(self): + # Build up the node attribute dictionary + kwargs = { + # Refer to this operator type by its name + "op_type": "ScaledDotProductAttention", + # Execution will try to look up the implementation in the package + # referred to by the domain + "domain": "finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + "backend": 
"fpgadataflow", + # Named inputs and activation thresholds + # Note: Currently no mask support + "inputs": ["Q", "K", "V"], + # Named model output + "outputs": ["O"], + # Currently no masking support + "mask_mode": "none" + } + + # Insert attributes into a new ONNX graph node + node = helper.make_node(**kwargs, **self.node_attrs) + + # Create random sample inputs for shape inference + q, k, v = self.make_rand_input() + # Infer the output shape from the input shapes + o_shape = (q.shape[0], v.shape[1]) + # Create onnx value info of all inputs and outputs assuming float + # datatypes + q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, q.shape) + k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, k.shape) + v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, v.shape) + o_info = helper.make_tensor_value_info("O", TensorProto.FLOAT, o_shape) + # Collect input and output nodes in order + inputs, outputs = [q_info, k_info, v_info], [o_info] + + # Create a graph connecting the scaled dot-product attention node to the + # input and output nodes + graph = helper.make_graph( + [node], inputs=inputs, outputs=outputs, name='attention_graph' ) - # Append the mask input as fourth input node - inputs.append(mask_info) - # Add the mask default datatype to the datatype attributes - dtype_kwargs["MType"] = "FLOAT32" - - # If a causal mask is to be generated during execution - if mask == "causal": - # Configure masking mode via io_kwargs as well - io_kwargs["mask_mode"] = "causal" - # Add the mask default datatype to the datatype attributes - dtype_kwargs["MType"] = "FLOAT32" - - # The optional dtypes keyword arguments must describe a subset of the - # model inputs and outputs - assert set(dtypes) <= {*dtype_kwargs, "MType"}, \ - "Specified datatype of unknown input or output" - - # Update the datatype attributes according to the keyword arguments - dtype_kwargs.update({ - key: value.name for key, value in dtypes.items() - }) - - # Create an onnx graph node by unpacking all prepared keyword arguments - node = helper.make_node( - **node_kwargs, **io_kwargs, **shape_kwargs, **dtype_kwargs - ) - # Create a graph out of the operator node and the input/output nodes - graph = helper.make_graph( - [node], inputs=inputs, outputs=outputs, name='attention_graph' - ) - # Wrap the graph in a qonnx model wrapper - model = ModelWrapper(qonnx_make_model( - graph, producer_name='attention-model' - )) - - # Add datatype annotations to all input tensors - for tensor_name in io_kwargs["inputs"]: - # Only annotate if a datatype is specified - if f'{tensor_name}Type' in dtypes: - # Update the datatype annotation - model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}Type'] - ) - - # Add datatype annotations to all output tensors - for tensor_name in io_kwargs["outputs"]: - # Only annotate if a datatype is specified - if f'{tensor_name}Type' in dtypes: - # Update the datatype annotation - model.set_tensor_datatype( - tensor_name, dtypes[f'{tensor_name}Type'] - ) - - # Return the constructed qonnx model wrapper - return model + # Wrap the ONNX graph in QONNX model wrapper + model = ModelWrapper(qonnx_make_model( + graph, producer_name='attention-model' + )) + + # Add datatype annotations to all input tensors + for tensor_name in kwargs["inputs"]: + # Only annotate if a datatype is specified + if f"{tensor_name}Type" in kwargs: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, DataType[kwargs[f"{tensor_name}Type"]] + ) + + # Add datatype annotations to all 
output tensors + for tensor_name in kwargs["outputs"]: + # Only annotate if a datatype is specified + if f"{tensor_name}Type" in kwargs: + # Update the datatype annotation + model.set_tensor_datatype( + tensor_name, DataType[kwargs[f"{tensor_name}Type"]] + ) + + # Set the threshold tensors as model initializer attributes of the + # appropriate type + # TODO: Uses the actual input type to the multithreshold function as + # datatype. Somehow the mvau tests always use INT32, why? + model.set_tensor_datatype("QKThresholds", self.AccQKMatMul) + model.set_initializer("QKThresholds", self.qk_thresholds) + + model.set_tensor_datatype("AThresholds", DataType["FLOAT32"]) + model.set_initializer("AThresholds", self.a_thresholds) + + model.set_tensor_datatype("AVThresholds", self.AccAVMatMul) + model.set_initializer("AVThresholds", self.av_thresholds) + + # Return the constructed qonnx model wrapper + return model # Size of query and key embedding dimension -@pytest.mark.parametrize("QKDim", [64]) +@pytest.mark.parametrize("QKDim", [4]) # Size of value embedding dimension -@pytest.mark.parametrize("VDim", [64]) +@pytest.mark.parametrize("VDim", [8]) # Length of key and value sequences -@pytest.mark.parametrize("KVLen", [256]) +@pytest.mark.parametrize("KVLen", [24]) # Length of query sequence -@pytest.mark.parametrize("QLen", [256]) -# Different modes to provide a mask -@pytest.mark.parametrize("mask", ["none"]) +@pytest.mark.parametrize("QLen", [16]) # Folding along the embedding dimensions -@pytest.mark.parametrize("EmbFold", [64]) +@pytest.mark.parametrize("EmbFold", [2]) # Folding along the sequence dimensions -@pytest.mark.parametrize("SeqFold", [256]) +@pytest.mark.parametrize("SeqFold", [8]) # Datatypes of queries, keys and values, mask and output -@pytest.mark.parametrize("QType", [DataType["UINT16"]]) -@pytest.mark.parametrize("KType", [DataType["UINT16"]]) -@pytest.mark.parametrize("VType", [DataType["UINT16"]]) -@pytest.mark.parametrize("MType", [DataType["UINT16"]]) -@pytest.mark.parametrize("OType", [DataType["UINT32"]]) -# Tests python implementation of single scaled dot-product attention head -def test_attention_python( - QKDim, VDim, KVLen, QLen, mask, EmbFold, SeqFold, QType, KType, VType, - MType, OType +@pytest.mark.parametrize("QType", [DataType["UINT4"]]) +@pytest.mark.parametrize("KType", [DataType["UINT4"]]) +@pytest.mark.parametrize("VType", [DataType["UINT4"]]) +@pytest.mark.parametrize("MType", [DataType["UINT4"]]) +@pytest.mark.parametrize("AType", [DataType["UINT4"]]) +@pytest.mark.parametrize("OType", [DataType["UINT4"]]) +# Different modes to provide a mask +@pytest.mark.parametrize("mask", ["none"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests cpp simulation of single scaled dot-product attention head +def test_attention_cppsim( + # Shape configuration + QKDim, # noqa: "Argument should be lowercase" + VDim, # noqa + KVLen, # noqa + QLen, # noqa + # Folding configuration + EmbFold, # noqa + SeqFold, # noqa + # Type configuration + QType, # noqa + KType, # noqa + VType, # noqa + MType, # noqa + AType, # noqa + OType, # noqa + # Type of mask to use: either 'none', 'input', or 'causal' + mask ): - # Generate random input data - q = gen_finn_dt_tensor(QType, (QLen, QKDim)) - k = gen_finn_dt_tensor(KType, (KVLen, QKDim)) - v = gen_finn_dt_tensor(VType, (KVLen, VDim)) - - dtypes = { - # Datatypes of the query, key, value inputs and the output - "QType": QType, "KType": 
KType, - "VType": VType, "OType": OType, - } - - # Generate the operator matching the configuration - model = make_single_sdp_modelwrapper_like( - q, k, v, mask, EmbFold, SeqFold, **dtypes, MType=MType + # Attention instance simulating in python and generating a matching QONNX + # configuration + attention = MockScaledDotProductAttention( + # Shape configuration + QKDim=QKDim, + QLen=QLen, + VDim=VDim, + KVLen=KVLen, + # Folding configuration + EmbFold=EmbFold, + SeqFold=SeqFold, + # Type configuration + QType=QType, + KType=KType, + VType=VType, + MType=MType, + AType=AType, + OType=OType, + # Accumulator type configuration + AccQKMatMul=DataType["UINT11"], + OutQKMatMul=DataType["UINT4"], + AccAVMatMul=DataType["UINT11"], + OutAVMatMul=DataType["UINT4"] ) - # Generate random input mask if the operator expects the mask as fourth - # input - if mask == "input": - mask = gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) - # If a causal attention mask is requested, generate upper triangular matrix - elif mask == "causal": - # Start zero initialized mask - mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) - # Fill upper triangular causal attention mask - mask[np.triu_indices_from(mask, 1)] = - np.inf - # No mask input requested - elif mask == "none": - # No mask is equivalent to a zero mask - mask = 0 * gen_finn_dt_tensor(DataType["FLOAT32"], (QLen, KVLen)) - + # Create a QONNX model wrapper for testing + model = attention.make_modelwrapper() + # Save the ONNX graph for debugging + model.save("attention-test.onnx") + # Sample some random inputs + q, k, v = attention.make_rand_input() # Prepare execution context context = { "Q": q, "K": k, "V": v, "mask": mask } - # Set model execution mode to python (numpy execution) - model = model.transform(SetExecMode("python")) - # Generates the C++ source to be compiled as C++ simulation + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) - # Prepares IP-generation - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + # Compute ground-truth output in software + o_expected = attention(q, k, v) # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["O"] - # Compute the attention matrix between queries and keys - attention = softmax(q @ k.T * (QKDim ** -0.5) + mask, axis=-1) - # Compute product of attention weights and value input - o_expected = attention @ v + # Log outputs for debugging + print(f"{o_expected}\n", file=open('o_expected.txt', 'w')) + print(f"{o_produced}\n", file=open('o_produced.txt', 'w')) # Test whether the expectation and the onnx model output match - assert (o_produced == o_expected).all(), "python exec failed" # noqa - - -# This is a fpgadataflow type of test -@pytest.mark.fpgadataflow -# Tests cpp simulation of single scaled dot-product attention head -def test_fpgadataflow_attention_cppsim(): - pass + assert np.allclose(o_produced, o_expected), "cppsim exec failed" # This is a fpgadataflow type of test From b00c64a0ed3ad404baf690165a34fef347ee3929 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 15 Aug 2023 10:05:11 +0200 Subject: [PATCH 22/88] [Attention] Switch to the HLS function-call operator style --- src/finn/custom_op/fpgadataflow/attention.py | 8 +------- tests/fpgadataflow/test_fpgadataflow_attention.py | 4 ++-- 2 files 
changed, 3 insertions(+), 9 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 7f1056334d..4464dd5883 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -662,13 +662,7 @@ def docompute(self): # Instantiate the attention operator and connect to the streams # Note: Assumes "Attention" to be aliased appropriate configuration # in defines with. - "Attention attention(q, k, v);", - # Transfer from input to output stream - # TODO: Get rid of this once switching to function-call style for - # the attention operator. - "for(std::size_t i = 0; i < QLen * EmbFold; ++i) {", - " out.write(attention.out.read());", - "}", + "Attention attention; attention(q, k, v, out);", ] # Generates C++ code for reading the output stream and converting back to diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 75ffc74313..28f0a7716b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -342,8 +342,6 @@ def test_attention_cppsim( # Create a QONNX model wrapper for testing model = attention.make_modelwrapper() - # Save the ONNX graph for debugging - model.save("attention-test.onnx") # Sample some random inputs q, k, v = attention.make_rand_input() # Prepare execution context @@ -365,6 +363,8 @@ def test_attention_cppsim( # Log outputs for debugging print(f"{o_expected}\n", file=open('o_expected.txt', 'w')) print(f"{o_produced}\n", file=open('o_produced.txt', 'w')) + # Save the ONNX model graph for debugging + model.save("attention-cppsim.onnx") # Test whether the expectation and the onnx model output match assert np.allclose(o_produced, o_expected), "cppsim exec failed" From 094f920624a726936f3f709b3f2d164170cc173e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 16 Aug 2023 17:28:51 +0200 Subject: [PATCH 23/88] [Attention] Refactor towards thresholds HLS code generation --- src/finn/custom_op/fpgadataflow/attention.py | 178 ++++++++++++++++-- .../test_fpgadataflow_attention.py | 31 +-- 2 files changed, 180 insertions(+), 29 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 4464dd5883..426927e2ba 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -6,9 +6,13 @@ import numpy as np # Derive custom operators form the FINN base custom op from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +# Convert and pack (numpy) data for C++ code generation +from finn.util.data_packing import numpy_to_hls_code # QONNX/FINN datatypes from qonnx.core.datatype import DataType # noqa qonnx dependency is specified # in setup.cfg as well as in fetch-repos.sh +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper # noqa # Scaled Dot-Product Attention Custom Operator @@ -58,7 +62,7 @@ def get_nodeattr_types(self): # Datatype of output elements of the Query x Key multiplication "OutQKMatMul": ("s", False, "UINT32"), # Activation function type of the Query x Key multiplication - "ActQKMatMul": ("s", False, "PassThroughActivation"), + "ActQKMatMul": ("s", False, "none", {"none", "thresholds"}), # Datatype of accumulator elements of the Attention x Value # multiplication @@ -67,11 +71,17 @@ def get_nodeattr_types(self): # multiplication "OutAVMatMul": ("s", False, "UINT32"), # Activation 
function type of the Attention x Value multiplication - "ActAVMatMul": ("s", False, "PassThroughActivation"), + "ActAVMatMul": ("s", False, "none", {"none", "thresholds"}), + + # Datatype of softmax normalization before applying activation or + # type cast. THis is called Acc to stick to the naming scheme of the + # MatMul operators before. + # Note: Currently this is ALWAYS floats + "AccASoftmax": ("s", False, "FLOAT32"), # Activation function type of the softmax normalization of the # attention weights - "ActASoftmax": ("s", False, "PassThroughActivation"), + "ActASoftmax": ("s", False, "none", {"none", "thresholds"}), # Mode used for providing the attention mask: There can be no mask, # a mask sent as the fourth input or a causal attention mask which @@ -203,12 +213,24 @@ def execute_node(self, context, graph): """.format(mode) ) - # Enumerate and name the node inputs. The mask is an optional fourth - # input. As "zip" runs over the shortest of its arguments, there will be - # no mask file generated if there is no fourth node input. + # Give names to the ordered node inputs which are always present: This + # serves as a translation table for mapping QONNX node index via the + # execution context name to this internal name for generating i/o files + # TODO: Maybe configure the naming and order of inputs somewhere more + # global? + named_inputs = ["q", "k", "v"] + # The mask is an optional fourth input. While "zip" runs over the + # shortest of its arguments, there would be no mask file generated if + # there is no fourth node input. However, the fourth input might be + # occupied by one of the thresholds instead, which is not an actual + # input and thus the mask ust be appended conditionally here. + if self.get_nodeattr("mask_mode") == "input": + named_inputs.append("m") + + # Enumerate and name the node inputs. for ind, (name, context_name) in enumerate( - # TODO: Maybe configure the naming and order of inputs somewhere? - zip(["q", "k", "v", "m"], self.onnx_node.input)): + zip(named_inputs, self.onnx_node.input) + ): # Read the input from the execution context and reshape to match the # expected folding x = context[context_name].reshape(self.get_folded_input_shape(ind)) @@ -289,13 +311,41 @@ def verify_node(self): # Gets the datatype of input at index ind def get_input_datatype(self, ind=0): # Ordered list of names of allowed inputs - inputs = ["Q", "K", "V"] + inputs = ["QType", "KType", "VType"] + # If the attention mask is provided as input, it has a type as well if self.get_nodeattr("mask_mode") == "input": # The mask type is an attribute itself - inputs += ["mask"] + inputs += ["MType"] + + # TODO: All the following types are probably never requested, they are + # implemented for the sake of completeness for now. If they are ever + # actually required, check whether the following defaults and dummies + # actually still make sense. 
+ + # If there is a thresholding activation for the first matmul, it will + # have a type as well + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # The thresholds will always be of the accumulator type as the + # activation maps from AccQKMatMul to OutQKMatMul + inputs += ["AccQKMatMul"] + + # If there is a thresholding activation for the second matmul, it will + # have a type as well + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # The thresholds will always be of the accumulator type as the + # activation maps from AccAVMatMul to OutAVMatMul + inputs += ["AccAVMatMul"] + + # If there is a thresholding activation for the softmax normalization, + # it will have a type as well + if self.get_nodeattr("ActASoftmax") == "thresholds": + # While there is a dummy configurable attribute describing the + # threshold type of the softmax, these are currently always floats + inputs += ["AccASoftmax"] + # Look up datatype name in attributes and convert to DataType - return DataType[self.get_nodeattr(f"{inputs[ind]}Type")] + return DataType[self.get_nodeattr(f"{inputs[ind]}")] # Gets the datatype of the output (at index ind, but there is just one) def get_output_datatype(self, ind=0): @@ -315,12 +365,37 @@ def get_normal_input_shape(self, ind=0): # Value input sequence (self.get_nodeattr("KVLen"), self.get_nodeattr("VDim")), ] + # If the attention mask is provided as input, it has a shape as well if self.get_nodeattr("mask_mode") == "input": # Mask shape is inferred from query and key sequence lengths inputs_shapes += [ (self.get_nodeattr("QLen"), self.get_nodeattr("KVLen")) ] + + # TODO: All the following shapes are probably never requested, they are + # implemented for the sake of completeness for now. If they are ever + # actually required, remember to insert meaningful shapes. + + # If there is a thresholding activation for the first matmul, these will + # be the next input index after the (optional) mask + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # TODO: This is just a dummy shape + inputs_shapes += [(0, 0)] + + # If there is a thresholding activation for the second matmul, these + # will be the next input index after the (optional) first thresholds + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # TODO: This is just a dummy shape + inputs_shapes += [(0, 0)] + + # If there is a thresholding activation for the softmax normalization, + # these will be the next (and last) input index after the (optional) + # second thresholds + if self.get_nodeattr("ActASoftmax") == "thresholds": + # TODO: This is just a dummy shape + inputs_shapes += [(0, 0)] + # Get the shape by indexing into the ordered list of all inputs return inputs_shapes[ind] @@ -358,7 +433,14 @@ def get_folded_input_shape(self, ind=0): # corresponds to the KVLen return ilen, seqfold, idim // seqfold - # If this point is reached, something went wrong + # If this point is reached, probably something went wrong + # TODO: Requesting the folded shape of thresholds will reach here. Can + # this actually happen? Probably it is indeed an error, there should be + # no reason to ask for the shape of the thresholds, just ask for the + # initializer and get its shape? Folding of the thresholds behaves + # differently and would require to actually keep track of mapping + # indices to optional inputs to correctly associate the folding + # dimensions. 
raise Exception(f"Requested shape of invalid input index {ind}") # Gets the shape of the output at index ind (there is just one) with folding @@ -460,7 +542,7 @@ def get_ap_int_max_w(self): # Minimize the accumulator bit width def minimize_accumulator_width(self, model): # noqa: model is unused - # Ge the query, key, value and attention weights type + # Get the query, key, value and attention weights type QType = DataType[self.get_nodeattr("QType")] # noqa KType = DataType[self.get_nodeattr("KType")] # noqa VType = DataType[self.get_nodeattr("VType")] # noqa @@ -497,6 +579,52 @@ def global_includes(self): # Attention operator HLS code self.code_gen_dict["$GLOBALS$"] += ['#include "attention.hpp"'] + # Generates C++ parameters file, i.e. activation function thresholds + def generate_params(self, model: ModelWrapper, path): + # The code generation directory is specified as an argument, so this + # will work for both RTL and C++ simulation + code_gen_dir = path + + # Note: The attention operator itself has no weights to be generated as + # a parameter file + + # Start all three activations defaulting to pass-through of the + # accumulator type. + # Note: This might allow type-casts to the output types if they are + # not the same as the accumulators. + act_qk_matmul = "PassThroughActivation" + act_av_matmul = "PassThroughActivation" + act_a_softmax = "PassThroughActivation" + + # Query-key matmul can have an optional activation function set to + # thresholding activations via node attribute + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # In this case there will be a thresholds parameter initializer + # TODO: Is hard coding the name here ok? + thresholds = model.get_initializer("thresholds_qk_matmul") + # Number of thresholds is given as the last dimension of the + # threshold tensor, first dimension is covering all output elements + num = thresholds.shape[-1] + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccQKMatMul")] + # Format the thresholds as C++ array code + thresholds = numpy_to_hls_code( + thresholds, thresholds_dtype, "_", False, True + ) + # Replace default pass-through activation by thresholding activation + act_qk_matmul = ( + f"ThresholdsActivation<" + f" SeqFold, KVLen / SeqFold, {num}, AccQKMatMul, OutQKMatMul" + f">" + ) + # Open a file to store the thresholds parameters as C++ code + with open(f"{code_gen_dir}/params.hpp", "w") as file: + # Start writing a type alias definitions + file.write("\n".join([ + f"using ActQKMatMul = {act_qk_matmul};", + f"ActQKMatMul act_qk_matmul = {thresholds};" + ])) + # Generates C++ code of type alias, global constant and macro definitions def defines(self, var): # Generate shape definitions from attributes to C++ constant definitions @@ -553,9 +681,21 @@ def hls_type(name): "OutAVMatMul" ), # Type alias definitions for the activation functions - f"using ActQKMatMul = {self.get_nodeattr('ActQKMatMul')};", - f"using ActAVMatMul = {self.get_nodeattr('ActAVMatMul')};", - f"using ActASoftmax = {self.get_nodeattr('ActASoftmax')};", + # f"using ActQKMatMul = {self.get_nodeattr('ActQKMatMul')};", + f"using ActQKMatMul = ThresholdsActivation<", + f" SeqFold, KVLen / SeqFold, 16, AccQKMatMul, OutQKMatMul", + f">;", + + # f"using ActAVMatMul = {self.get_nodeattr('ActAVMatMul')};", + f"using ActAVMatMul = ThresholdsActivation<", + f" EmbFold, VDim / EmbFold, 16, AccAVMatMul, OutAVMatMul", + f">;", + + # f"using ActASoftmax = {self.get_nodeattr('ActASoftmax')};", + f"using ActASoftmax = 
ThresholdsActivation<", + f" SeqFold, KVLen / SeqFold, 16, float, AType", + f">;", + # Type alias of the properly configured attention operator class f"using Attention = ScaledDotProductAttention<", f" QKDim,", @@ -662,7 +802,11 @@ def docompute(self): # Instantiate the attention operator and connect to the streams # Note: Assumes "Attention" to be aliased appropriate configuration # in defines with. - "Attention attention; attention(q, k, v, out);", + "Attention attention;" + # "{" + # " act_qk_matmul, act_av_matmul, act_a_softmax" + # "};", + "attention(q, k, v, out);", ] # Generates C++ code for reading the output stream and converting back to diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 28f0a7716b..3ddb7402ed 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -69,7 +69,7 @@ class MockScaledDotProductAttention: # Datatype of output elements of the Query x Key multiplication OutQKMatMul: IntType = DataType["UINT4"] # Activation function type of the Query x Key multiplication - ActQKMatMul: str = "PassThroughActivation" + ActQKMatMul: str = "thresholds" # Datatype of accumulator elements of the Attention x Value # multiplication @@ -78,11 +78,11 @@ class MockScaledDotProductAttention: # multiplication OutAVMatMul: IntType = DataType["UINT4"] # Activation function type of the Attention x Value multiplication - ActAVMatMul: str = "PassThroughActivation" + ActAVMatMul: str = "thresholds" # Activation function type of the softmax normalization of the # attention weights - ActASoftmax: str = "PassThroughActivation" + ActASoftmax: str = "thresholds" # Initializes those parameters which depend on the initial configuration, # which is set by the generated __init__ @@ -191,6 +191,13 @@ def make_rand_input(self): # Creates a QONNX ModelWrapper matching the attention configuration def make_modelwrapper(self): + # Named threshold inputs + # Note: Order matters... + thresholds = [ + "thresholds_qk_matmul", + "thresholds_av_matmul", + "thresholds_a_softmax" + ] # Build up the node attribute dictionary kwargs = { # Refer to this operator type by its name @@ -201,11 +208,11 @@ def make_modelwrapper(self): # Execution backend: Required attribute inherited from HLSCustomOp "backend": "fpgadataflow", # Named inputs and activation thresholds - # Note: Currently no mask support - "inputs": ["Q", "K", "V"], + # TODO: Currently no masking support + "inputs": ["Q", "K", "V", *thresholds], # Named model output "outputs": ["O"], - # Currently no masking support + # TODO: Currently no masking support "mask_mode": "none" } @@ -257,14 +264,14 @@ def make_modelwrapper(self): # appropriate type # TODO: Uses the actual input type to the multithreshold function as # datatype. Somehow the mvau tests always use INT32, why? 
- model.set_tensor_datatype("QKThresholds", self.AccQKMatMul) - model.set_initializer("QKThresholds", self.qk_thresholds) + model.set_tensor_datatype("thresholds_qk_matmul", self.AccQKMatMul) + model.set_initializer("thresholds_qk_matmul", self.qk_thresholds) - model.set_tensor_datatype("AThresholds", DataType["FLOAT32"]) - model.set_initializer("AThresholds", self.a_thresholds) + model.set_tensor_datatype("thresholds_a_softmax", DataType["FLOAT32"]) + model.set_initializer("thresholds_a_softmax", self.a_thresholds) - model.set_tensor_datatype("AVThresholds", self.AccAVMatMul) - model.set_initializer("AVThresholds", self.av_thresholds) + model.set_tensor_datatype("thresholds_av_matmul", self.AccAVMatMul) + model.set_initializer("thresholds_av_matmul", self.av_thresholds) # Return the constructed qonnx model wrapper return model From 5d2836a2ea3e3554504c93efc85c0cb3a29e076d Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 17 Aug 2023 13:32:46 +0200 Subject: [PATCH 24/88] [Attention] Generate HLS code for all three activation thresholds Note: The threshold parameters are generated and included but not connected to the attention operator yet. The attention operator uses uninitialized thresholds of the same type and shape. --- src/finn/custom_op/fpgadataflow/attention.py | 191 +++++++++++++++---- 1 file changed, 156 insertions(+), 35 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 426927e2ba..678b4a81ea 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -4,6 +4,7 @@ import warnings # Numpy math and arrays import numpy as np + # Derive custom operators form the FINN base custom op from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # Convert and pack (numpy) data for C++ code generation @@ -13,6 +14,8 @@ # in setup.cfg as well as in fetch-repos.sh # QONNX wrapper to ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper # noqa +# Partitions tensor into folded/pe groups +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions # noqa # Scaled Dot-Product Attention Custom Operator @@ -579,6 +582,46 @@ def global_includes(self): # Attention operator HLS code self.code_gen_dict["$GLOBALS$"] += ['#include "attention.hpp"'] + # Converts names of optional inputs to the node input index and from there + # to the ONNX node input name if the input is present. + # Note: This mapping is required as the ONNX graph/node may provide + # different names (in particular automatically generated unique names) and + # some of these are optional inputs. + def get_input_name_by_name(self, name): + # Ordered names of the (optional) threshold inputs + thresholds = [ + "thresholds_qk_matmul", + "thresholds_av_matmul", + "thresholds_a_softmax" + ] + + # Ordered names of primary query, key, value inputs and optional mask + # and threshold inputs. 
+ inputs = ["Q", "K", "V", "M", *thresholds] + + # Specify for each input whether it is present or not + inputs_present = [ + # Note: Primary inputs are always present, the mask is present in + # input mask mode + True, True, True, self.get_nodeattr("mask_mode") == "input", + ] + + # Thresholds are present if the activation function is set to + # thresholds + inputs_present.extend([ + self.get_nodeattr("ActQKMatMul") == "thresholds", + self.get_nodeattr("ActAVMatMul") == "thresholds", + self.get_nodeattr("ActASoftmax") == "thresholds" + ]) + + # Filter the ordered list of input names for those which are actually + # present + inputs = [x for x, present in zip(inputs, inputs_present) if present] + + # Find the position of the requested input name and look up the + # corresponding input name of the ONNX node + return self.onnx_node.input[inputs.index(name)] + # Generates C++ parameters file, i.e. activation function thresholds def generate_params(self, model: ModelWrapper, path): # The code generation directory is specified as an argument, so this @@ -596,34 +639,123 @@ def generate_params(self, model: ModelWrapper, path): act_av_matmul = "PassThroughActivation" act_a_softmax = "PassThroughActivation" + # Start all thresholds defaulting to empty default initializer braces + thresholds_qk_matmul = "{}" + thresholds_av_matmul = "{}" + thresholds_a_softmax = "{}" + + # Prepares a threshold tensor as C++ string for code generation + def prepare_thresholds(ts, length, fold, dtype): + # Number of thresholds is given as the last dimension of the + # threshold tensor, first dimension is covering all output elements + num = ts.shape[-1] # noqa + # Partition the thresholds along the length into folds of parallel + # elements + ts = interleave_matrix_outer_dim_from_partitions(ts, length // fold) + # Reshape folded thresholds adding an outer dimension + # TODO: Why? MVAU does this, just copied the behavior. This is + # probably to generate the outer C++ initializer braces {} for + # object construction. Isn't it weird to rely on an artificial + # dimension just to have the code generator produce the correct + # string? + ts = ts.reshape(1, length // fold, fold, num) + # Format the thresholds as C++ array code + # Note: no packing, no variable name/type declaration + return numpy_to_hls_code(ts, dtype, "_", False, True), num + + # Get shape and folding configuration. None of the activations fold + # along the query-key embedding dimension or the query sequence length + (_, _, vdim, kvlen), (embfold, seqfold) = self.shapes, self.folds + # Query-key matmul can have an optional activation function set to # thresholding activations via node attribute if self.get_nodeattr("ActQKMatMul") == "thresholds": # In this case there will be a thresholds parameter initializer - # TODO: Is hard coding the name here ok? 
- thresholds = model.get_initializer("thresholds_qk_matmul") - # Number of thresholds is given as the last dimension of the - # threshold tensor, first dimension is covering all output elements - num = thresholds.shape[-1] + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_qk_matmul") + ) # Get the datatype of the thresholds thresholds_dtype = DataType[self.get_nodeattr("AccQKMatMul")] - # Format the thresholds as C++ array code - thresholds = numpy_to_hls_code( - thresholds, thresholds_dtype, "_", False, True + # Format the thresholds as C++ array code: QK matmul outputs fold + # along the key-value sequence length dimension + thresholds_qk_matmul, num = prepare_thresholds( + thresholds, kvlen, seqfold, thresholds_dtype ) # Replace default pass-through activation by thresholding activation - act_qk_matmul = ( - f"ThresholdsActivation<" - f" SeqFold, KVLen / SeqFold, {num}, AccQKMatMul, OutQKMatMul" + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_qk_matmul = "\n".join([ + f"ThresholdsActivation<", + f" SeqFold, KVLen/SeqFold, {num}, AccQKMatMul, OutQKMatMul", f">" + ]) + + # Attention-value matmul can have an optional activation function set to + # thresholding activations via node attribute + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # In this case there will be a thresholds parameter initializer + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_av_matmul") ) - # Open a file to store the thresholds parameters as C++ code - with open(f"{code_gen_dir}/params.hpp", "w") as file: - # Start writing a type alias definitions - file.write("\n".join([ - f"using ActQKMatMul = {act_qk_matmul};", - f"ActQKMatMul act_qk_matmul = {thresholds};" - ])) + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] + # Format the thresholds as C++ array code: AV matmul outputs fold + # along the value embedding dimension + thresholds_av_matmul, num = prepare_thresholds( + thresholds, vdim, embfold, thresholds_dtype + ) + # Replace default pass-through activation by thresholding activation + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_av_matmul = "\n".join([ + f"ThresholdsActivation<", + f" EmbFold, VDim/EmbFold, {num}, AccAVMatMul, OutAVMatMul", + f">" + ]) + + # Softmax can have an optional activation function set to thresholding + # activations via node attribute + if self.get_nodeattr("ActASoftmax") == "thresholds": + # In this case there will be a thresholds parameter initializer + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_a_softmax") + ) + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] + # Format the thresholds as C++ array code: Softmax outputs fold + # along the key-value sequence length dimension + thresholds_a_softmax, num = prepare_thresholds( + thresholds, kvlen, seqfold, thresholds_dtype + ) + # Replace default pass-through activation by thresholding activation + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_a_softmax = "\n".join([ + f"ThresholdsActivation<", + f" SeqFold, KVLen/SeqFold, {num}, AccASoftmax, AType", + f">" + ]) + + # Open a file to store the thresholds parameters as C++ code + with open(f"{code_gen_dir}/params.hpp", "w") as file: + # Write lines of C++ code separated by newlines to the file + file.write("\n".join([ + # Add type definition and 
threshold initialization of the + # query-key matmul activation + f"using ActQKMatMul = {act_qk_matmul};", + f"ActQKMatMul act_qk_matmul = {thresholds_qk_matmul};", + # Add type definition and threshold initialization of the + # attention-value matmul activation + f"using ActAVMatMul = {act_av_matmul};", + f"ActAVMatMul act_av_matmul = {thresholds_av_matmul};", + # Add type definition and threshold initialization of the + # softmax activation + f"using ActASoftmax = {act_a_softmax};", + f"ActASoftmax act_a_softmax = {thresholds_a_softmax};", + # Append a newline at the end of the file (to avoid problems + # when including, required by C standard?) + "\n" + ])) # Generates C++ code of type alias, global constant and macro definitions def defines(self, var): @@ -678,24 +810,13 @@ def hls_type(name): "AccQKMatMul", "OutQKMatMul", "AccAVMatMul", - "OutAVMatMul" + "OutAVMatMul", + "AccASoftmax" ), - # Type alias definitions for the activation functions - # f"using ActQKMatMul = {self.get_nodeattr('ActQKMatMul')};", - f"using ActQKMatMul = ThresholdsActivation<", - f" SeqFold, KVLen / SeqFold, 16, AccQKMatMul, OutQKMatMul", - f">;", - - # f"using ActAVMatMul = {self.get_nodeattr('ActAVMatMul')};", - f"using ActAVMatMul = ThresholdsActivation<", - f" EmbFold, VDim / EmbFold, 16, AccAVMatMul, OutAVMatMul", - f">;", - - # f"using ActASoftmax = {self.get_nodeattr('ActASoftmax')};", - f"using ActASoftmax = ThresholdsActivation<", - f" SeqFold, KVLen / SeqFold, 16, float, AType", - f">;", - + # Include the activation function type definitions and parameters + # Note: The typedefs in this header require the typedefs above, + # thus adding this to the global includes is not possible. + f'#include "params.hpp"', # Type alias of the properly configured attention operator class f"using Attention = ScaledDotProductAttention<", f" QKDim,", From 76e5e0e244f8ffa2b3e2a22cbb77f0c3ec471329 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 17 Aug 2023 14:37:54 +0200 Subject: [PATCH 25/88] [Attention] Initialize the attention operator using generated thresholds --- src/finn/custom_op/fpgadataflow/attention.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 678b4a81ea..dc2a3bbc94 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -920,13 +920,14 @@ def strm_decl(self): def docompute(self): # Write the body of the attention top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ - # Instantiate the attention operator and connect to the streams + # Instantiate the attention operator and connect to the generated + # threshold parameters # Note: Assumes "Attention" to be aliased appropriate configuration # in defines with. 
- "Attention attention;" - # "{" - # " act_qk_matmul, act_av_matmul, act_a_softmax" - # "};", + "Attention attention {", + " act_qk_matmul, act_av_matmul, act_a_softmax", + "};", + # Connect the attention operator to the input and output streams "attention(q, k, v, out);", ] From b152f23509ae45e60bf489ce7d23daf79faaa417 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 17 Aug 2023 17:36:35 +0200 Subject: [PATCH 26/88] [Attention] Numpy softmax matching overflow behavior of the HLS operator --- .../test_fpgadataflow_attention.py | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 3ddb7402ed..2ae9841804 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -6,8 +6,6 @@ # Use numpy for python execution / computing the ground truth expected values import numpy as np -# Numpy compatible implementation of the softmax operation -from scipy.special import softmax # Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper @@ -33,6 +31,25 @@ from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +# Softmax function on numpy arrays with overflow handling matching the HLS +# operator +def softmax(x, axis): + # For overflow handling, find the maximum value along axis and place ones at + # each occurrence + max_ones = (x == np.max(x, axis=axis, keepdims=True)).astype(np.float32) + # Count the occurrences of the maximum along the normalization axis + max_counts = np.sum(max_ones, axis=axis, keepdims=True) + # Exponential of the input + exp = np.exp(x) + # Compute the total along axis + total = np.sum(exp, axis=1, keepdims=True) + # Detect overflow of the summation + overflow = np.isinf(total) + # Replace overflows by equal weight given to all instances of the maximum + # input value. For non overflow just compute normal softmax + return np.where(overflow, max_ones / max_counts, exp / total) + + # Python/Numpy model of the scaled dot-product attention operator as it is (will # be...) 
implemented in the attention-hlslib @dataclass @@ -166,9 +183,12 @@ def qk_matmul(self, query, key): # Computes the softmax normalization of attention weights with activation # function simulating quantization via thresholding def softmax(self, attention): - # TODO: Correctly model OUR softmax implementation, especially its - # overflow handling - return multithreshold(softmax(attention, axis=1), self.a_thresholds) + # Input and output scale factors for float <-> int conversion + iscale = 1.0 / (self.OutQKMatMul.get_num_possible_values() - 1) + # Scale the inputs, normalize using softmax and activate via thresholds + return multithreshold( + softmax(iscale * attention, axis=1), self.a_thresholds + ) # Computes the attention-value matmul with activation function simulating # quantization via thresholding From 8bd5a20f1c92e1e41b69119f695523220da0d57e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 21 Aug 2023 14:45:03 +0200 Subject: [PATCH 27/88] [Attention] Satisfy attention output type constraint --- tests/fpgadataflow/test_fpgadataflow_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 2ae9841804..c7c60d0ca9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -364,7 +364,7 @@ def test_attention_cppsim( AccQKMatMul=DataType["UINT11"], OutQKMatMul=DataType["UINT4"], AccAVMatMul=DataType["UINT11"], - OutAVMatMul=DataType["UINT4"] + OutAVMatMul=OType ) # Create a QONNX model wrapper for testing From 65de26d1020684a23b60a77367ee8265c28bc459 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 21 Aug 2023 14:50:10 +0200 Subject: [PATCH 28/88] [Attention] Increase test bitwidth to see some more interesting behavior --- .../test_fpgadataflow_attention.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index c7c60d0ca9..0e137247b2 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -310,12 +310,12 @@ def make_modelwrapper(self): # Folding along the sequence dimensions @pytest.mark.parametrize("SeqFold", [8]) # Datatypes of queries, keys and values, mask and output -@pytest.mark.parametrize("QType", [DataType["UINT4"]]) -@pytest.mark.parametrize("KType", [DataType["UINT4"]]) -@pytest.mark.parametrize("VType", [DataType["UINT4"]]) -@pytest.mark.parametrize("MType", [DataType["UINT4"]]) -@pytest.mark.parametrize("AType", [DataType["UINT4"]]) -@pytest.mark.parametrize("OType", [DataType["UINT4"]]) +@pytest.mark.parametrize("QType", [DataType["UINT6"]]) +@pytest.mark.parametrize("KType", [DataType["UINT6"]]) +@pytest.mark.parametrize("VType", [DataType["UINT6"]]) +@pytest.mark.parametrize("MType", [DataType["UINT6"]]) +@pytest.mark.parametrize("AType", [DataType["UINT6"]]) +@pytest.mark.parametrize("OType", [DataType["UINT6"]]) # Different modes to provide a mask @pytest.mark.parametrize("mask", ["none"]) # This is a slow running fpgadataflow type of test which requires vivado @@ -361,9 +361,9 @@ def test_attention_cppsim( AType=AType, OType=OType, # Accumulator type configuration - AccQKMatMul=DataType["UINT11"], - OutQKMatMul=DataType["UINT4"], - AccAVMatMul=DataType["UINT11"], + AccQKMatMul=DataType["UINT16"], + OutQKMatMul=DataType["UINT6"], + 
AccAVMatMul=DataType["UINT16"], OutAVMatMul=OType ) From ce1e19bd5901938f73bb2ef84dd1ac560765464f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 21 Aug 2023 15:10:45 +0200 Subject: [PATCH 29/88] [Attention] Remove python mode node execution --- src/finn/custom_op/fpgadataflow/attention.py | 47 +------------------- 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index dc2a3bbc94..87ffc572ad 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -90,12 +90,6 @@ def get_nodeattr_types(self): # a mask sent as the fourth input or a causal attention mask which # is generated by the operator itself. "mask_mode": ("s", True, "none", {"none", "input", "causal"}), - - # Execution mode of the operator - # TODO: Remove once switching to HLSCustomOp - # TODO: Not possible right now, python mode is still required by - # dummy unit test - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), }) # Return updated attribute dictionary return attrs @@ -242,47 +236,8 @@ def execute_node(self, context, graph): # Save the folded inputs to file to be used by simulation np.save(os.path.join(code_gen_dir, f"{name}.npy"), x) - # Support python execution mode for now - # TODO: Remove python mode once switching to HLSCustomOp - if mode == "python": - # Numpy compatible softmax implementation - from scipy.special import softmax - # Generate random input data for testing - from qonnx.util.basic import gen_finn_dt_tensor # noqa qonnx - # dependency is specified in setup.cfg as well as in fetch-repos.sh - - # Read input tensors of the query, key and value inputs from context - q = context[self.onnx_node.input[0]] - k = context[self.onnx_node.input[1]] - v = context[self.onnx_node.input[2]] - # Get the shared embedding dimension of queries and keys - d = self.get_nodeattr('QKDim') - # Start with zero mask - mask = 0 - # The actual attention mask may be provided as the fourth input - if self.get_nodeattr("mask_mode") == "input": - # Get the mask tensor from the execution context - mask = context[self.onnx_node.input[3]] - # Another option is to generate a causal attention mask on the fly - elif self.get_nodeattr("mask_mode") == "causal": - # Get the datatype of the attention mask - mask_dtype = DataType[self.get_nodeattr("MType")] - # Start with an all zero attention mask - mask = 0 * gen_finn_dt_tensor( - mask_dtype, (q.shape[0], k.shape[0]) - ) - # Generate upper triangular causal attention mask - mask[np.triu_indices_from(mask, 1)] = - np.inf - # Compute the attention matrix between queries and keys - attention = softmax(q @ k.T * (d ** -0.5) + mask, axis=-1) - # Compute product of attention weights and value input - o = attention @ v - # Get the name of the output - o_name = self.onnx_node.output[0] - # Save the output tensor to the execution context - context[o_name] = o # CPP Simulation of the HLS operator - elif mode == "cppsim": + if mode == "cppsim": # Execute the precompiled C++ simulation program # Note: Reusing the HLSCustomOp base implementation is probably fine super().exec_precompiled_singlenode_model() From 7500606357e9812b419b9f6de430a34ac89dd575 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 22 Aug 2023 13:29:08 +0200 Subject: [PATCH 30/88] [Attention] Fix cppsim test accumulator bitwidth Note: Currently there is no method for optimizing the accumulator width of both, the HLSCustomOp and the python 
simulation. Thus, to make the tests pass, both must be specified manually to the maximum possible accumulator bitwidth. Doing the MinimizeAccumulatorWidth transform would cause the HLS and python operator behavior to diverge. --- src/finn/custom_op/fpgadataflow/attention.py | 4 ++-- tests/fpgadataflow/test_fpgadataflow_attention.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 87ffc572ad..ec67eabc94 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -509,8 +509,8 @@ def minimize_accumulator_width(self, model): # noqa: model is unused qk_min = self.get_nodeattr("QKDim") * QType.min() * KType.min() qk_max = self.get_nodeattr("QKDim") * QType.max() * KType.max() # Minimal and maximal possible results of attention-value multiplication - av_min = self.get_nodeattr("VDim") * AType.min() * VType.min() - av_max = self.get_nodeattr("VDim") * AType.max() * VType.max() + av_min = self.get_nodeattr("KVLen") * AType.min() * VType.min() + av_max = self.get_nodeattr("KVLen") * AType.max() * VType.max() # Update the accumulator types to fit the min-max range # TODO: Is this correct? _qk_max = max(-qk_min, 1 + qk_max) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 0e137247b2..67f991b4aa 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -298,13 +298,13 @@ def make_modelwrapper(self): # Size of query and key embedding dimension -@pytest.mark.parametrize("QKDim", [4]) +@pytest.mark.parametrize("QKDim", [4, 8, 16]) # Size of value embedding dimension -@pytest.mark.parametrize("VDim", [8]) +@pytest.mark.parametrize("VDim", [4, 8, 16]) # Length of key and value sequences -@pytest.mark.parametrize("KVLen", [24]) +@pytest.mark.parametrize("KVLen", [16, 24]) # Length of query sequence -@pytest.mark.parametrize("QLen", [16]) +@pytest.mark.parametrize("QLen", [16, 24]) # Folding along the embedding dimensions @pytest.mark.parametrize("EmbFold", [2]) # Folding along the sequence dimensions @@ -361,9 +361,9 @@ def test_attention_cppsim( AType=AType, OType=OType, # Accumulator type configuration - AccQKMatMul=DataType["UINT16"], + AccQKMatMul=DataType["UINT18"], OutQKMatMul=DataType["UINT6"], - AccAVMatMul=DataType["UINT16"], + AccAVMatMul=DataType["UINT18"], OutAVMatMul=OType ) From 096364cdcb39a8ea081a922019c8d7145a4079ad Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 22 Aug 2023 17:11:02 +0200 Subject: [PATCH 31/88] [Attention] Increase test bitwidth to see some more interesting behavior --- .../test_fpgadataflow_attention.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 67f991b4aa..fd64bebc99 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -310,12 +310,12 @@ def make_modelwrapper(self): # Folding along the sequence dimensions @pytest.mark.parametrize("SeqFold", [8]) # Datatypes of queries, keys and values, mask and output -@pytest.mark.parametrize("QType", [DataType["UINT6"]]) -@pytest.mark.parametrize("KType", [DataType["UINT6"]]) -@pytest.mark.parametrize("VType", [DataType["UINT6"]]) -@pytest.mark.parametrize("MType", [DataType["UINT6"]]) 
-@pytest.mark.parametrize("AType", [DataType["UINT6"]]) -@pytest.mark.parametrize("OType", [DataType["UINT6"]]) +@pytest.mark.parametrize("QType", [DataType["UINT8"]]) +@pytest.mark.parametrize("KType", [DataType["UINT8"]]) +@pytest.mark.parametrize("VType", [DataType["UINT8"]]) +@pytest.mark.parametrize("MType", [DataType["UINT8"]]) +@pytest.mark.parametrize("AType", [DataType["UINT8"]]) +@pytest.mark.parametrize("OType", [DataType["UINT8"]]) # Different modes to provide a mask @pytest.mark.parametrize("mask", ["none"]) # This is a slow running fpgadataflow type of test which requires vivado @@ -361,9 +361,9 @@ def test_attention_cppsim( AType=AType, OType=OType, # Accumulator type configuration - AccQKMatMul=DataType["UINT18"], - OutQKMatMul=DataType["UINT6"], - AccAVMatMul=DataType["UINT18"], + AccQKMatMul=DataType["UINT22"], + OutQKMatMul=DataType["UINT8"], + AccAVMatMul=DataType["UINT22"], OutAVMatMul=OType ) From e0f3fcd461509d97e63e450b6398deeb32b308ca Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 23 Aug 2023 17:44:52 +0200 Subject: [PATCH 32/88] [Attention] Add RTL simulation unit test Note: Does currently fail, probably due to float IPs (for softmax operation) missing in verilator simulation. --- src/finn/custom_op/fpgadataflow/templates.py | 4 +- .../test_fpgadataflow_attention.py | 118 ++++++++++++++++-- 2 files changed, 114 insertions(+), 8 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..150ba3b578 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -88,13 +88,15 @@ set config_proj_part "$FPGAPART$" set config_bnnlibdir "$::env(FINN_ROOT)/deps/finn-hlslib" puts "finn-hlslib dir: $config_bnnlibdir" +set config_attentionlibdir "$::env(FINN_ROOT)/deps/attention-hlslib" +puts "attention-hlslib dir: $config_attentionlibdir" set config_customhlsdir "$::env(FINN_ROOT)/custom_hls" puts "custom HLS dir: $config_customhlsdir" set config_toplevelfxn "$TOPFXN$" set config_clkperiod $CLKPERIOD$ open_project $config_proj_name -add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir" +add_files $config_hwsrcdir/top_$TOPFXN$.cpp -cflags "-std=c++14 -I$config_bnnlibdir -I$config_customhlsdir -I$config_attentionlibdir" set_top $config_toplevelfxn open_solution sol1 diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index fd64bebc99..9f85528dba 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -29,6 +29,9 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim # Softmax function on numpy arrays with overflow handling matching the HLS @@ -298,7 +301,7 @@ def make_modelwrapper(self): # Size of query and key embedding dimension -@pytest.mark.parametrize("QKDim", [4, 8, 16]) +@pytest.mark.parametrize("QKDim", [4, 8, 16]) # noqa: Duplicated code fragment # Size of value embedding dimension @pytest.mark.parametrize("VDim", [4, 8, 16]) # Length of key and value sequences @@ -342,9 +345,9 
@@ def test_attention_cppsim( # Type of mask to use: either 'none', 'input', or 'causal' mask ): - # Attention instance simulating in python and generating a matching QONNX - # configuration - attention = MockScaledDotProductAttention( + # Attention instance simulating in python and generating a matching + # QONNX configuration + attention = MockScaledDotProductAttention( # noqa: Duplicated code fragment # Shape configuration QKDim=QKDim, QLen=QLen, @@ -383,13 +386,13 @@ def test_attention_cppsim( model = model.transform(CompileCppSim()) # Compute ground-truth output in software - o_expected = attention(q, k, v) + o_expected = attention(q, k, v) # noqa: Duplicated code fragment # Execute the onnx model to collect the result o_produced = execute_onnx(model, context)["O"] # Log outputs for debugging - print(f"{o_expected}\n", file=open('o_expected.txt', 'w')) - print(f"{o_produced}\n", file=open('o_produced.txt', 'w')) + print(f"{o_expected}\n", file=open('o_expected_cppsim.txt', 'w')) + print(f"{o_produced}\n", file=open('o_produced_cppsim.txt', 'w')) # Save the ONNX model graph for debugging model.save("attention-cppsim.onnx") @@ -402,3 +405,104 @@ def test_attention_cppsim( # Tests rtl simulation of single scaled dot-product attention head def test_fpgadataflow_attention_rtlsim(): pass + + +# Size of query and key embedding dimension +@pytest.mark.parametrize("QKDim", [4]) # noqa: Duplicated code fragment +# Size of value embedding dimension +@pytest.mark.parametrize("VDim", [4]) +# Length of key and value sequences +@pytest.mark.parametrize("KVLen", [16]) +# Length of query sequence +@pytest.mark.parametrize("QLen", [16]) +# Folding along the embedding dimensions +@pytest.mark.parametrize("EmbFold", [2]) +# Folding along the sequence dimensions +@pytest.mark.parametrize("SeqFold", [8]) +# Datatypes of queries, keys and values, mask and output +@pytest.mark.parametrize("QType", [DataType["UINT8"]]) +@pytest.mark.parametrize("KType", [DataType["UINT8"]]) +@pytest.mark.parametrize("VType", [DataType["UINT8"]]) +@pytest.mark.parametrize("MType", [DataType["UINT8"]]) +@pytest.mark.parametrize("AType", [DataType["UINT8"]]) +@pytest.mark.parametrize("OType", [DataType["UINT8"]]) +# Different modes to provide a mask +@pytest.mark.parametrize("mask", ["none"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests rtl simulation of single scaled dot-product attention head +def test_attention_rtlsim( + # Shape configuration + QKDim, # noqa: "Argument should be lowercase" + VDim, # noqa + KVLen, # noqa + QLen, # noqa + # Folding configuration + EmbFold, # noqa + SeqFold, # noqa + # Type configuration + QType, # noqa + KType, # noqa + VType, # noqa + MType, # noqa + AType, # noqa + OType, # noqa + # Type of mask to use: either 'none', 'input', or 'causal' + mask +): + # Attention instance simulating in python and generating a matching + # QONNX configuration + attention = MockScaledDotProductAttention( # noqa: Duplicated code fragment + # Shape configuration + QKDim=QKDim, + QLen=QLen, + VDim=VDim, + KVLen=KVLen, + # Folding configuration + EmbFold=EmbFold, + SeqFold=SeqFold, + # Type configuration + QType=QType, + KType=KType, + VType=VType, + MType=MType, + AType=AType, + OType=OType, + # Accumulator type configuration + AccQKMatMul=DataType["UINT22"], + OutQKMatMul=DataType["UINT8"], + AccAVMatMul=DataType["UINT22"], + OutAVMatMul=OType + ) + + # Create a QONNX model wrapper for testing + model = 
attention.make_modelwrapper() + # Sample some random inputs + q, k, v = attention.make_rand_input() + # Prepare execution context + context = { + "Q": q, "K": k, "V": v, "mask": mask + } + # Set model execution mode to RTL simulation + model = model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Compute ground-truth output in software + o_expected = attention(q, k, v) # noqa: Duplicated code fragment + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["O"] + + # Log outputs for debugging + print(f"{o_expected}\n", file=open('o_expected_rtlsim.txt', 'w')) + print(f"{o_produced}\n", file=open('o_produced_rtlsim.txt', 'w')) + # Save the ONNX model graph for debugging + model.save("attention-rtlsim.onnx") + + # Test whether the expectation and the onnx model output match + assert np.allclose(o_produced, o_expected), "rtlsim exec failed" From 969da0a458c8a848b79306132f36396b4ceace2e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 24 Nov 2023 19:54:36 +0100 Subject: [PATCH 33/88] [Attention] Sketch multi-head splitting and merging custom ops --- .../custom_op/fpgadataflow/attention_heads.py | 752 ++++++++++++++++++ 1 file changed, 752 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/attention_heads.py diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py new file mode 100644 index 0000000000..cc6e12e163 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -0,0 +1,752 @@ +# Operating system stuff, e.g. 
paths +import os +# Python warning subsystem +import warnings +# Numpy math and arrays +import numpy as np + +# Protobuf onnx graph node type +from onnx import NodeProto # noqa +# Helper for creating ONNX nodes +from onnx import helper as oh # noqa + +# Derive custom operators form the FINN base custom op +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType # noqa qonnx dependency is specified +# in setup.cfg as well as in fetch-repos.sh +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper # noqa + + +# Splitting of attention heads (after input projections) custom operator +class SplitMultiHeads(HLSCustomOp): + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + + # Defines attributes which must be present on this node + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = super().get_nodeattr_types() + # Update attributes dictionary for new custom operator + attrs.update({ + # Number of attention heads + "heads": ("i", True, 1), + # Specifies whether the output is packed as a single output tensor + # or split as multiple output tensors + "packed": ("i", True, 1), + # Data type of input and output elements + "dtype": ("s", True, ""), + # Number of input elements to be split + "num_elems": ("i", True, 1), + # Number of inputs to be processed sequentially + "num_inputs": ("ints", True, [1]) + }) + # Return updated attribute dictionary + return attrs + + # Number of attention heads attribute as property for convenience + @property + def heads(self): + return self.get_nodeattr("heads") + + # Packed attribute as property for convenience + @property + def packed(self): + # Note: Converts from int to bool + return bool(self.get_nodeattr("packed")) + + # Datatype attribute as property for convenience + @property + def dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("dtype")] + + # Number of elements attribute as property for convenience + @property + def num_elems(self): + return self.get_nodeattr("num_elems") + + # Number of inputs attribute as property for convenience + @property + def num_inputs(self): + return self.get_nodeattr("num_inputs") + + # Makes an operation compatible with the output shape for shape inference + # Note: Propagates shape forward, i.e., never asks for the shape of the + # output, even if it seems easier. 
+ def make_shape_compatible_op(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + shape = model.get_tensor_shape(node.input[0]) + # Determine the rank of the input tensor to support batched and + # non-batched inputs + rank = len(shape) + # The input shape determines the sequence length + seq, _, dim = shape if (rank == 3) else (shape[0], 1, shape[1]) + # Packed outputs a represented by a reshape operation producing one + # tensor + if self.packed: + # Create a new name for the temporary shape tensor + shape = model.make_new_valueinfo_name() + # Set the target shape of slices heads + model.set_initializer( + shape, np.asarray([self.heads, seq, dim // self.heads]) + ) + # Return a node simulating the shape effect of slicing into + # multi-heads + return oh.make_node( + "Reshape", [node.input[0], shape], [node.output[0]] + ) + # Prepare a dummy input to simulate reordering of batch/head dimension + # to the front + mock_input = model.make_new_valueinfo_name() + # Set the target shape of slices heads + model.set_tensor_shape( + mock_input, [1, seq, dim] if rank == 3 else [seq, dim] + ) + # If the outputs are not packed, the operation is represented as a split + # operation producing number of heads outputs along the last axis + return oh.make_node( + "Split", [mock_input], node.output, num_outputs=self.heads, axis=-1 + ) + + # Infers the datatype of the node output + def infer_node_datatype(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Propagate the type from the input to each output tensor + for o in node.output: + # Slicing simply propagates the type of the input to the output + model.set_tensor_datatype( + o, model.get_tensor_datatype(node.input[0]) + ) + + # Executes multi-head slicing in python + def execute_node(self, context, graph): + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the input out of the execution context + # Note: Shape must be either seq x 1 x dim or seq x dim + inp = context[node.input[0]] + # Packed execution boils down to a reshape of the single input to a + # single output + if self.packed: + # Reshape to separate the heads out of the embedding dimensions, + # finally transpose to heads first layout + out = inp.reshape(inp.shape[0], self.heads, -1).transpose(1, 0, 2) + # Write the output into the execution context + context[node.output[0]] = out + # Split is realized as the split operation of numpy + else: + # Produces multiple outputs as a list + splits = np.split(inp, indices_or_sections=self.heads, axis=-1) + # Correspondence between outputs and splits in order + for o, out in zip(node.output, splits): + # Write the output into the execution context + context[o] = out + + # Verifies the node attributes, inputs and outputs + def verify_node(self): + # TODO: Implement + return [] + + # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # All inputs (there should only be one) have the same type + return self.dtype + + # Gets the datatype of the output at index ind + def get_output_datatype(self, ind=0): + # All outputs will hae the same type, which is the same as the input + return self.dtype + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # There is only one 
input with shape configured as attributes + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems + + # Gets the shape of the output at index ind without folding + def get_normal_output_shape(self, ind=0): + # Packed layout is currently not implemented + assert not self.packed, "Packed multi-heads are not implemented yet" + # All output have the same shape, which correspond to distributing the + # number of input elements to the heads specified as attributes + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems // self.heads + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_input_shape(ind=ind) + + # Gets the shape of the output at index ind with folding + def get_folded_output_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_output_shape(ind=ind) + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded input + *_, elems = self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded output + *_, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. + i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per head + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.heads)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Gets the number of expected output values, i.e. 
how many times read() + # could/should be called on any output stream of this operator + def get_number_output_values(self): + # Elements over all but the last dimension of the output folded along + # the embedding dimension + return np.prod(self.get_folded_output_shape()[:-1]) + + # Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + pass + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Input and output HLS stream datatypes + f"using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + f">;", + f"using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + f">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Deduce the datatype of elements packed into the input stream + f'using IPacked = decltype(IStream{{}}.read());', + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + f'npy2apintstream(', + f' "{code_gen_dir}/in.npy", in, false', + ');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input and output streams + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + "IStream in;", *(f"OStream out{i};" for i in range(self.heads)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Generates the bit-slicing indices string for the ith split of the + # input + def split(i): + # Assemble a C++ indexing/bit-slicing string + return f"({i + 1} * OType::width - 1, {i} * OType::width)" + + # Write the body of the head-splitting top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + f"#pragma HLS pipeline II=1 style=flp", + # Read the next input element from the stream + f"const auto x = in.read();", + # Split the next element from the input stream into the number of + # output elements per head and write into the corresponding stream + *(f"out{i}.write(x{split(i)});" for i in range(self.heads)), + # End of for-loop over repetitions body + f"}}" + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all 
three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Deduce the datatype of elements packed into the output stream + f'using OPacked = decltype(OStream{{}}.read());' + ] + # Generate code for each output stream + for i in range(self.heads): + # Append each reading/writing function call + self.code_gen_dict["$DATAOUTSTREAM$"] += [ + # Generate function call reading from stream into the output + # file + # Note: Outputs are always represented as numpy floats + f'apintstream2npy(', + f' out{i}, {shape}, "{code_gen_dir}/out{i}.npy", false', + f');' + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the head + # splitting operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Input HLS stream + f" IStream &in", ",".join([ + # One output HLS stream per head # noqa: Formatting + f" OStream &out{i}" for i in range(self.heads) + ]), + f")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=in" + ] + # Connect each output stream with an axi stream interface + for i in range(self.heads): + # Add new interface directive for the output stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=out{i}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + +# Merging of attention heads (before output projections) custom operator +class MergeMultiHeads(HLSCustomOp): + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + + # Defines attributes which must be present on this node + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = super().get_nodeattr_types() + # Update attributes dictionary for new custom operator + attrs.update({ + # Number of attention heads + "heads": ("i", True, 1), + # Specifies whether the output is packed as a single output tensor + # or split as multiple output tensors + "packed": ("i", True, 1), + # Data type of input and output elements + "dtype": ("s", True, ""), + # Number of input elements to be split + "num_elems": ("i", True, 1), + # Number of inputs to be processed sequentially + 
"num_inputs": ("ints", True, [1]), + # Output needs to be squeezed + "squeezed": ("i", True, 0), + }) + # Return updated attribute dictionary + return attrs + + # Number of attention heads attribute as property for convenience + @property + def heads(self): + return self.get_nodeattr("heads") + + # Packed attribute as property for convenience + @property + def packed(self): + # Note: Converts from int to bool + return bool(self.get_nodeattr("packed")) + + # Datatype attribute as property for convenience + @property + def dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("dtype")] + + # Number of elements attribute as property for convenience + @property + def num_elems(self): + return self.get_nodeattr("num_elems") + + # Number of inputs attribute as property for convenience + @property + def num_inputs(self): + return self.get_nodeattr("num_inputs") + + # Squeezed output attribute as property for convenience + @property + def squeezed(self): + # Note: Converts from int to bool + return bool(self.get_nodeattr("squeezed")) + + # Makes an operation compatible with the output shape for shape inference + # Note: Propagates shape forward, i.e., never asks for the shape of the + # output, even if it seems easier. + def make_shape_compatible_op(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Squeeze single-element batch dimension from the output? + squeezed = self.squeezed + # Packed inputs a represented by a reshape operation consuming one + # tensor + if self.packed: + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + h, seq, dim = model.get_tensor_shape(node.input[0]) + # Attribute heads must match wht is annotated at the input + assert h == self.heads, \ + f"Shape annotation and number of heads differ: {node.name}" + # Distribute the heads into the embedding dimension + dim = self.heads * dim + # Create a new name for the temporary shape tensor + shape = model.make_new_valueinfo_name() + # Set the target shape of slices heads + model.set_initializer( + shape, np.asarray([seq, dim] if squeezed else [seq, 1, dim]) + ) + # Return a node simulating the shape effect of merging multi-heads + return oh.make_node( + "Reshape", [node.input[0], shape], [node.output[0]] + ) + # If the inputs are not packed, the operation is represented as a concat + # operation consuming number of heads inputs concatenating along the + # last axis + return oh.make_node("Concat", node.input, node.output, axis=-1) + + # Infers the datatype of the node output + def infer_node_datatype(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Merging simply propagates the type of the input to the output + model.set_tensor_datatype( + node.output[0], model.get_tensor_datatype(node.input[0]) + ) + + # Executes multi-head merging in python + def execute_node(self, context, graph): + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the input out of the execution context + # Note: Shape must be heads x seq x dim + inp = context[node.input[0]] + # Packed execution boils down to a reshape of the single input to a + # single output + if self.packed: + # Transpose back into sequence first layout then reintegrate the + # heads via reshape + out = inp.transpose(1, 0, 2).reshape( + inp.shape[1], 1, self.heads * inp.shape[-1] + ) + # Split is realized as the concat operation of numpy + else: + # Collect 
the list of inputs from the execution context and + # concatenate along the last axis + out = np.concatenate([context[i] for i in node.input], axis=-1) + # Reshape to simulate the batch dimensions if it is not present + out = out.reshape(out.shape[0], 1, out.shape[-1]) + # Optionally squeeze the output (remove batch dimension of size 1) + if self.squeezed: + # Squeeze batch dimension via reshape + out = out.reshape(out.shape[0], out.shape[-1]) + # Write the output into the execution context. Force output shape + # which might be squeezed + context[node.output[0]] = out + + # Verifies the node attributes, inputs and outputs + def verify_node(self): + # TODO: Implement + return [] + + # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # All inputs (there should only be one) have the same type + return self.dtype + + # Gets the datatype of the output at index ind + def get_output_datatype(self, ind=0): + # All outputs will have the same type, which is the same as the input + return self.dtype + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # Packed layout is currently not implemented + assert not self.packed, "Packed multi-heads are not implemented yet" + # There is only one input with shape configured as attributes + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems + + # Gets the shape of the output at index ind without folding + def get_normal_output_shape(self, ind=0): + # All output have the same shape, which correspond to collecting the + # number of input elements from the heads specified as attributes + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems * self.heads + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_input_shape(ind=ind) + + # Gets the shape of the output at index ind with folding + def get_folded_output_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_output_shape(ind=ind) + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded input + *_, elems = self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded output + *_, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. 
+ i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per head + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.heads)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Gets the number of expected output values, i.e. how many times read() + # could/should be called on any output stream of this operator + def get_number_output_values(self): + # Elements over all but the last dimension of the output folded along + # the embedding dimension + return np.prod(self.get_folded_output_shape()[:-1]) + + # Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + pass + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Input and output HLS stream datatypes + f"using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + f">;", + f"using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + f">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Deduce the datatype of elements packed into the input stream + f'using IPacked = decltype(IStream{{}}.read());' + ] + # Generate code for each input stream + for i in range(self.heads): + # Append each reading/writing function call + self.code_gen_dict["$READNPYDATA$"] += [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + f'npy2apintstream(', + f' "{code_gen_dir}/in{i}.npy", in{i}, false', + ');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input and output streams + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + "OStream out;", *(f"IStream in{i};" for i in range(self.heads)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + reversed_reads = ", ".join([ + f"in{i}.read()" for i in reversed(range(self.heads)) + ]) + + # Write the body of the head-splitting top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + f"#pragma HLS pipeline II=1 style=flp", + # Read the next input element from each input stream and concatenate + # using the comma operator overload of ap_uint, writing into the + # output stream + f"out.write(({reversed_reads}));" + # End of for-loop over repetitions body + f"}}" + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + # Output data will be stored in 
numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Deduce the datatype of elements packed into the output stream + f'using OPacked = decltype(OStream{{}}.read());', + # Generate function call reading from stream into the output file + # Note: Outputs are always represented as numpy floats + f'apintstream2npy(', + f' out, {shape}, "{code_gen_dir}/out.npy", false', + ');', + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the head + # splitting operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Output HLS stream + f" OStream &out", ",".join([ + # One input HLS stream per head # noqa: Formatting + f" IStream &in{i}" for i in range(self.heads) + ]), + f")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the output stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=out" + ] + # Connect each input stream with an axi stream interface + for i in range(self.heads): + # Add new interface directive for the input stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=in{i}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) From 6abb53799ce298c8812700887d3e601326d4c0fa Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 28 Nov 2023 15:58:52 +0100 Subject: [PATCH 34/88] [Attention] Add cppsim and python tests for head splitting and merging This includes C++ simulation node execution. Addresses some issue regarding the generated bit-slicing HLS string for multi-head merging, which happend to use the element-wise type instead of the packed type. 
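For illustration only, the per-head bit-slicing that the generated HLS string is meant to express can be sketched in plain Python as below; the helper name and the 24-bit packed width are assumptions for the example, not part of the patch:

    # Sketch of the slicing the generated (hi, lo) range string describes,
    # assuming each head occupies one packed output word of packed_width bits
    # (i.e. the packed type width, not the element type width).
    def head_slice_bounds(i, packed_width=24):
        # Head i covers bits [i * packed_width, (i + 1) * packed_width) of the
        # packed input word; HLS range selection takes (high, low) indices.
        high = (i + 1) * packed_width - 1
        low = i * packed_width
        return high, low

    # e.g. with 24-bit packed words:
    # head_slice_bounds(0) -> (23, 0), head_slice_bounds(1) -> (47, 24), ...
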
--- src/finn/custom_op/fpgadataflow/__init__.py | 5 + .../custom_op/fpgadataflow/attention_heads.py | 136 +++++- .../test_fpgadataflow_attention_heads.py | 393 ++++++++++++++++++ 3 files changed, 512 insertions(+), 22 deletions(-) create mode 100644 tests/fpgadataflow/test_fpgadataflow_attention_heads.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 9624710dca..a4bd6a2ce4 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -56,6 +56,9 @@ from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention +from finn.custom_op.fpgadataflow.attention_heads import ( + SplitMultiHeads, MergeMultiHeads +) custom_op = dict() @@ -84,3 +87,5 @@ custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour custom_op["ScaledDotProductAttention"] = ScaledDotProductAttention +custom_op["SplitMultiHeads"] = SplitMultiHeads +custom_op["MergeMultiHeads"] = MergeMultiHeads diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index cc6e12e163..91d09dfecc 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -1,7 +1,5 @@ # Operating system stuff, e.g. paths import os -# Python warning subsystem -import warnings # Numpy math and arrays import numpy as np @@ -42,7 +40,10 @@ def get_nodeattr_types(self): # Number of input elements to be split "num_elems": ("i", True, 1), # Number of inputs to be processed sequentially - "num_inputs": ("ints", True, [1]) + "num_inputs": ("ints", True, [1]), + # Possible execution modes for simulating this node + # Note: Override to support python mode + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}) }) # Return updated attribute dictionary return attrs @@ -127,7 +128,7 @@ def infer_node_datatype(self, model: ModelWrapper): # noqa ) # Executes multi-head slicing in python - def execute_node(self, context, graph): + def _execute_node_python(self, context, graph): # noqa: graph unused # Get the node wrapped by this custom op node = self.onnx_node # Get the input out of the execution context @@ -150,6 +151,50 @@ def execute_node(self, context, graph): # Write the output into the execution context context[o] = out + # Executes multi-head slicing in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the input out of the execution context + # Note: Shape must be either seq x 1 x dim or seq x dim + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"in.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) + # Reshape 
the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + + # Executes multi-head slicing in RTL simulation + def _execute_node_rtlsim(self, context, graph): + raise NotImplementedError("RTL Simulation is not implemented yet") + + # Executes multi-head slicing in simulation (either python c++ or rtl sim) + def execute_node(self, context, graph): + # Get the configured execution mode + mode = self.get_nodeattr("exec_mode") + # Lookup table mapping execution modes to implementing methods + exec_fns = { + "python": self._execute_node_python, + "cppsim": self._execute_node_cppsim, + "rtlsim": self._execute_node_rtlsim, + } + # Select and execute the function by mode string + exec_fns[mode](context, graph) + # Verifies the node attributes, inputs and outputs def verify_node(self): # TODO: Implement @@ -238,7 +283,7 @@ def get_number_output_values(self): # code def global_includes(self): # Currently nothing to include - pass + self.code_gen_dict["$GLOBALS$"] = [] # Generates C++ code of type alias, global constant and macro definitions def defines(self, var): @@ -247,6 +292,10 @@ def defines(self, var): # Input and output element datatypes f"using IType = {self.dtype.get_hls_datatype_str()};", f"using OType = {self.dtype.get_hls_datatype_str()};", + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", # Input and output HLS stream datatypes f"using IStream = hls::stream<" f" ap_uint<{self.get_instream_width()}>" @@ -264,8 +313,6 @@ def read_npy_data(self): # Generate function calls for reading the input files into the input # streams self.code_gen_dict["$READNPYDATA$"] = [ - # Deduce the datatype of elements packed into the input stream - f'using IPacked = decltype(IStream{{}}.read());', # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', @@ -288,7 +335,7 @@ def docompute(self): # input def split(i): # Assemble a C++ indexing/bit-slicing string - return f"({i + 1} * OType::width - 1, {i} * OType::width)" + return f"({i + 1} * OPacked::width - 1, {i} * OPacked::width)" # Write the body of the head-splitting top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ @@ -320,12 +367,8 @@ def dataoutstrm(self): shape = f"""{{{ ','.join((str(i) for i in self.get_folded_output_shape())) }}}""" - # Generate function call for reading from the output stream into the - # output file - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - # Deduce the datatype of elements packed into the output stream - f'using OPacked = decltype(OStream{{}}.read());' - ] + # Start collecting function calls to write the output data stream + self.code_gen_dict["$DATAOUTSTREAM$"] = [] # Generate code for each output stream for i in range(self.heads): # Append each reading/writing function call @@ -411,6 +454,9 @@ def get_nodeattr_types(self): "num_inputs": ("ints", True, [1]), # Output needs to be squeezed "squeezed": ("i", True, 0), + # Possible execution modes for simulating this node + # Note: Override to support python mode + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}) }) # Return updated attribute dictionary return attrs @@ -492,7 +538,7 @@ def infer_node_datatype(self, model: ModelWrapper): # noqa ) # Executes multi-head merging in python - def execute_node(self, context, graph): + def 
_execute_node_python(self, context, graph): # noqa: graph unused # Get the node wrapped by this custom op node = self.onnx_node # Get the input out of the execution context @@ -521,6 +567,53 @@ def execute_node(self, context, graph): # which might be squeezed context[node.output[0]] = out + # Executes multi-head slicing in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + + # Enumerate the node outputs + for i, name in enumerate(node.input): + # Get the input out of the execution context + # Note: Shape must be either 1 x seq x dim or seq x dim + inp = context[name] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=i), \ + f"Input shape mismatch for {name}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=i)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"in{i}.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out.npy")) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Executes multi-head slicing in RTL simulation + def _execute_node_rtlsim(self, context, graph): + raise NotImplementedError("RTL Simulation is not implemented yet") + + # Executes multi-head slicing in simulation (either python c++ or rtl sim) + def execute_node(self, context, graph): + # Get the configured execution mode + mode = self.get_nodeattr("exec_mode") + # Lookup table mapping execution modes to implementing methods + exec_fns = { + "python": self._execute_node_python, + "cppsim": self._execute_node_cppsim, + "rtlsim": self._execute_node_rtlsim, + } + # Select and execute the function by mode string + exec_fns[mode](context, graph) + # Verifies the node attributes, inputs and outputs def verify_node(self): # TODO: Implement @@ -609,7 +702,7 @@ def get_number_output_values(self): # code def global_includes(self): # Currently nothing to include - pass + self.code_gen_dict["$GLOBALS$"] = [] # Generates C++ code of type alias, global constant and macro definitions def defines(self, var): @@ -618,6 +711,10 @@ def defines(self, var): # Input and output element datatypes f"using IType = {self.dtype.get_hls_datatype_str()};", f"using OType = {self.dtype.get_hls_datatype_str()};", + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", # Input and output HLS stream datatypes f"using IStream = hls::stream<" f" ap_uint<{self.get_instream_width()}>" @@ -634,10 +731,7 @@ def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") # Generate function calls for reading the input files into the input # streams - self.code_gen_dict["$READNPYDATA$"] = [ - # Deduce the datatype of elements packed into the input stream - f'using IPacked = decltype(IStream{{}}.read());' - ] + self.code_gen_dict["$READNPYDATA$"] = [] # Generate code for each input stream for i in range(self.heads): # Append each reading/writing function call 
@@ -696,8 +790,6 @@ def dataoutstrm(self): # Generate function call for reading from the output stream into the # output file self.code_gen_dict["$DATAOUTSTREAM$"] = [ - # Deduce the datatype of elements packed into the output stream - f'using OPacked = decltype(OStream{{}}.read());', # Generate function call reading from stream into the output file # Note: Outputs are always represented as numpy floats f'apintstream2npy(', diff --git a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py new file mode 100644 index 0000000000..920fffddab --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py @@ -0,0 +1,393 @@ +# Testing framework +import pytest # noqa pytest dependecy is listed in setup.cfg + +# Use numpy for python execution / computing the ground truth expected values +import numpy as np + +# Protobuf onnx graph node type +from onnx import NodeProto, TensorProto # noqa +# Helper for creating ONNX nodes +from onnx import helper as oh # noqa + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType # noqa qonnx dependency is specified +# in setup.cfg as well as in fetch-repos.sh +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper # noqa +# Execute onnx model graphs +from qonnx.core.onnx_exec import execute_onnx # noqa +# Utility for wrapping onnx graphs and generating tensor of FINN datatypes +from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # noqa + +# Graph transformation giving unique names to each node in a QONNX model graph +from qonnx.transformation.general import GiveUniqueNodeNames # noqa + +# FINN graph transformations for preparing simulation (cppsim or rtlsim) +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + + +# Creates a model executing mult-head splitting +def mock_split_multi_heads(seq, dim, heads, dtype): + # Create a node representing the attention heads splitting operation + node = oh.make_node( + # Operator type from the name of the fpgadataflow hlscustomop + op_type="SplitMultiHeads", + # Specify the domain, i.e., the package to look for the custom operator + # implementation + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + backend="fpgadataflow", + # Just one input + inputs=["inp"], + # Enumerate the outputs + outputs=[f"out{i}" for i in range(heads)], + # Number of attention heads to split the input into + heads=heads, + # Packed output is not supported for now + packed=False, + # Datatype of inputs and outputs + dtype=dtype, + # Number of input elements, i.e., embedding dimension + num_elems=dim, + # Number of embeddings in the whole input sequence / feature map + num_inputs=[seq] + ) + # Construct the input tensor value info + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [seq, dim]) + # Construct output tensor value infos + out = [oh.make_tensor_value_info( + f"out{i}", TensorProto.FLOAT, [seq, dim // heads]) for i in range(heads) + ] + # Create a graph connecting the node to the inputs and outputs + graph = oh.make_graph([node], inputs=[inp], outputs=out, name="split") + 
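    # Editorial note (not part of the patch): for the hypothetical test sizes
    # used below (seq=64, dim=32, heads=4) this graph maps one (64, 32) input
    # to four (64, 8) outputs; the tests check it against the numpy ground
    # truth np.split(inp, heads, axis=-1), which slices along the embedding
    # axis.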
# Wrap the ONNX graph in QONNX model wrapper + model = ModelWrapper(qonnx_make_model(graph, producer_name='split')) + + # Add datatype annotation to the value info of input tensor + model.set_tensor_datatype("inp", DataType[dtype]) + # Add datatype annotation to the value infor of each output tensor + for out in (f"out{i}" for i in range(heads)): + model.set_tensor_datatype(out, DataType[dtype]) + + # Return the wrapped onnx model + return model + + +# Creates a model executing mult-head merging +def mock_merge_multi_heads(seq, dim, heads, dtype): + # Create a node representing the attention heads merging operation + node = oh.make_node( + # Operator type from the name of the fpgadataflow hlscustomop + op_type="MergeMultiHeads", + # Specify the domain, i.e., the package to look for the custom operator + # implementation + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + backend="fpgadataflow", + # Enumerate the inputs + inputs=[f"inp{i}" for i in range(heads)], + # Just one output + outputs=["out"], + # Number of attention heads to split the input into + heads=heads, + # Packed output is not supported for now + packed=False, + # Datatype of inputs and outputs + dtype=dtype, + # Number of input elements, i.e., embedding dimension + num_elems=dim // heads, + # Number of embeddings in the whole input sequence / feature map + num_inputs=[seq], + # Assume squeezed output by default + squeezed=True + ) + # Construct input tensor value infos + inp = [oh.make_tensor_value_info( + f"inp{i}", TensorProto.FLOAT, [seq, dim // heads]) for i in range(heads) + ] + # Construct the output tensor value info + out = oh.make_tensor_value_info("out", TensorProto.FLOAT, [seq, dim]) + # Create a graph connecting the node to the inputs and outputs + graph = oh.make_graph([node], inputs=inp, outputs=[out], name="merge") + # Wrap the ONNX graph in QONNX model wrapper + model = ModelWrapper(qonnx_make_model(graph, producer_name='merge')) + + # Add datatype annotation to the value infor of each input tensor + for inp in (f"inp{i}" for i in range(heads)): + model.set_tensor_datatype(inp, DataType[dtype]) + # Add datatype annotation to the value info of output tensor + model.set_tensor_datatype("out", DataType[dtype]) + + # Return the wrapped onnx model + return model + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests splitting of tensors to multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_split_python(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(seq, dim, heads, dtype) + + # Prepare the execution context + context = {"inp": gen_finn_dt_tensor(DataType[dtype], (seq, dim))} + + # Set model execution mode to python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + + # Compute ground-truth output in software + o_expected = np.split(context["inp"], heads, axis=-1) # noqa: Duplicate + # 
Execute the onnx model to collect the result + o_produced = execute_onnx(model, context) + + # Validate each output separately + for i, out in enumerate((f"out{i}" for i in range(heads))): + # Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests splitting of tensors to multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_split_cppsim(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(seq, dim, heads, dtype) + + # Prepare the execution context + context = {"inp": gen_finn_dt_tensor(DataType[dtype], (seq, dim))} + + # Set model execution mode to Python simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Compute ground-truth output in software + o_expected = np.split(context["inp"], heads, axis=-1) # noqa: Duplicate + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context) + + # Validate each output separately + for i, out in enumerate((f"out{i}" for i in range(heads))): + # Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [4]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests splitting of tensors to multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_split_rtlsim(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(seq, dim, heads, dtype) + + # Prepare the execution context + context = {"inp": gen_finn_dt_tensor(DataType[dtype], (seq, dim))} + + # Set model execution mode to Python simulation + model = model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Compute ground-truth output in software + o_expected = np.split(context["inp"], heads, axis=-1) # noqa: Duplicate + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context) + + # Validate each output separately + for i, out in 
enumerate((f"out{i}" for i in range(heads))): + # Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) # noqa: Duplicate, test setup +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests merging of tensors from multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_merge_python(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_merge_multi_heads(seq, dim, heads, dtype) + + # Create a random input tensor of shape and datatype + def make_inp_tensor(): + return gen_finn_dt_tensor(DataType[dtype], (seq, dim // heads)) + + # Prepare the execution context + context = { + f"inp{i}": make_inp_tensor() for i in range(heads) + } + + # Set model execution mode to Python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + + # Compute ground-truth output in software + o_expected = np.concatenate( + [context[f"inp{i}"] for i in range(heads)], axis=-1 + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare expected to produced output + assert (o_produced == o_expected).all() # noqa + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) # noqa: Duplicate, test setup +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests merging of tensors from multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_merge_cppsim(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_merge_multi_heads(seq, dim, heads, dtype) + + # Create a random input tensor of shape and datatype + def make_inp_tensor(): + return gen_finn_dt_tensor(DataType[dtype], (seq, dim // heads)) + + # Prepare the execution context + context = { + f"inp{i}": make_inp_tensor() for i in range(heads) + } + + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Compute ground-truth output in software + o_expected = np.concatenate( + 
[context[f"inp{i}"] for i in range(heads)], axis=-1 + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare expected to produced output + assert (o_produced == o_expected).all() # noqa + + +# Sequence length to simulate, i.e., number of individual inputs to be split +@pytest.mark.parametrize("seq", [64]) # noqa: Duplicate, test setup +# Number of input elements to be split, i.e., size of embedding dimension +@pytest.mark.parametrize("dim", [32]) +# Number of heads to split the input into +@pytest.mark.parametrize("heads", [4]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["UINT8"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests merging of tensors from multiple attention heads using python mode +# execution +# Note: No actual attention operation is performed +def test_attention_heads_merge_rtlsim(seq, dim, heads, dtype): + # Make dummy model for testing + model = mock_merge_multi_heads(seq, dim, heads, dtype) + + # Create a random input tensor of shape and datatype + def make_inp_tensor(): + return gen_finn_dt_tensor(DataType[dtype], (seq, dim // heads)) + + # Prepare the execution context + context = { + f"inp{i}": make_inp_tensor() for i in range(heads) + } + + # Set model execution mode to RTL simulation + model = model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Compute ground-truth output in software + o_expected = np.concatenate( + [context[f"inp{i}"] for i in range(heads)], axis=-1 + ) + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context)["out"] + + # Compare expected to produced output + assert (o_produced == o_expected).all() # noqa \ No newline at end of file From eb07a2b84026c9f97c5a98408cc35ee80ca6bbee Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 29 Nov 2023 14:51:04 +0100 Subject: [PATCH 35/88] [Attention] Add RTL node execution to head splitting and merging --- .../custom_op/fpgadataflow/attention_heads.py | 194 +++++++++++++++--- .../test_fpgadataflow_attention_heads.py | 10 +- 2 files changed, 168 insertions(+), 36 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 91d09dfecc..30664399bf 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -8,14 +8,17 @@ # Helper for creating ONNX nodes from onnx import helper as oh # noqa -# Derive custom operators form the FINN base custom op -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp # QONNX/FINN datatypes from qonnx.core.datatype import DataType # noqa qonnx dependency is specified # in setup.cfg as well as in fetch-repos.sh # QONNX wrapper to ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper # noqa +# Converts inputs/outputs to/from RTL simulation format +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +# Derive custom operators form the FINN base custom op +from 
finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp + # Splitting of attention heads (after input projections) custom operator class SplitMultiHeads(HLSCustomOp): @@ -179,8 +182,60 @@ def _execute_node_cppsim(self, context, graph): # noqa: graph unused context[name] = out.reshape(self.get_normal_output_shape(ind=i)) # Executes multi-head slicing in RTL simulation - def _execute_node_rtlsim(self, context, graph): - raise NotImplementedError("RTL Simulation is not implemented yet") + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Get the input out of the execution context + # Note: Shape must be either seq x 1 x dim or seq x dim + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Path to store the intermediate input in numpy format + filename = os.path.join(code_gen_dir, "in.npy") + # Save the folded inputs to file to be used by simulation + np.save(filename, inp) + # Start collecting inputs/outputs to the RTL simulation in a dictionary + # Note: Prepare one output list per head + io_dict = { + "inputs": {}, "outputs": {f"out{i}": [] for i in range(self.heads)} + } + # Type and width of the input tensor + dtype = self.get_input_datatype(ind=0) + width = self.get_instream_width(ind=0) + # Convert inputs to RTL simulation format + io_dict["inputs"]["in"] = npy_to_rtlsim_input(filename, dtype, width) + + # Setup PyVerilator simulation of the node + sim = self.get_rtlsim() + # Reset the RTL simulation + super().reset_rtlsim(sim) + super().toggle_clk(sim) + # Run the RTL Simulation + self.rtlsim_multi_io(sim, io_dict) + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Collect the output from RTL simulation + out = io_dict["outputs"][f"out{i}"] + # Type and sizes of the output tensor + dtype = self.get_output_datatype(ind=i) + width = self.get_outstream_width(ind=i) + shape = self.get_folded_output_shape(ind=i) + # Path to store the intermediate numpy file + filename = os.path.join(code_gen_dir, f"out{i}.npy") + # Convert from RTL simulation format to numpy format + rtlsim_output_to_npy( + out, filename, dtype, shape, width, dtype.bitwidth() + ) + # Load the generated output numpy file + out = np.load(filename) + # Reshape the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) # Executes multi-head slicing in simulation (either python c++ or rtl sim) def execute_node(self, context, graph): @@ -274,8 +329,11 @@ def get_ap_int_max_w(self): # could/should be called on any output stream of this operator def get_number_output_values(self): # Elements over all but the last dimension of the output folded along - # the embedding dimension - return np.prod(self.get_folded_output_shape()[:-1]) + # the embedding dimension. Need to count across the number of heads, as + # RTL simulation actually counts individual inputs, not cycles with + # inputs, i.e., producing N heads outputs per cycle in parallel, count + # N outputs per cycle... 
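        # Editorial example (hypothetical numbers, not part of the patch):
        # with heads=4 and a folded output shape of (64, 1, elems) this yields
        # np.prod((64, 1)) * 4 = 256 expected output values, i.e. 4 values per
        # cycle over 64 cycles.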
+ return np.prod(self.get_folded_output_shape()[:-1]) * self.heads # Note: End of shape and datatype utilities @@ -316,17 +374,20 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/in.npy", in, false', - ');' + f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', + f');' ] # Generates C++ code for declaring all streams involved in C++ simulation # for testing def strm_decl(self): # Declare input and output streams + # Note: Assumes stream type aliases to be set in defines self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # Note: Assumes stream type aliases to be set in defines - "IStream in;", *(f"OStream out{i};" for i in range(self.heads)) + # There is one input datastream + f"IStream in_{self.hls_sname()};", + # There is one output datastream per head + *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.heads)) ] # Generates C++ code for calling the computation part of the operator @@ -337,6 +398,10 @@ def split(i): # Assemble a C++ indexing/bit-slicing string return f"({i + 1} * OPacked::width - 1, {i} * OPacked::width)" + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + # Write the body of the head-splitting top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ # Repeat for the number of inputs @@ -345,10 +410,10 @@ def split(i): # Pipeline the steps of this loop f"#pragma HLS pipeline II=1 style=flp", # Read the next input element from the stream - f"const auto x = in.read();", + f"const auto x = in_{self.hls_sname()}.read();", # Split the next element from the input stream into the number of # output elements per head and write into the corresponding stream - *(f"out{i}.write(x{split(i)});" for i in range(self.heads)), + *(f"{out(i)}.write(x{split(i)});" for i in range(self.heads)), # End of for-loop over repetitions body f"}}" ] @@ -369,6 +434,11 @@ def dataoutstrm(self): }}}""" # Start collecting function calls to write the output data stream self.code_gen_dict["$DATAOUTSTREAM$"] = [] + + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + # Generate code for each output stream for i in range(self.heads): # Append each reading/writing function call @@ -377,7 +447,7 @@ def dataoutstrm(self): # file # Note: Outputs are always represented as numpy floats f'apintstream2npy(', - f' out{i}, {shape}, "{code_gen_dir}/out{i}.npy", false', + f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', f');' ] @@ -399,9 +469,9 @@ def blackboxfunction(self): # Note: Assumes stream type aliases to be set in defines f"void {self.onnx_node.name} (", # Input HLS stream - f" IStream &in", ",".join([ + f" IStream &in_{self.hls_sname()}, ", ",".join([ # One output HLS stream per head # noqa: Formatting - f" OStream &out{i}" for i in range(self.heads) + f" OStream &out{i}_{self.hls_sname()}" for i in range(self.heads) ]), f")", # @formatter:off @@ -414,13 +484,13 @@ def pragmas(self): # the top-level function arguments self.code_gen_dict["$PRAGMAS$"] = [ # Connect the input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=in" + f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" ] # Connect each output stream with an axi stream interface for i in range(self.heads): # Add new interface directive for the output stream self.code_gen_dict["$PRAGMAS$"] += [ - f"#pragma HLS INTERFACE axis port=out{i}" + f"#pragma HLS INTERFACE axis 
port=out{i}_{self.hls_sname()}" ] # No block-level I/O protocol for the function return value self.code_gen_dict["$PRAGMAS$"].append( @@ -598,8 +668,66 @@ def _execute_node_cppsim(self, context, graph): # noqa: graph unused ) # Executes multi-head slicing in RTL simulation - def _execute_node_rtlsim(self, context, graph): - raise NotImplementedError("RTL Simulation is not implemented yet") + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + # Start collecting inputs/outputs to the RTL simulation in a dictionary + # Note: Prepare one output list per head + io_dict = { + "inputs": {}, "outputs": {"out": []} + } + + # Enumerate the node outputs + for i, name in enumerate(node.input): + # Get the input out of the execution context + # Note: Shape must be either 1 x seq x dim or seq x dim + inp = context[name] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=i), \ + f"Input shape mismatch for {name}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=i)) + # Path to store the intermediate input in numpy format + filename = os.path.join(code_gen_dir, f"in{i}.npy") + # Save the folded inputs to file to be used by simulation + np.save(filename, inp) + # Type and width of the input tensor + dtype = self.get_input_datatype(ind=i) + width = self.get_instream_width(ind=i) + # Convert inputs to RTL simulation format + io_dict["inputs"][f"in{i}"] = npy_to_rtlsim_input( + filename, dtype, width + ) + + # Setup PyVerilator simulation of the node + sim = self.get_rtlsim() + # Reset the RTL simulation + super().reset_rtlsim(sim) + super().toggle_clk(sim) + # Run the RTL Simulation + self.rtlsim_multi_io(sim, io_dict) + + # Collect the output from RTL simulation + out = io_dict["outputs"]["out"] + # Type and sizes of the output tensor + dtype = self.get_output_datatype(ind=0) + width = self.get_outstream_width(ind=0) + shape = self.get_folded_output_shape(ind=0) + # Path to store the intermediate numpy file + filename = os.path.join(code_gen_dir, "out.npy") + # Convert from RTL simulation format to numpy format + rtlsim_output_to_npy( + out, filename, dtype, shape, width, dtype.bitwidth() + ) + # Load the output numpy file generated by the RTL simulation + out = np.load(filename) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) # Executes multi-head slicing in simulation (either python c++ or rtl sim) def execute_node(self, context, graph): @@ -739,23 +867,27 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/in{i}.npy", in{i}, false', - ');' + f'"{code_gen_dir}/in{i}.npy", in{i}_{self.hls_sname()}, false', + f');' ] # Generates C++ code for declaring all streams involved in C++ simulation # for testing def strm_decl(self): # Declare input and output streams + # Note: Assumes stream type aliases to be set in defines self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # Note: Assumes stream type aliases to be set in defines - "OStream out;", *(f"IStream in{i};" for i in range(self.heads)) + # There is one output stream + f"OStream out_{self.hls_sname()};", + # There is one input 
stream per head + *(f"IStream in{i}_{self.hls_sname()};" for i in range(self.heads)) ] # Generates C++ code for calling the computation part of the operator def docompute(self): reversed_reads = ", ".join([ - f"in{i}.read()" for i in reversed(range(self.heads)) + f"in{i}_{self.hls_sname()}.read()" + for i in reversed(range(self.heads)) ]) # Write the body of the head-splitting top-level function @@ -768,7 +900,7 @@ def docompute(self): # Read the next input element from each input stream and concatenate # using the comma operator overload of ap_uint, writing into the # output stream - f"out.write(({reversed_reads}));" + f"out_{self.hls_sname()}.write(({reversed_reads}));" # End of for-loop over repetitions body f"}}" ] @@ -793,8 +925,8 @@ def dataoutstrm(self): # Generate function call reading from stream into the output file # Note: Outputs are always represented as numpy floats f'apintstream2npy(', - f' out, {shape}, "{code_gen_dir}/out.npy", false', - ');', + f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', + f');', ] # Generates C++ code for saving the output of C++ simulation to a file in @@ -815,9 +947,9 @@ def blackboxfunction(self): # Note: Assumes stream type aliases to be set in defines f"void {self.onnx_node.name} (", # Output HLS stream - f" OStream &out", ",".join([ + f" OStream &out_{self.hls_sname()}, ", ",".join([ # One input HLS stream per head # noqa: Formatting - f" IStream &in{i}" for i in range(self.heads) + f" IStream &in{i}_{self.hls_sname()}" for i in range(self.heads) ]), f")", # @formatter:off @@ -830,13 +962,13 @@ def pragmas(self): # the top-level function arguments self.code_gen_dict["$PRAGMAS$"] = [ # Connect the output stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=out" + f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}" ] # Connect each input stream with an axi stream interface for i in range(self.heads): # Add new interface directive for the input stream self.code_gen_dict["$PRAGMAS$"] += [ - f"#pragma HLS INTERFACE axis port=in{i}" + f"#pragma HLS INTERFACE axis port=in{i}_{self.hls_sname()}" ] # No block-level I/O protocol for the function return value self.code_gen_dict["$PRAGMAS$"].append( diff --git a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py index 920fffddab..a9d4796a14 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py @@ -209,7 +209,7 @@ def test_attention_heads_split_cppsim(seq, dim, heads, dtype): # Number of input elements to be split, i.e., size of embedding dimension @pytest.mark.parametrize("dim", [32]) # Number of heads to split the input into -@pytest.mark.parametrize("heads", [4]) +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) # Datatypes to simulate @pytest.mark.parametrize("dtype", ["UINT8"]) # This is a slow running fpgadataflow type of test which requires vivado @@ -289,7 +289,7 @@ def make_inp_tensor(): o_produced = execute_onnx(model, context)["out"] # Compare expected to produced output - assert (o_produced == o_expected).all() # noqa + assert (o_produced == o_expected).all() # noqa: Unresolved "all" warning # Sequence length to simulate, i.e., number of individual inputs to be split @@ -339,7 +339,7 @@ def make_inp_tensor(): o_produced = execute_onnx(model, context)["out"] # Compare expected to produced output - assert (o_produced == o_expected).all() # noqa + assert (o_produced == o_expected).all() # noqa: Unresolved "all" 
warning # Sequence length to simulate, i.e., number of individual inputs to be split @@ -347,7 +347,7 @@ def make_inp_tensor(): # Number of input elements to be split, i.e., size of embedding dimension @pytest.mark.parametrize("dim", [32]) # Number of heads to split the input into -@pytest.mark.parametrize("heads", [4]) +@pytest.mark.parametrize("heads", [1, 2, 4, 8]) # Datatypes to simulate @pytest.mark.parametrize("dtype", ["UINT8"]) # This is a slow running fpgadataflow type of test which requires vivado @@ -390,4 +390,4 @@ def make_inp_tensor(): o_produced = execute_onnx(model, context)["out"] # Compare expected to produced output - assert (o_produced == o_expected).all() # noqa \ No newline at end of file + assert (o_produced == o_expected).all() # noqa: Unresolved "all" warning From 81f0e0c0a291eb15fb27fabc610024528d0bb3fa Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 30 Nov 2023 18:00:36 +0100 Subject: [PATCH 36/88] [Attention] Simplify shapre inference and extend to rank-3 tenors --- src/finn/custom_op/fpgadataflow/attention.py | 14 +++++++- .../custom_op/fpgadataflow/attention_heads.py | 33 +++++++------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index ec67eabc94..dd73677cd7 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -125,8 +125,20 @@ def is_valid_folding(self): def make_shape_compatible_op(self, model): # Infer the output shape from the input shapes o_shape = (self.get_nodeattr("QLen"), self.get_nodeattr("VDim")) + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + shape = model.get_tensor_shape(node.input[0]) + # Determine the rank of the input tensor to support batched and + # non-batched inputs + rank = len(shape) # Constant operation producing output of given shape - return super().make_const_shape_op(o_shape) + # Note: Rank == 3 allows for propagating yet unrolled multi-attention + # heads. + return super().make_const_shape_op( + (shape[0], *o_shape) if (rank == 3) else o_shape + ) # Infers the output data types and updates the input datatypes of the node def infer_node_datatype(self, model): diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 30664399bf..3e8aebdab3 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -572,31 +572,20 @@ def make_shape_compatible_op(self, model: ModelWrapper): # noqa node = self.onnx_node # Squeeze single-element batch dimension from the output? 
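        # Editorial example (hypothetical numbers, not part of the patch):
        # with num_inputs=[64], heads=4 and num_elems=8, the merged output
        # shape computed below is [64, 32] when squeezed, or [64, 1, 32]
        # otherwise.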
squeezed = self.squeezed + # Assume unpacked inputs by default, here seq sill be the number of + # input feature maps + seq = self.num_inputs # Packed inputs a represented by a reshape operation consuming one # tensor if self.packed: - # Get the shape of the input tensor for inferring the number of - # heads and correctly propagating shapes - h, seq, dim = model.get_tensor_shape(node.input[0]) - # Attribute heads must match wht is annotated at the input - assert h == self.heads, \ - f"Shape annotation and number of heads differ: {node.name}" - # Distribute the heads into the embedding dimension - dim = self.heads * dim - # Create a new name for the temporary shape tensor - shape = model.make_new_valueinfo_name() - # Set the target shape of slices heads - model.set_initializer( - shape, np.asarray([seq, dim] if squeezed else [seq, 1, dim]) - ) - # Return a node simulating the shape effect of merging multi-heads - return oh.make_node( - "Reshape", [node.input[0], shape], [node.output[0]] - ) - # If the inputs are not packed, the operation is represented as a concat - # operation consuming number of heads inputs concatenating along the - # last axis - return oh.make_node("Concat", node.input, node.output, axis=-1) + # Drop the heads-first dimension from packed inputs + seq = self.num_inputs[1:] + # Distribute the heads into the embedding dimension + dim = self.heads * self.num_elems + # Constant operation producing output of given shape + return super().make_const_shape_op( + [*seq, dim] if squeezed else [*seq, 1, dim] + ) # Infers the datatype of the node output def infer_node_datatype(self, model: ModelWrapper): # noqa From 2df11c21ac4842328ad299ee8668ef8a5fba59b8 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 1 Dec 2023 12:04:33 +0100 Subject: [PATCH 37/88] [Attention] Fix code generation, interface names and FIFO depths --- src/finn/custom_op/fpgadataflow/attention.py | 78 +++++++++++++++----- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index dd73677cd7..edc146c9e4 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -90,6 +90,11 @@ def get_nodeattr_types(self): # a mask sent as the fourth input or a causal attention mask which # is generated by the operator itself. 
"mask_mode": ("s", True, "none", {"none", "input", "causal"}), + + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there are three inputs + "inFIFODepths": ("ints", False, [2, 2, 2]), + "outFIFODepths": ("ints", False, [2]), }) # Return updated attribute dictionary return attrs @@ -194,10 +199,12 @@ def infer_node_datatype(self, model): str(mask_dtype), )) # Update the node datatype attribute of the attention mask - self.set_nodeattr("MType", mask_dtype.namke) + self.set_nodeattr("MType", mask_dtype.name) # Set the model output datatype - model.set_tensor_datatype(node.output[0], self.get_nodeattr('OType')) + model.set_tensor_datatype( + node.output[0], DataType[self.get_nodeattr('OType')] + ) # Executes the node def execute_node(self, context, graph): @@ -831,7 +838,7 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/q.npy", q, false', + f' "{code_gen_dir}/q.npy", q_{self.hls_sname()}, false', ');', # Deduce the datatype of elements packed into the key input stream @@ -839,7 +846,7 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/k.npy", k, false', + f' "{code_gen_dir}/k.npy", k_{self.hls_sname()}, false', ');', # Deduce the datatype of elements packed into the value input stream @@ -847,7 +854,7 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/v.npy", v, false', + f' "{code_gen_dir}/v.npy", v_{self.hls_sname()}, false', ');', ] @@ -862,7 +869,7 @@ def read_npy_data(self): # Generate function call reading from file into the input stream # Note: Inputs are always represented as numpy floats f'npy2apintstream(', - f' "{code_gen_dir}/m.npy", m, false', + f' "{code_gen_dir}/m.npy", m_{self.hls_sname()}, false', ');', ] @@ -872,7 +879,10 @@ def strm_decl(self): # Declare input (query, key, value) and output streams self.code_gen_dict["$STREAMDECLARATIONS$"] = [ # Note: Assumes stream type aliases to be set in defines - 'QStream q;', 'KStream k;', 'VStream v;', 'OStream out;' + f"QStream q_{self.hls_sname()};", + f"KStream k_{self.hls_sname()};", + f"VStream v_{self.hls_sname()};", + f"OStream out_{self.hls_sname()};" ] # If the mask is provided as an input, it needs a stream declaration as # well @@ -880,7 +890,7 @@ def strm_decl(self): # Append the mask stream to the declaration list self.code_gen_dict["$STREAMDECLARATIONS$"] += [ # Note: Assumes stream type aliases to be set in defines - 'MStream m;', + f"MStream m_{self.hls_sname()};", ] # Generates C++ code for calling the computation part of the operator @@ -891,11 +901,16 @@ def docompute(self): # threshold parameters # Note: Assumes "Attention" to be aliased appropriate configuration # in defines with. 
- "Attention attention {", - " act_qk_matmul, act_av_matmul, act_a_softmax", - "};", + f"Attention attention {{", + f" act_qk_matmul, act_av_matmul, act_a_softmax", + f"}};", # Connect the attention operator to the input and output streams - "attention(q, k, v, out);", + f"attention(" + f"q_{self.hls_sname()}, " + f"k_{self.hls_sname()}, " + f"v_{self.hls_sname()}, " + f"out_{self.hls_sname()}" + f");", ] # Generates C++ code for reading the output stream and converting back to @@ -920,7 +935,7 @@ def dataoutstrm(self): # Generate function call reading from stream into the output file # Note: Outputs are always represented as numpy floats f'apintstream2npy(', - f' out, {shape}, "{code_gen_dir}/out.npy", false', + f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', ');', ] @@ -939,7 +954,10 @@ def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ # Note: Assumes stream type aliases to be set in defines f"void {self.onnx_node.name} (", - f" QStream &q, KStream &k, VStream &v, OStream &out", + f" QStream &q_{self.hls_sname()}," + f" KStream &k_{self.hls_sname()}," + f" VStream &v_{self.hls_sname()}," + f" OStream &out_{self.hls_sname()}", f")", ] @@ -950,15 +968,37 @@ def pragmas(self): # the top-level function arguments self.code_gen_dict["$PRAGMAS$"] = [ # Connect the query input stream with an axi stream interface - "#pragma HLS INTERFACE axis port=q", + f"#pragma HLS INTERFACE axis port=q_{self.hls_sname()}", # Connect the key input stream with an axi stream interface - "#pragma HLS INTERFACE axis port=k", + f"#pragma HLS INTERFACE axis port=k_{self.hls_sname()}", # Connect the value input stream with an axi stream interface - "#pragma HLS INTERFACE axis port=v", + f"#pragma HLS INTERFACE axis port=v_{self.hls_sname()}", # Connect the output stream with an axi stream interface - "#pragma HLS INTERFACE axis port=out", + f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}", ] # No block-level I/O protocol for the function return value self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + f"#pragma HLS INTERFACE ap_ctrl_none port=return" ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + (f"q_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + (f"k_{self.hls_sname()}", self.get_instream_width_padded(ind=1)), + (f"v_{self.hls_sname()}", self.get_instream_width_padded(ind=2)) + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names From 4c60ce43c7a7ece2e6446a5e4ca499c77d472f89 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 1 Dec 2023 12:13:16 +0100 Subject: [PATCH 38/88] [Attention] Prevent absorbing thresholds into MVAU after forking matmul Absorbing only the first of the consumer branches detaches the rest of the graph. Absorbing all branches is not possible, as these might actually be different thresholds or even other types of ops. 
So in these cases now a matmul-only MVAU followed by standalone thresholding will be generated, keeping the graph intact. --- .../fpgadataflow/convert_to_hw_layers.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 897d714bf8..82e72c2a4c 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1335,7 +1335,11 @@ def apply(self, model): (WMEM * PE * SIMD) is violated.""" ) # see if we have any following thresholds - consumer = model.find_consumer(mm_output) + consumers = model.find_consumers(mm_output) + # Only a single consumer node can be absorbed. Absorbing one + # branch of a forking matmul would lead to detached nodes + # breaking the graph. + consumer = consumers[0] if len(consumers) == 1 else None if consumer is not None and consumer.op_type == "MultiThreshold": # TODO ensure integer thresholds? # create MVTU (i.e. including activation) @@ -1456,7 +1460,11 @@ def apply(self, model): (WMEM * PE * SIMD) is violated.""" ) # see if we have any following thresholds - consumer = model.find_consumer(mm_output) + consumers = model.find_consumers(mm_output) + # Only a single consumer node can be absorbed. Absorbing one + # branch of a forking matmul would lead to detached nodes + # breaking the graph. + consumer = consumers[0] if len(consumers) == 1 else None if consumer is not None and consumer.op_type == "MultiThreshold": # TODO ensure integer thresholds? # create MVTU (i.e. including activation) @@ -1610,7 +1618,11 @@ def apply(self, model): # create node with pe=channels as default pe = channels # see if we have any following thresholds - consumer = model.find_consumer(mm_output) + consumers = model.find_consumers(mm_output) + # Only a single consumer node can be absorbed. Absorbing one + # branch of a forking matmul would lead to detached nodes + # breaking the graph. + consumer = consumers[0] if len(consumers) == 1 else None if consumer is not None and consumer.op_type == "MultiThreshold": # create VVAU (i.e. 
including activation) mt_output = consumer.output[0] From b94aa7a9f0ed291afbc0c6ad00377bba592f2b05 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 1 Dec 2023 12:33:05 +0100 Subject: [PATCH 39/88] [Attention] Add interface name generator to head splitting and merging --- .../custom_op/fpgadataflow/attention_heads.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 3e8aebdab3..87d673c8a9 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -497,6 +497,29 @@ def pragmas(self): "#pragma HLS INTERFACE ap_ctrl_none port=return" ) + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # Just one input stream + (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # One output stream per head + (f"out{i}_{self.hls_sname()}", + self.get_outstream_width_padded(ind=i)) for i in range(self.heads) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names + # Merging of attention heads (before output projections) custom operator class MergeMultiHeads(HLSCustomOp): @@ -963,3 +986,26 @@ def pragmas(self): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE ap_ctrl_none port=return" ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # One input stream per head + (f"in{i}_{self.hls_sname()}", + self.get_instream_width_padded(ind=i)) for i in range(self.heads) + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # Just one output stream + (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)), + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names From f8af260101e47dd2b641cfa361758035c555589a Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 4 Dec 2023 16:00:34 +0100 Subject: [PATCH 40/88] [Attention] Add default FIFO depths to head splitting and merging --- .../custom_op/fpgadataflow/attention_heads.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 87d673c8a9..29a745f4b3 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -27,6 +27,12 @@ def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base super().__init__(onnx_node, **kwargs) + # Need to override the default depths of outputs FIFOs here as these + # depend on the number of heads, which are not known during 
calls to + # get_nodeattr_types. + if not self.get_nodeattr("outFIFODepths"): + self.set_nodeattr("outFIFODepths", [2 for _ in range(self.heads)]) + # Defines attributes which must be present on this node def get_nodeattr_types(self): # Start from parent operator class attributes @@ -46,7 +52,12 @@ def get_nodeattr_types(self): "num_inputs": ("ints", True, [1]), # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}) + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there multiple outputs + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, []), # Default will be override }) # Return updated attribute dictionary return attrs @@ -528,6 +539,12 @@ def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base super().__init__(onnx_node, **kwargs) + # Need to override the default depths of input FIFOs here as these + # depend on the number of heads, which are not known during calls to + # get_nodeattr_types. + if not self.get_nodeattr("inFIFODepths"): + self.set_nodeattr("inFIFODepths", [2 for _ in range(self.heads)]) + # Defines attributes which must be present on this node def get_nodeattr_types(self): # Start from parent operator class attributes @@ -549,7 +566,12 @@ def get_nodeattr_types(self): "squeezed": ("i", True, 0), # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}) + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there multiple inputs + "inFIFODepths": ("ints", False, []), # Default will be override + "outFIFODepths": ("ints", False, [2]), }) # Return updated attribute dictionary return attrs From 0c772a4f8d10cea9193519b8c9f53bf8b21e72a1 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 4 Dec 2023 17:17:23 +0100 Subject: [PATCH 41/88] [Attention] Refactor node execution to have a separate function per mode --- src/finn/custom_op/fpgadataflow/attention.py | 137 ++++++++++--------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index edc146c9e4..3d8b065b7b 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -91,6 +91,10 @@ def get_nodeattr_types(self): # is generated by the operator itself. "mask_mode": ("s", True, "none", {"none", "input", "causal"}), + # Possible execution modes for simulating this node + # Note: Override to support python mode + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there are three inputs "inFIFODepths": ("ints", False, [2, 2, 2]), @@ -206,80 +210,79 @@ def infer_node_datatype(self, model): node.output[0], DataType[self.get_nodeattr('OType')] ) - # Executes the node - def execute_node(self, context, graph): - # The folding configuration must be valid - assert self.is_valid_folding, "Invalid Folding" - - # The execution mode is configured via a node attribute of the - # HLSCustomOp base. This is a string, either "cppsim" or "rtlsim". 
- mode = self.get_nodeattr("exec_mode") + # Executes the attention operator in python mode simulation + def _execute_node_python(self, context, graph): # noqa: graph unused + # TODO: Implement rtlsim mode + raise NotImplementedError( + "exec_mode python is not implemented yet!" + ) + # Executes the attention operator in C++ mode simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node # Input data is stored in numpy files in the code generation dictionary - # TODO: Refactor this, there is too much duplication of mode checking - if mode == "cppsim" or mode == "python": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """ - Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim") - """.format(mode) - ) + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Give names to the ordered node inputs which are always present: This - # serves as a translation table for mapping QONNX node index via the - # execution context name to this internal name for generating i/o files - # TODO: Maybe configure the naming and order of inputs somewhere more - # global? - named_inputs = ["q", "k", "v"] - # The mask is an optional fourth input. While "zip" runs over the - # shortest of its arguments, there would be no mask file generated if - # there is no fourth node input. However, the fourth input might be - # occupied by one of the thresholds instead, which is not an actual - # input and thus the mask ust be appended conditionally here. - if self.get_nodeattr("mask_mode") == "input": - named_inputs.append("m") + # By convention, inputs 0, 1 and 2 correspond to named inputs q, k and v + + # Read the input from the execution context and reshape to match the + # expected folding + q = context[node.input[0]].reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"q.npy"), q) + + # Read the input from the execution context and reshape to match the + # expected folding + k = context[node.input[1]].reshape(self.get_folded_input_shape(ind=1)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"k.npy"), k) + + # Read the input from the execution context and reshape to match the + # expected folding + v = context[node.input[2]].reshape(self.get_folded_input_shape(ind=2)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"v.npy"), v) - # Enumerate and name the node inputs. - for ind, (name, context_name) in enumerate( - zip(named_inputs, self.onnx_node.input) - ): + # Optionally, the mask may be provided as an input as well + if self.get_nodeattr("mask_mode") == "input": # Read the input from the execution context and reshape to match the # expected folding - x = context[context_name].reshape(self.get_folded_input_shape(ind)) - # TODO: Why do the HLSCustomOp and MatrixVectorActivation make a - # copy here? 
- # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"{name}.npy"), x) - - # CPP Simulation of the HLS operator - if mode == "cppsim": - # Execute the precompiled C++ simulation program - # Note: Reusing the HLSCustomOp base implementation is probably fine - super().exec_precompiled_singlenode_model() - # Load the output numpy file generated by the C++ simulation - out = np.load(os.path.join(code_gen_dir, f"out.npy")) - # Reshape the folded output and insert into the execution context - context[self.onnx_node.output[0]] = out.reshape( - self.get_normal_output_shape(ind=0) - ) - # RTL Simulation of the HLS operator - elif mode == "rtlsim": - # TODO: Implement rtlsim mode - raise NotImplementedError( - "exec_mode rtlsim is not implemented yet!" - ) - # All other modes are unsupported - else: - raise Exception( - """ - Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim") - """.format(mode) + m = context[node.input[3]].reshape( + self.get_folded_input_shape(ind=3) ) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"m.npy"), m) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out.npy")) + # Reshape the folded output and insert into the execution context + context[self.onnx_node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Executes the attention operator in RTL mode simulation + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # TODO: Implement rtlsim mode + raise NotImplementedError( + "exec_mode rtlsim is not implemented yet!" 
+ ) + + # Executes the attention operator in simulation (either python, c++ or rtl) + def execute_node(self, context, graph): + # Get the configured execution mode + mode = self.get_nodeattr("exec_mode") + # Lookup table mapping execution modes to implementing methods + exec_fns = { + "python": self._execute_node_python, + "cppsim": self._execute_node_cppsim, + "rtlsim": self._execute_node_rtlsim, + } + # Select and execute the function by mode string + exec_fns[mode](context, graph) # Optional node verification def verify_node(self): From fe483364cc29568dc5a3b8c6a1c6b9b936214159 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 5 Dec 2023 14:02:15 +0100 Subject: [PATCH 42/88] [Attention] Implement python mode node execution --- src/finn/custom_op/fpgadataflow/attention.py | 97 ++++++++++++++++++- .../test_fpgadataflow_attention.py | 17 +--- 2 files changed, 95 insertions(+), 19 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 3d8b065b7b..3d82f93527 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -18,6 +18,25 @@ from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions # noqa +# Softmax function on numpy arrays with overflow handling matching the HLS +# operator +def softmax(x, axis): + # For overflow handling, find the maximum value along axis and place ones at + # each occurrence + max_ones = (x == np.max(x, axis=axis, keepdims=True)).astype(np.float32) + # Count the occurrences of the maximum along the normalization axis + max_counts = np.sum(max_ones, axis=axis, keepdims=True) + # Exponential of the input + exp = np.exp(x) + # Compute the total along axis + total = np.sum(exp, axis=1, keepdims=True) + # Detect overflow of the summation + overflow = np.isinf(total) + # Replace overflows by equal weight given to all instances of the maximum + # input value. For non overflow just compute normal softmax + return np.where(overflow, max_ones / max_counts, exp / total) + + # Scaled Dot-Product Attention Custom Operator # Note: Single head attention class ScaledDotProductAttention(HLSCustomOp): @@ -212,9 +231,81 @@ def infer_node_datatype(self, model): # Executes the attention operator in python mode simulation def _execute_node_python(self, context, graph): # noqa: graph unused - # TODO: Implement rtlsim mode - raise NotImplementedError( - "exec_mode python is not implemented yet!" 
+ # Multithreshold activations + from qonnx.custom_op.general.multithreshold import multithreshold # noqa + + # Get the node wrapped by this custom op + node = self.onnx_node + + # Read the input from the execution context and reshape to match the + # expected folding + q = context[node.input[0]].reshape(self.get_normal_input_shape(ind=0)) + k = context[node.input[1]].reshape(self.get_normal_input_shape(ind=1)) + v = context[node.input[2]].reshape(self.get_normal_input_shape(ind=2)) + + # Quantization activation function following the query and key + # multiplication + def act_qk_matmul(x): + # Only applies if this is specified as a thresholding activation + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # Get the thresholds initializer by name from ordered list of + # optional inputs + thresholds = context[ + self.get_input_name_by_name("thresholds_qk_matmul") + ] + # Applies thresholding activation in python to the input + return multithreshold(x, thresholds) + # If not thresholds, assume identity function + return x + + # Quantization activation function following the softmax normalization + def act_a_softmax(x): + # Only applies if this is specified as a thresholding activation + if self.get_nodeattr("ActASoftmax") == "thresholds": + # Get the thresholds initializer by name from ordered list of + # optional inputs + thresholds = context[ + self.get_input_name_by_name("thresholds_a_softmax") + ] + # Applies thresholding activation in python to the input + return multithreshold(x, thresholds) + # If not thresholds, assume identity function + return x + + # Quantization activation function following the attention and values + # multiplication + def act_av_matmul(x): + # Only applies if this is specified as a thresholding activation + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # Get the thresholds initializer by name from ordered list of + # optional inputs + thresholds = context[ + self.get_input_name_by_name("thresholds_av_matmul") + ] + # Applies thresholding activation in python to the input + return multithreshold(x, thresholds) + # If not thresholds, assume identity function + return x + + # Get the datatype produced by the first matmul after quantization + qk_dtype = DataType[self.get_nodeattr("OutQKMatMul")] + # Scale used to dequantize the qk matrix before computing the softmax in + # floating point + dequant = 1.0 / (qk_dtype.get_num_possible_values() - 1) + + # 1. Queries and keys multiplication followed by quantizing activation + # function + qk = act_qk_matmul(np.matmul(q, k.T)) + # Softmax-normalization of the attention weights followed by quantizing + # activation function + a = act_a_softmax(softmax(dequant * qk, axis=1)) + # 2. 
Attention weights and values matmul followed by quantization + # activation function + out = act_av_matmul(np.matmul(a, v)) + + # Insert the results into the execution context + context[self.onnx_node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) ) # Executes the attention operator in C++ mode simulation diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 9f85528dba..39e46fd21d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -33,24 +33,9 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim - # Softmax function on numpy arrays with overflow handling matching the HLS # operator -def softmax(x, axis): - # For overflow handling, find the maximum value along axis and place ones at - # each occurrence - max_ones = (x == np.max(x, axis=axis, keepdims=True)).astype(np.float32) - # Count the occurrences of the maximum along the normalization axis - max_counts = np.sum(max_ones, axis=axis, keepdims=True) - # Exponential of the input - exp = np.exp(x) - # Compute the total along axis - total = np.sum(exp, axis=1, keepdims=True) - # Detect overflow of the summation - overflow = np.isinf(total) - # Replace overflows by equal weight given to all instances of the maximum - # input value. For non overflow just compute normal softmax - return np.where(overflow, max_ones / max_counts, exp / total) +from finn.custom_op.fpgadataflow.attention import softmax # Python/Numpy model of the scaled dot-product attention operator as it is (will From f241b077d1f2f7bf92c8f4ee25b9a1454766cb26 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 6 Dec 2023 15:39:39 +0100 Subject: [PATCH 43/88] [Attention] Add missing out_bias of thresholds absorbed into attention Properly integrates dequantizer preceding the softmax narmalization as well. --- src/finn/custom_op/fpgadataflow/attention.py | 82 ++++++++++++++++--- .../test_fpgadataflow_attention.py | 35 ++++++-- 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 3d82f93527..144a5d84f8 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -85,6 +85,9 @@ def get_nodeattr_types(self): "OutQKMatMul": ("s", False, "UINT32"), # Activation function type of the Query x Key multiplication "ActQKMatMul": ("s", False, "none", {"none", "thresholds"}), + # Output bias to be applied to the thresholding activation following + # the Query x Key multiplication + "BiasActQKMatMul": ("f", False, 0.0), # Datatype of accumulator elements of the Attention x Value # multiplication @@ -94,8 +97,14 @@ def get_nodeattr_types(self): "OutAVMatMul": ("s", False, "UINT32"), # Activation function type of the Attention x Value multiplication "ActAVMatMul": ("s", False, "none", {"none", "thresholds"}), + # Output bias to be applied to the thresholding activation following + # the Attention x Value multiplication + "BiasActAVMatMul": ("f", False, 0.0), + # Scale factor preceding the softmax normalization to dequantize the + # input + "DequantSoftmax": ("f", False, 1.0), # Datatype of softmax normalization before applying activation or # type cast. THis is called Acc to stick to the naming scheme of the # MatMul operators before. 
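A minimal numpy sketch of how the new DequantSoftmax scale is meant to be applied before the overflow-aware softmax used by python-mode execution (the UINT8 datatype and the example scores are illustrative assumptions; only the scale formula mirrors the tests in this series):

import numpy as np
from qonnx.core.datatype import DataType

# Overflow-aware softmax, mirroring the helper added to attention.py
# (summing along the requested axis)
def softmax(x, axis):
    max_ones = (x == np.max(x, axis=axis, keepdims=True)).astype(np.float32)
    max_counts = np.sum(max_ones, axis=axis, keepdims=True)
    exp = np.exp(x)
    total = np.sum(exp, axis=axis, keepdims=True)
    return np.where(np.isinf(total), max_ones / max_counts, exp / total)

# Hypothetical UINT8 attention scores for 2 queries x 3 keys
qk = np.array([[0, 128, 255], [255, 255, 0]], dtype=np.float32)
# Scale mapping the full UINT8 range onto floats in [0.0, 1.0]
dequant = 1.0 / (DataType["UINT8"].get_num_possible_values() - 1)  # 1 / 255
# Row-wise attention weights, as computed by _execute_node_python
a = softmax(dequant * qk, axis=1)
assert np.allclose(a.sum(axis=1), 1.0)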
@@ -104,6 +113,9 @@ def get_nodeattr_types(self): # Activation function type of the softmax normalization of the # attention weights "ActASoftmax": ("s", False, "none", {"none", "thresholds"}), + # Output bias to be applied to the thresholding activation following + # the softmax normalization of the attention weights + "BiasActASoftmax": ("f", False, 0.0), # Mode used for providing the attention mask: There can be no mask, # a mask sent as the fourth input or a causal attention mask which @@ -253,8 +265,11 @@ def act_qk_matmul(x): thresholds = context[ self.get_input_name_by_name("thresholds_qk_matmul") ] + # Activation value, i.e., bias applied after thresholding + # activation + bias = self.get_nodeattr("BiasActQKMatMul") # Applies thresholding activation in python to the input - return multithreshold(x, thresholds) + return multithreshold(x, thresholds) + bias # If not thresholds, assume identity function return x @@ -267,8 +282,11 @@ def act_a_softmax(x): thresholds = context[ self.get_input_name_by_name("thresholds_a_softmax") ] + # Activation value, i.e., bias applied after thresholding + # activation + bias = self.get_nodeattr("BiasActASoftmax") # Applies thresholding activation in python to the input - return multithreshold(x, thresholds) + return multithreshold(x, thresholds) + bias # If not thresholds, assume identity function return x @@ -282,16 +300,17 @@ def act_av_matmul(x): thresholds = context[ self.get_input_name_by_name("thresholds_av_matmul") ] + # Activation value, i.e., bias applied after thresholding + # activation + bias = self.get_nodeattr("BiasActAVMatMul") # Applies thresholding activation in python to the input - return multithreshold(x, thresholds) + return multithreshold(x, thresholds) + bias # If not thresholds, assume identity function return x - # Get the datatype produced by the first matmul after quantization - qk_dtype = DataType[self.get_nodeattr("OutQKMatMul")] # Scale used to dequantize the qk matrix before computing the softmax in # floating point - dequant = 1.0 / (qk_dtype.get_num_possible_values() - 1) + dequant = self.get_nodeattr("DequantSoftmax") # 1. Queries and keys multiplication followed by quantizing activation # function @@ -358,6 +377,7 @@ def _execute_node_cppsim(self, context, graph): # noqa: graph unused # Executes the attention operator in RTL mode simulation def _execute_node_rtlsim(self, context, graph): # noqa: graph unused # TODO: Implement rtlsim mode + # Note: Cannot even compile this right now due to missing float ips raise NotImplementedError( "exec_mode rtlsim is not implemented yet!" 
) @@ -744,6 +764,13 @@ def prepare_thresholds(ts, length, fold, dtype): ) # Get the datatype of the thresholds thresholds_dtype = DataType[self.get_nodeattr("AccQKMatMul")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActQKMatMul") + # No support for floating-point bias + assert int(bias) == bias, "BiasActQKMatMul must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) # Format the thresholds as C++ array code: QK matmul outputs fold # along the key-value sequence length dimension thresholds_qk_matmul, num = prepare_thresholds( @@ -754,7 +781,12 @@ def prepare_thresholds(ts, length, fold, dtype): # "defines" method act_qk_matmul = "\n".join([ f"ThresholdsActivation<", - f" SeqFold, KVLen/SeqFold, {num}, AccQKMatMul, OutQKMatMul", + f" SeqFold,", + f" KVLen/SeqFold," + f" {num}," + f" AccQKMatMul," + f" OutQKMatMul," + f" {bias}", f">" ]) @@ -767,6 +799,13 @@ def prepare_thresholds(ts, length, fold, dtype): ) # Get the datatype of the thresholds thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActAVMatMul") + # No support for floating-point bias + assert int(bias) == bias, "BiasActAVMatMul must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) # Format the thresholds as C++ array code: AV matmul outputs fold # along the value embedding dimension thresholds_av_matmul, num = prepare_thresholds( @@ -777,7 +816,12 @@ def prepare_thresholds(ts, length, fold, dtype): # "defines" method act_av_matmul = "\n".join([ f"ThresholdsActivation<", - f" EmbFold, VDim/EmbFold, {num}, AccAVMatMul, OutAVMatMul", + f" EmbFold," + f" VDim/EmbFold," + f" {num}," + f" AccAVMatMul," + f" OutAVMatMul," + f" {bias}" f">" ]) @@ -790,6 +834,13 @@ def prepare_thresholds(ts, length, fold, dtype): ) # Get the datatype of the thresholds thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActASoftmax") + # No support for floating-point bias + assert int(bias) == bias, "BiasActASoftmax must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) # Format the thresholds as C++ array code: Softmax outputs fold # along the key-value sequence length dimension thresholds_a_softmax, num = prepare_thresholds( @@ -800,7 +851,12 @@ def prepare_thresholds(ts, length, fold, dtype): # "defines" method act_a_softmax = "\n".join([ f"ThresholdsActivation<", - f" SeqFold, KVLen/SeqFold, {num}, AccASoftmax, AType", + f" SeqFold," + f" KVLen/SeqFold," + f" {num}," + f" AccASoftmax," + f" AType," + f" {bias}", f">" ]) @@ -808,6 +864,10 @@ def prepare_thresholds(ts, length, fold, dtype): with open(f"{code_gen_dir}/params.hpp", "w") as file: # Write lines of C++ code separated by newlines to the file file.write("\n".join([ + # Scale factor preceding the softmax activation function to + # dequantize the input to floating-point representation + f"static const float dequant_softmax =" + f" {self.get_nodeattr('DequantSoftmax')};", # Add type definition and threshold initialization of the # query-key matmul activation f"using ActQKMatMul = {act_qk_matmul};", @@ -995,8 +1055,10 @@ def docompute(self): # threshold parameters # Note: Assumes "Attention" to be 
aliased appropriate configuration # in defines with. + # Note: Assumes parameters to be generated in 'generate_params' and + # made available via include/defines before. f"Attention attention {{", - f" act_qk_matmul, act_av_matmul, act_a_softmax", + f" act_qk_matmul, act_av_matmul, act_a_softmax, dequant_softmax", f"}};", # Connect the attention operator to the input and output streams f"attention(" diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 39e46fd21d..38b3737f04 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -15,7 +15,9 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model # noqa qonnx # dependency is specified in setup.cfg as well as in fetch-repos.sh # QONNX datatypes -from qonnx.core.datatype import DataType, IntType, BaseDataType # noqa +from qonnx.core.datatype import ( # noqa + DataType, IntType, FloatType, BaseDataType +) # Wrapper around ONNX model with some graph manipulation utility from qonnx.core.modelwrapper import ModelWrapper # noqa # Execute onnx model graphs @@ -75,6 +77,9 @@ class MockScaledDotProductAttention: OutQKMatMul: IntType = DataType["UINT4"] # Activation function type of the Query x Key multiplication ActQKMatMul: str = "thresholds" + # Output bias to be applied to the thresholding activation following + # the Query x Key multiplication + BiasActQKMatMul: float = 0.0 # Datatype of accumulator elements of the Attention x Value # multiplication @@ -84,10 +89,24 @@ class MockScaledDotProductAttention: OutAVMatMul: IntType = DataType["UINT4"] # Activation function type of the Attention x Value multiplication ActAVMatMul: str = "thresholds" - + # Output bias to be applied to the thresholding activation following + # the Attention x Value multiplication + BiasActAVMatMul: float = 0.0 + + # Scale factor preceding the softmax normalization to dequantize the + # input + DequantSoftmax: float = 1.0 + # Datatype of softmax normalization before applying activation or + # type cast. THis is called Acc to stick to the naming scheme of the + # MatMul operators before. 
+ # Note: Currently this is ALWAYS floats + AccASoftmax: FloatType = DataType["FLOAT32"] # Activation function type of the softmax normalization of the # attention weights ActASoftmax: str = "thresholds" + # Output bias to be applied to the thresholding activation following + # the softmax normalization of the attention weights + BiasActASoftmax: float = 0.0 # Initializes those parameters which depend on the initial configuration, # which is set by the generated __init__ @@ -172,7 +191,7 @@ def qk_matmul(self, query, key): # function simulating quantization via thresholding def softmax(self, attention): # Input and output scale factors for float <-> int conversion - iscale = 1.0 / (self.OutQKMatMul.get_num_possible_values() - 1) + iscale = self.DequantSoftmax # Scale the inputs, normalize using softmax and activate via thresholds return multithreshold( softmax(iscale * attention, axis=1), self.a_thresholds @@ -352,7 +371,10 @@ def test_attention_cppsim( AccQKMatMul=DataType["UINT22"], OutQKMatMul=DataType["UINT8"], AccAVMatMul=DataType["UINT22"], - OutAVMatMul=OType + OutAVMatMul=OType, + # Dequantizer scale, factor to convert the whole UINT8 range to floats + # in range 0.0 to 1.0 + DequantSoftmax=1.0 / (DataType["UINT8"].get_num_possible_values() - 1) ) # Create a QONNX model wrapper for testing @@ -459,7 +481,10 @@ def test_attention_rtlsim( AccQKMatMul=DataType["UINT22"], OutQKMatMul=DataType["UINT8"], AccAVMatMul=DataType["UINT22"], - OutAVMatMul=OType + OutAVMatMul=OType, + # Dequantizer scale, factor to convert the whole UINT8 range to floats + # in range 0.0 to 1.0 + DequantSoftmax=1.0 / (DataType["UINT8"].get_num_possible_values() - 1) ) # Create a QONNX model wrapper for testing From fbfa27f29af7084d15da063e5c9f3e89b0795128 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 7 Dec 2023 20:06:02 +0100 Subject: [PATCH 44/88] [Attention] Fix python softmax sum axis --- src/finn/custom_op/fpgadataflow/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 144a5d84f8..489477ac72 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -29,7 +29,7 @@ def softmax(x, axis): # Exponential of the input exp = np.exp(x) # Compute the total along axis - total = np.sum(exp, axis=1, keepdims=True) + total = np.sum(exp, axis=axis, keepdims=True) # Detect overflow of the summation overflow = np.isinf(total) # Replace overflows by equal weight given to all instances of the maximum From 8155603b9840546ba25b539014274eac70b0df11 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 8 Dec 2023 20:40:48 +0100 Subject: [PATCH 45/88] [Attention] Introduce ReplicateStream custom operation This is intended to replicate the stream feeding the query, key and value projections, thus producing three replicas of the stream. However, it seems like DuplicateStreams_Batch indeed already supports an arbitrary number of replicas, contrary to what the name and corresponding transformation suggest... Then this new operation might be redundant. 
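For orientation, a minimal sketch of how a ReplicateStream node is instantiated and wrapped into a model, mirroring the new test added below; the shapes, the UINT8 datatype and the choice of three replicas (one each for the Q, K and V projections) are example values only:

from onnx import TensorProto, helper as oh
from qonnx.core.datatype import DataType
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.util.basic import qonnx_make_model

# Three replicas feeding the query, key and value projections (example)
num, num_elems, num_inputs, dtype = 3, 32, [64], "UINT8"
node = oh.make_node(
    # Custom operator type and the package implementing it
    op_type="ReplicateStream", domain="finn.custom_op.fpgadataflow",
    # Execution backend required by HLSCustomOp
    backend="fpgadataflow",
    # One input stream, one output stream per replica
    inputs=["inp"], outputs=[f"out{i}" for i in range(num)],
    # Node attributes as defined by get_nodeattr_types
    num=num, dtype=dtype, num_elems=num_elems, num_inputs=num_inputs,
)
# Input and each output share the same shape
shape = [*num_inputs, num_elems]
inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
out = [oh.make_tensor_value_info(f"out{i}", TensorProto.FLOAT, shape)
       for i in range(num)]
graph = oh.make_graph([node], "replicate", [inp], out)
model = ModelWrapper(qonnx_make_model(graph, producer_name="replicate"))
# Annotate the FINN datatypes of the input and all output tensors
for name in ["inp", *(f"out{i}" for i in range(num))]:
    model.set_tensor_datatype(name, DataType[dtype])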
--- src/finn/custom_op/fpgadataflow/__init__.py | 2 + src/finn/custom_op/fpgadataflow/attention.py | 2 +- .../custom_op/fpgadataflow/attention_heads.py | 16 +- .../fpgadataflow/replicate_stream.py | 478 ++++++++++++++++++ .../test_fpgadataflow_attention_heads.py | 22 +- .../test_fpgadataflow_replicate_stream.py | 200 ++++++++ 6 files changed, 697 insertions(+), 23 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/replicate_stream.py create mode 100644 tests/fpgadataflow/test_fpgadataflow_replicate_stream.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index a4bd6a2ce4..869b9c69ca 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -59,6 +59,7 @@ from finn.custom_op.fpgadataflow.attention_heads import ( SplitMultiHeads, MergeMultiHeads ) +from finn.custom_op.fpgadataflow.replicate_stream import ReplicateStream custom_op = dict() @@ -89,3 +90,4 @@ custom_op["ScaledDotProductAttention"] = ScaledDotProductAttention custom_op["SplitMultiHeads"] = SplitMultiHeads custom_op["MergeMultiHeads"] = MergeMultiHeads +custom_op["ReplicateStream"] = ReplicateStream diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 489477ac72..08d96ead58 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -781,7 +781,7 @@ def prepare_thresholds(ts, length, fold, dtype): # "defines" method act_qk_matmul = "\n".join([ f"ThresholdsActivation<", - f" SeqFold,", + f" SeqFold," f" KVLen/SeqFold," f" {num}," f" AccQKMatMul," diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 29a745f4b3..5e4fd05cb5 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -167,7 +167,7 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # Executes multi-head slicing in C++ simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op + # Get the node wrapped by this custom op # noqa Duplicate node = self.onnx_node # Input data is stored in numpy files in the code generation dictionary code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -194,7 +194,7 @@ def _execute_node_cppsim(self, context, graph): # noqa: graph unused # Executes multi-head slicing in RTL simulation def _execute_node_rtlsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op + # Get the node wrapped by this custom op # noqa Duplicate node = self.onnx_node # Input data is stored in numpy files in the code generation dictionary code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -430,10 +430,10 @@ def out(i): ] # Generates C++ code for reading the output stream and converting back to - # numpy format for testing in C** simulation + # numpy format for testing in C++ simulation def dataoutstrm(self): - # Output data will be stored in numpy files in the code generation - # dictionary + # Output data will be stored in numpy files in the # noqa Duplicate + # code generation dictionary code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") # Get the expected shape of the folded output array formatted as a C++ # vector initializer @@ -510,8 +510,8 @@ def pragmas(self): # Returns the names of input and output interfaces grouped by protocol def 
get_verilog_top_module_intf_names(self): - # Start collecting interface names in a dictionary starting with clock - # and reset + # Start collecting interface names in a dictionary # noqa Duplicate + # starting with clock and reset intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa # AXI stream input interfaces intf_names["s_axis"] = [ @@ -613,8 +613,6 @@ def squeezed(self): # Note: Propagates shape forward, i.e., never asks for the shape of the # output, even if it seems easier. def make_shape_compatible_op(self, model: ModelWrapper): # noqa - # Get the node wrapped by this custom op - node = self.onnx_node # Squeeze single-element batch dimension from the output? squeezed = self.squeezed # Assume unpacked inputs by default, here seq sill be the number of diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py new file mode 100644 index 0000000000..5f0dc59a2c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -0,0 +1,478 @@ +# Operating system stuff, e.g. paths +import os +# Numpy math and arrays +import numpy as np + +# Protobuf onnx graph node type +from onnx import NodeProto # noqa +# Helper for creating ONNX nodes +from onnx import helper as oh # noqa + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType # noqa qonnx dependency is specified +# in setup.cfg as well as in fetch-repos.sh +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper # noqa + +# Converts inputs/outputs to/from RTL simulation format +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +# Derive custom operators form the FINN base custom op +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp + + +# Replicates an input stream to arbitrary many output streams +# See DuplicateStreams_Batch for feeding exactly two streams +class ReplicateStream(HLSCustomOp): + # Initializes the operator given an onnx graph node + def __init__(self, onnx_node, **kwargs): + # Just forward all arguments to the init method of the CustomOp base + super().__init__(onnx_node, **kwargs) + + # Need to override the default depths of outputs FIFOs here as these + # depend on the number of replicas, which are not known during calls to + # get_nodeattr_types. 
+ if not self.get_nodeattr("outFIFODepths"): + self.set_nodeattr("outFIFODepths", [2 for _ in range(self.num)]) + + # Defines attributes which must be present on this node + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = super().get_nodeattr_types() + # Update attributes dictionary for new custom operator + attrs.update({ + # Number of replicas to produce + "num": ("i", True, 1), + # Data type of input and output elements + "dtype": ("s", True, ""), + # Number of input elements received in parallel + "num_elems": ("i", True, 1), + # Number of inputs to be processed sequentially + "num_inputs": ("ints", True, [1]), + # Possible execution modes for simulating this node + # Note: Override to support python mode + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + # Input and output FIFO depths for multi-I/O nodes + # Note: Need to override here as there multiple outputs + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, []), # Default will be override + }) + # Return updated attribute dictionary + return attrs + + # Number of replicas attribute as property for convenience + @property + def num(self): + return self.get_nodeattr("num") + + # Datatype attribute as property for convenience + @property + def dtype(self): + # Note: Converts from string to QONNX data type + return DataType[self.get_nodeattr("dtype")] + + # Number of elements attribute as property for convenience + @property + def num_elems(self): + return self.get_nodeattr("num_elems") + + # Number of inputs attribute as property for convenience + @property + def num_inputs(self): + return self.get_nodeattr("num_inputs") + + # Makes an operation compatible with the output shape for shape inference + # Note: Propagates shape forward, i.e., never asks for the shape of the + # output, even if it seems easier. 
+ def make_shape_compatible_op(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Prepare a dummy input to simulate a large input that can be split into + # the desired number and shapes of outputs + mock_input = model.make_new_valueinfo_name() + # Simulate an input of number of replicas many elements + model.set_tensor_shape( + mock_input, [*self.num_inputs, self.num * self.num_elems] + ) + # Simulate behavior via the standard ONNX split operation + return oh.make_node( + "Split", [mock_input], node.output, num_outputs=self.num, axis=-1 + ) + + # Infers the datatype of the node output + def infer_node_datatype(self, model: ModelWrapper): # noqa + # Get the node wrapped by this custom op + node = self.onnx_node + # Propagate the type from the input to each output tensor + for o in node.output: + # Replicating simply propagates the type of the input to the output + model.set_tensor_datatype( + o, model.get_tensor_datatype(node.input[0]) + ) + + # Executes replicating inputs in python + def _execute_node_python(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Get the input out of the execution context + inp = context[node.input[0]] + # Copy the input into each of the outputs + for o in node.output: + # Insert copy of input into the execution context at output + context[o] = inp + + # Executes replicating inputs in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the input out of the execution context + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"in.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) + # Reshape the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + + # Executes replicating inputs in RTL simulation + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Get the input out of the execution context + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Path to store the intermediate input in numpy format + filename = os.path.join(code_gen_dir, "in.npy") + # Save the folded inputs to file to be used by simulation + np.save(filename, inp) + # Start collecting inputs/outputs to the RTL simulation in a dictionary + # Note: Prepare one output list 
per replica + io_dict = { + "inputs": {}, "outputs": {f"out{i}": [] for i in range(self.num)} + } + # Type and width of the input tensor + dtype = self.get_input_datatype(ind=0) + width = self.get_instream_width(ind=0) + # Convert inputs to RTL simulation format + io_dict["inputs"]["in"] = npy_to_rtlsim_input(filename, dtype, width) + + # Setup PyVerilator simulation of the node + sim = self.get_rtlsim() + # Reset the RTL simulation + super().reset_rtlsim(sim) + super().toggle_clk(sim) + # Run the RTL Simulation + self.rtlsim_multi_io(sim, io_dict) + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Collect the output from RTL simulation + out = io_dict["outputs"][f"out{i}"] + # Type and sizes of the output tensor + dtype = self.get_output_datatype(ind=i) + width = self.get_outstream_width(ind=i) + shape = self.get_folded_output_shape(ind=i) + # Path to store the intermediate numpy file + filename = os.path.join(code_gen_dir, f"out{i}.npy") + # Convert from RTL simulation format to numpy format + rtlsim_output_to_npy( + out, filename, dtype, shape, width, dtype.bitwidth() + ) + # Load the generated output numpy file + out = np.load(filename) + # Reshape the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + + # Executes replicating inputs in simulation (either python c++ or rtl sim) + def execute_node(self, context, graph): + # Get the configured execution mode + mode = self.get_nodeattr("exec_mode") + # Lookup table mapping execution modes to implementing methods + exec_fns = { + "python": self._execute_node_python, + "cppsim": self._execute_node_cppsim, + "rtlsim": self._execute_node_rtlsim, + } + # Select and execute the function by mode string + exec_fns[mode](context, graph) + + # Verifies the node attributes, inputs and outputs + def verify_node(self): + # TODO: Implement + return [] + + # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + + # Gets the datatype of input at index ind + def get_input_datatype(self, ind=0): + # All inputs (there should only be one) have the same type + return self.dtype + + # Gets the datatype of the output at index ind + def get_output_datatype(self, ind=0): + # All outputs will hae the same type, which is the same as the input + return self.dtype + + # Gets the shape of the input at index ind without folding + def get_normal_input_shape(self, ind=0): + # There is only one input with shape configured as attributes + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems + + # Gets the shape of the output at index ind without folding + def get_normal_output_shape(self, ind=0): + # All output have the same shape, which is the same as the input + # Unpack multi-axis inputs list to yield a flat tuple as shape + return *self.num_inputs, self.num_elems + + # Gets the shape of the input at index ind with folding + def get_folded_input_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_input_shape(ind=ind) + + # Gets the shape of the output at index ind with folding + def get_folded_output_shape(self, ind=0): + # No folding for now, normal and folded shape are the same + return self.get_normal_output_shape(ind=ind) + + # Widths of the input data stream of the input at index ind + def get_instream_width(self, ind=0): + # Get the number of bits used to represent the input + i_bits = self.get_input_datatype(ind).bitwidth() + # Parallelism is the 
number of elements in the last dimension of the + # folded input + *_, elems = self.get_folded_input_shape(ind) + # Width of a stream receiving input elements in parallel + return elems * i_bits + + # Widths of the output data stream of the output at index ind + def get_outstream_width(self, ind=0): + # Get the number of bits used to represent the output + o_bits = self.get_output_datatype(ind).bitwidth() + # Parallelism is the number of elements in the last dimension of the + # folded output + *_, elems = self.get_folded_output_shape(ind) + # Width of a stream producing output elements in parallel + return elems * o_bits + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. + i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per replica + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.num)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Gets the number of expected output values, i.e. how many times read() + # could/should be called on any output stream of this operator + def get_number_output_values(self): + # Elements over all but the last dimension of the output folded along + # the embedding dimension. Need to count across the number of replicas, + # as RTL simulation actually counts individual outputs, not cycles with + # outputs, i.e., producing N replica outputs per cycle in parallel, + # count N outputs per cycle... + return np.prod(self.get_folded_output_shape()[:-1]) * self.num + + # Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + self.code_gen_dict["$GLOBALS$"] = [] + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Width of single elements to avoid using ::width attribute which is + # not present for datatype float + f"static constexpr auto ElemWidth = {self.dtype.bitwidth()};" + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", + # Input and output HLS stream datatypes + f"using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + f">;", + f"using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + f">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + f'npy2apintstream(', + f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', + f');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for 
testing + def strm_decl(self): + # Declare input and output streams + # Note: Assumes stream type aliases to be set in defines + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # There is one input datastream + f"IStream in_{self.hls_sname()};", + # There is one output datastream per replica + *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.num)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Write the body of the stream replicating top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + f"#pragma HLS pipeline II=1 style=flp", + # Read the next input element from the stream + f"const auto x = in_{self.hls_sname()}.read();", + # Write the same input element into each output stream + *(f"{out(i)}.write(x);" for i in range(self.num)), + # End of for-loop over repetitions body + f"}}" + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C++ simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the # noqa Duplicate + # code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Start collecting function calls to write the output data stream + self.code_gen_dict["$DATAOUTSTREAM$"] = [] + + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Generate code for each output stream + for i in range(self.num): + # Append each reading/writing function call + self.code_gen_dict["$DATAOUTSTREAM$"] += [ + # Generate function call reading from stream into the output + # file + # Note: Outputs are always represented as numpy floats + f'apintstream2npy(', + f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', + f');' + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. 
actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the stream + # replicating operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Input HLS stream + f" IStream &in_{self.hls_sname()}, ", ",".join([ + # One output HLS stream per replica # noqa: Formatting + f" OStream &out{i}_{self.hls_sname()}" for i in range(self.num) + ]), + f")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" + ] + # Connect each output stream with an axi stream interface + for i in range(self.num): + # Add new interface directive for the output stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=out{i}_{self.hls_sname()}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary # noqa Duplicate + # starting with clock and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # Just one input stream + (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # One output stream per replica + (f"out{i}_{self.hls_sname()}", + self.get_outstream_width_padded(ind=i)) for i in range(self.num) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names diff --git a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py index a9d4796a14..4155d11543 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py @@ -5,22 +5,22 @@ import numpy as np # Protobuf onnx graph node type -from onnx import NodeProto, TensorProto # noqa +from onnx import TensorProto # Helper for creating ONNX nodes -from onnx import helper as oh # noqa +from onnx import helper as oh # QONNX/FINN datatypes from qonnx.core.datatype import DataType # noqa qonnx dependency is specified # in setup.cfg as well as in fetch-repos.sh # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa +from qonnx.core.modelwrapper import ModelWrapper # noqa: qonnx # Execute onnx model graphs -from qonnx.core.onnx_exec import execute_onnx # noqa +from qonnx.core.onnx_exec import execute_onnx # noqa: qonnx # Utility for wrapping onnx graphs and generating tensor of FINN datatypes -from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # noqa +from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # noqa: qonnx # Graph transformation giving unique names to each node in a QONNX model 
graph -from qonnx.transformation.general import GiveUniqueNodeNames # noqa +from qonnx.transformation.general import GiveUniqueNodeNames # noqa: qonnx # FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode @@ -137,8 +137,6 @@ def mock_merge_multi_heads(seq, dim, heads, dtype): @pytest.mark.parametrize("dtype", ["UINT8"]) # This is a slow running fpgadataflow type of test which requires vivado @pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado # Tests splitting of tensors to multiple attention heads using python mode # execution # Note: No actual attention operation is performed @@ -161,7 +159,7 @@ def test_attention_heads_split_python(seq, dim, heads, dtype): # Validate each output separately for i, out in enumerate((f"out{i}" for i in range(heads))): # Compare expected (retrieved by index) to produced (retrieve by key) - assert (o_produced[out] == o_expected[i]).all() + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning # Sequence length to simulate, i.e., number of individual inputs to be split @@ -201,7 +199,7 @@ def test_attention_heads_split_cppsim(seq, dim, heads, dtype): # Validate each output separately for i, out in enumerate((f"out{i}" for i in range(heads))): # Compare expected (retrieved by index) to produced (retrieve by key) - assert (o_produced[out] == o_expected[i]).all() + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning # Sequence length to simulate, i.e., number of individual inputs to be split @@ -242,7 +240,7 @@ def test_attention_heads_split_rtlsim(seq, dim, heads, dtype): # Validate each output separately for i, out in enumerate((f"out{i}" for i in range(heads))): # Compare expected (retrieved by index) to produced (retrieve by key) - assert (o_produced[out] == o_expected[i]).all() + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning # Sequence length to simulate, i.e., number of individual inputs to be split @@ -259,8 +257,6 @@ def test_attention_heads_split_rtlsim(seq, dim, heads, dtype): @pytest.mark.vivado # This is a slow running fpgadataflow type of test which requires vivado @pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado # Tests merging of tensors from multiple attention heads using python mode # execution # Note: No actual attention operation is performed diff --git a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py new file mode 100644 index 0000000000..3f802d5303 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py @@ -0,0 +1,200 @@ +# Testing framework +import pytest # noqa pytest dependecy is listed in setup.cfg + +# Protobuf onnx graph node type +from onnx import TensorProto +# Helper for creating ONNX nodes +from onnx import helper as oh + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType # noqa qonnx dependency is specified +# in setup.cfg as well as in fetch-repos.sh +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper # noqa: qonnx +# Execute onnx model graphs +from qonnx.core.onnx_exec import execute_onnx # noqa: qonnx +# Utility for wrapping onnx graphs and generating tensor of FINN datatypes +from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # noqa + +# Graph transformation giving unique names to each node in a QONNX model graph +from qonnx.transformation.general import GiveUniqueNodeNames # noqa: qonnx + +# FINN 
graph transformations for preparing simulation (cppsim or rtlsim) +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + + +# Creates a model executing stream replication +def mock_split_multi_heads(num_inputs, num_elems, num, dtype): + # Create a node representing the stream replication operation + node = oh.make_node( + # Operator type from the name of the fpgadataflow hlscustomop + op_type="ReplicateStream", + # Specify the domain, i.e., the package to look for the custom operator + # implementation + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from HLSCustomOp + backend="fpgadataflow", + # Just one input + inputs=["inp"], + # Enumerate the outputs + outputs=[f"out{i}" for i in range(num)], + # Number of replicas to produce + num=num, + # Datatype of inputs and outputs + dtype=dtype, + # Number of input elements received in parallel + num_elems=num_elems, + # Number of inputs to be processed sequentially + num_inputs=num_inputs + ) + # Shape of the input and each output + shape = [*num_inputs, num_elems] + # Construct the input tensor value info + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + # Construct output tensor value infos + out = [oh.make_tensor_value_info( + f"out{i}", TensorProto.FLOAT, shape) for i in range(num) + ] + # Create a graph connecting the node to the inputs and outputs + graph = oh.make_graph([node], inputs=[inp], outputs=out, name="replicate") + # Wrap the ONNX graph in QONNX model wrapper + model = ModelWrapper(qonnx_make_model(graph, producer_name='replicate')) + + # Add datatype annotation to the value info of input tensor + model.set_tensor_datatype("inp", DataType[dtype]) + # Add datatype annotation to the value infor of each output tensor + for out in (f"out{i}" for i in range(num)): + model.set_tensor_datatype(out, DataType[dtype]) + + # Return the wrapped onnx model + return model + + +# Number of inputs to be processed sequentially +@pytest.mark.parametrize( # noqa Duplicate + "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] +) +# Number of input elements received in parallel +@pytest.mark.parametrize("num_elems", [32]) +# Number of replicas to produce +@pytest.mark.parametrize("num", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["FLOAT32", "UINT8", "INT4"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +# Tests replicating of tensors/streams to multiple outputs using python mode +# execution +def test_replicate_stream_python(num_inputs, num_elems, num, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + + # Prepare the execution context + context = { + "inp": gen_finn_dt_tensor(DataType[dtype], (*num_inputs, num_elems)) + } + + # Set model execution mode to python simulation + model = model.transform(SetExecMode("python")) + model = model.transform(GiveUniqueNodeNames()) + + # Compute ground-truth output in software + o_expected = [context["inp"] for _ in range(num)] # noqa: Duplicate + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, 
context) + + # Validate each output separately + for i, out in enumerate((f"out{i}" for i in range(num))): + # Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning + + +# Number of inputs to be processed sequentially +@pytest.mark.parametrize( # noqa Duplicate + "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] +) +# Number of input elements received in parallel +@pytest.mark.parametrize("num_elems", [32]) +# Number of replicas to produce +@pytest.mark.parametrize("num", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["FLOAT32", "UINT8", "INT4"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests replicating of tensors/streams to multiple outputs using C++ mode +# execution +def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + + # Prepare the execution context + context = { + "inp": gen_finn_dt_tensor(DataType[dtype], (*num_inputs, num_elems)) + } + + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Compute ground-truth output in software + o_expected = [context["inp"] for _ in range(num)] # noqa: Duplicate + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context) + + # Validate each output separately + for i, out in enumerate((f"out{i}" for i in range(num))): + # Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning + + +# Number of inputs to be processed sequentially +@pytest.mark.parametrize( # noqa Duplicate + "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] +) +# Number of input elements received in parallel +@pytest.mark.parametrize("num_elems", [32]) +# Number of replicas to produce +@pytest.mark.parametrize("num", [1, 2, 4, 8]) +# Datatypes to simulate +@pytest.mark.parametrize("dtype", ["FLOAT32", "UINT8", "INT4"]) +# This is a slow running fpgadataflow type of test which requires vivado +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +# Tests replicating of tensors/streams to multiple outputs using RTL mode +# execution +def test_replicate_stream_rtlsim(num_inputs, num_elems, num, dtype): + # Make dummy model for testing + model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + + # Prepare the execution context + context = { + "inp": gen_finn_dt_tensor(DataType[dtype], (*num_inputs, num_elems)) + } + + # Set model execution mode to RTL simulation + model = model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 10)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Compute ground-truth output in software + o_expected = [context["inp"] for _ in range(num)] # noqa: Duplicate + # Execute the onnx model to collect the result + o_produced = execute_onnx(model, context) + + # Validate each output separately + for i, out in enumerate((f"out{i}" for i in range(num))): + # 
Compare expected (retrieved by index) to produced (retrieve by key) + assert (o_produced[out] == o_expected[i]).all() # noqa: "all" warning From 8ecebb65e2afb4831d78f1f741c1e47100be6077 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 12 Dec 2023 15:06:04 +0100 Subject: [PATCH 46/88] [Attention] Update dtype attribute of multi-heads and stream replication --- .../custom_op/fpgadataflow/attention_heads.py | 42 ++++++++++++++----- .../fpgadataflow/replicate_stream.py | 24 +++++++---- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 5e4fd05cb5..be1f75d51f 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -1,5 +1,7 @@ # Operating system stuff, e.g. paths import os +# Python warning subsystem +import warnings # Numpy math and arrays import numpy as np @@ -132,14 +134,22 @@ def make_shape_compatible_op(self, model: ModelWrapper): # noqa # Infers the datatype of the node output def infer_node_datatype(self, model: ModelWrapper): # noqa - # Get the node wrapped by this custom op + # Get the node wrapped by this custom op # noqa Duplicate node = self.onnx_node + # Test for changing input datatype + if model.get_tensor_datatype(node.input[0]) != self.dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[0]) + # Issue a warning message + warnings.warn( + f"{node.name}: dtype changing from {self.dtype} to {new_dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("dtype", new_dtype.name) # Propagate the type from the input to each output tensor for o in node.output: - # Slicing simply propagates the type of the input to the output - model.set_tensor_datatype( - o, model.get_tensor_datatype(node.input[0]) - ) + # Slicing simply propagates the dtype to the output + model.set_tensor_datatype(o, self.dtype) # Executes multi-head slicing in python def _execute_node_python(self, context, graph): # noqa: graph unused @@ -633,11 +643,23 @@ def make_shape_compatible_op(self, model: ModelWrapper): # noqa # Infers the datatype of the node output def infer_node_datatype(self, model: ModelWrapper): # noqa # Get the node wrapped by this custom op - node = self.onnx_node - # Merging simply propagates the type of the input to the output - model.set_tensor_datatype( - node.output[0], model.get_tensor_datatype(node.input[0]) - ) + node = self.onnx_node # noqa Duplicate + # Test for changing input datatype + if model.get_tensor_datatype(node.input[0]) != self.dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[0]) + # Issue a warning message + warnings.warn( + f"{node.name}: dtype changing from {self.dtype} to {new_dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("dtype", new_dtype.name) + # All inputs must have the same datatype + assert all( + model.get_tensor_datatype(inp) == self.dtype for inp in node.input + ), f"{node.name}: All inputs must have the same datatype" + # Merging simply propagates the datatype to the output + model.set_tensor_datatype(node.output[0], self.dtype) # Executes multi-head merging in python def _execute_node_python(self, context, graph): # noqa: graph unused diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index 5f0dc59a2c..755afda8cd 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ 
b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -1,10 +1,10 @@ # Operating system stuff, e.g. paths import os +# Python warning subsystem +import warnings # Numpy math and arrays import numpy as np -# Protobuf onnx graph node type -from onnx import NodeProto # noqa # Helper for creating ONNX nodes from onnx import helper as oh # noqa @@ -12,7 +12,7 @@ from qonnx.core.datatype import DataType # noqa qonnx dependency is specified # in setup.cfg as well as in fetch-repos.sh # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa +from qonnx.core.modelwrapper import ModelWrapper # noqa qonnx # Converts inputs/outputs to/from RTL simulation format from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -100,14 +100,22 @@ def make_shape_compatible_op(self, model: ModelWrapper): # noqa # Infers the datatype of the node output def infer_node_datatype(self, model: ModelWrapper): # noqa - # Get the node wrapped by this custom op + # Get the node wrapped by this custom op # noqa Duplicate node = self.onnx_node + # Test for changing input datatype + if model.get_tensor_datatype(node.input[0]) != self.dtype: + # Get the new datatype + new_dtype = model.get_tensor_datatype(node.input[0]) + # Issue a warning message + warnings.warn( + f"{node.name}: dtype changing from {self.dtype} to {new_dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("dtype", new_dtype.name) # Propagate the type from the input to each output tensor for o in node.output: - # Replicating simply propagates the type of the input to the output - model.set_tensor_datatype( - o, model.get_tensor_datatype(node.input[0]) - ) + # Replicating simply propagates the dtype to the output + model.set_tensor_datatype(o, self.dtype) # Executes replicating inputs in python def _execute_node_python(self, context, graph): # noqa: graph unused From 1e932da0b9e060f983c053b02f254e33efd68190 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 13 Dec 2023 17:03:59 +0100 Subject: [PATCH 47/88] [Attention] Set default node execution mode to python This is expected for python-mode verification steps during dataflow build to work properly. 
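For context on why this default matters: the python-mode verification steps
presumably execute the model as-is, without first setting exec_mode on every
custom op, so the attribute default has to select the python implementation
already. A minimal sketch of such a verification call (model, tensor name and
input data are placeholders, not taken from this patch):

    from finn.core.onnx_exec import execute_onnx

    # "model", "inp" and "x" are placeholders; this call relies on every
    # fpgadataflow node defaulting to exec_mode "python"
    produced = execute_onnx(model, {"inp": x})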
--- src/finn/custom_op/fpgadataflow/attention.py | 4 +++- src/finn/custom_op/fpgadataflow/attention_heads.py | 8 ++++++-- src/finn/custom_op/fpgadataflow/replicate_stream.py | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 08d96ead58..a18b57fb66 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -124,7 +124,9 @@ def get_nodeattr_types(self): # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + "exec_mode": ( + "s", False, "python", {"", "rtlsim", "cppsim", "python"} + ), # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there are three inputs diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index be1f75d51f..168ee1207e 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -54,7 +54,9 @@ def get_nodeattr_types(self): "num_inputs": ("ints", True, [1]), # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + "exec_mode": ( + "s", False, "python", {"", "rtlsim", "cppsim", "python"} + ), # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there multiple outputs @@ -576,7 +578,9 @@ def get_nodeattr_types(self): "squeezed": ("i", True, 0), # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + "exec_mode": ( + "s", False, "python", {"", "rtlsim", "cppsim", "python"} + ), # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there multiple inputs diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index 755afda8cd..4190897b4a 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -50,7 +50,9 @@ def get_nodeattr_types(self): "num_inputs": ("ints", True, [1]), # Possible execution modes for simulating this node # Note: Override to support python mode - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim", "python"}), + "exec_mode": ( + "s", False, "python", {"", "rtlsim", "cppsim", "python"} + ), # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there multiple outputs "inFIFODepths": ("ints", False, [2]), From db412af21ed2ed3743791e919d013bf018eabb8b Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 15 Dec 2023 16:42:37 +0100 Subject: [PATCH 48/88] [Attention] Introduce 'const' attention mask mode --- src/finn/custom_op/fpgadataflow/attention.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index a18b57fb66..3587dc3bf9 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -118,9 +118,12 @@ def get_nodeattr_types(self): "BiasActASoftmax": ("f", False, 0.0), # Mode used for providing the attention mask: There can be no mask, - # a mask sent as the fourth input or a causal attention mask which - # is generated by the 
operator itself. - "mask_mode": ("s", True, "none", {"none", "input", "causal"}), + # a mask sent as the fourth dynamic input, a mask provided as fourth + # constant input or a causal attention mask which is generated by + # the operator itself. + "mask_mode": ( + "s", True, "none", {"none", "input", "const", "causal"} + ), # Possible execution modes for simulating this node # Note: Override to support python mode From da6f6632672af90c1c6723afacbc8a6542cdaa84 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 9 Jan 2024 17:11:53 +0100 Subject: [PATCH 49/88] [Attention] Add python exec of all and HLS exec of causal mask modes --- src/finn/custom_op/fpgadataflow/attention.py | 59 ++++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 3587dc3bf9..58c4ec308e 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -320,9 +320,46 @@ def act_av_matmul(x): # 1. Queries and keys multiplication followed by quantizing activation # function qk = act_qk_matmul(np.matmul(q, k.T)) + + # Load or create the attention mask for mutually exclusive mask modes + + # There might be no attention mask + if self.get_nodeattr("mask_mode") == "none": + # No mask can be realized by adding zero, which does not change + # anything + mask = 0 + # There might eb a causal attention mask + elif self.get_nodeattr("mask_mode") == "causal": + # A causal mask does not need to be stored and can be generated on + # the fly + mask = np.triu(-np.inf * np.ones_like(qk), 1) + # There might be a constant initializer attention mask + elif self.get_nodeattr("mask_mode") == "const": + # Load the mask initializer from the execution context + mask = context[ + self.get_input_name_by_name("M") + ] + # The attention operator represents attention masks as binary masks, + # but the numpy simulation requires floats with 0 and -inf + mask = np.where(mask, -np.inf * np.ones_like(mask), 0) + # The attention mask might be streamed in as the third input + elif self.get_nodeattr("mask_mode") == "input": + # Load the mask input from the execution context + mask = context[ + self.get_input_name_by_name("M") + ] + # The attention operator represents attention masks as binary masks, + # but the numpy simulation requires floats with 0 and -inf + mask = np.where(mask, -np.inf * np.ones_like(mask), 0) + # All other mask modes are not supported + else: + raise NotImplementedError( + f"Mask Mode {self.get_nodeattr('mask_mode')} is not implemented" + ) + # Softmax-normalization of the attention weights followed by quantizing # activation function - a = act_a_softmax(softmax(dequant * qk, axis=1)) + a = act_a_softmax(softmax(dequant * qk + mask, axis=1)) # 2. 
Attention weights and values matmul followed by quantization # activation function out = act_av_matmul(np.matmul(a, v)) @@ -588,9 +625,8 @@ def get_ap_int_max_w(self): o_bits_max = max((self.get_outstream_width(ind) for ind in range(1))) # Assume no bits to represent the mask, if there is no mask m_bits = 0 - # A mask received as input or produced as causal on the fly has a - # bit-width as well - if self.get_nodeattr("mask_mode") in {"input", "causal"}: + # A mask received as input has a bit-width as well + if self.get_nodeattr("mask_mode") in {"input", "const"}: # Parallelism is the number of elements in the last dimension of the # folded mask input _, _, elems = self.get_folded_input_shape(ind=3) @@ -865,6 +901,15 @@ def prepare_thresholds(ts, length, fold, dtype): f">" ]) + # For now, assume no attention mask as default + # TODO: Add all attention mask modes + attention_mask = "attention::mask::NONE" + + # If a causal mask is specified, set the appropriate tag dispatching + # instance + if self.get_nodeattr("mask_mode") == "causal": + attention_mask = "attention::mask::CAUSAL" + # Open a file to store the thresholds parameters as C++ code with open(f"{code_gen_dir}/params.hpp", "w") as file: # Write lines of C++ code separated by newlines to the file @@ -873,6 +918,8 @@ def prepare_thresholds(ts, length, fold, dtype): # dequantize the input to floating-point representation f"static const float dequant_softmax =" f" {self.get_nodeattr('DequantSoftmax')};", + # Attention mask parameters if "none", "causal" or "const" + f"static constexpr auto attention_mask = {attention_mask};", # Add type definition and threshold initialization of the # query-key matmul activation f"using ActQKMatMul = {act_qk_matmul};", @@ -1070,7 +1117,9 @@ def docompute(self): f"q_{self.hls_sname()}, " f"k_{self.hls_sname()}, " f"v_{self.hls_sname()}, " - f"out_{self.hls_sname()}" + f"out_{self.hls_sname()}," + # TODO: Does not work for "input" mode mask + "attention_mask" f");", ] From d64bb8c5e35904d00e0fc51fd0472b434d10ac30 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 10 Jan 2024 10:57:05 +0100 Subject: [PATCH 50/88] [Attention] Add "const" mask mode to input names --- src/finn/custom_op/fpgadataflow/attention.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 58c4ec308e..0785909dac 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -101,7 +101,6 @@ def get_nodeattr_types(self): # the Attention x Value multiplication "BiasActAVMatMul": ("f", False, 0.0), - # Scale factor preceding the softmax normalization to dequantize the # input "DequantSoftmax": ("f", False, 1.0), @@ -249,7 +248,8 @@ def infer_node_datatype(self, model): # Executes the attention operator in python mode simulation def _execute_node_python(self, context, graph): # noqa: graph unused # Multithreshold activations - from qonnx.custom_op.general.multithreshold import multithreshold # noqa + from qonnx.custom_op.general.multithreshold import \ + multithreshold # noqa # Get the node wrapped by this custom op node = self.onnx_node @@ -731,8 +731,10 @@ def get_input_name_by_name(self, name): # Specify for each input whether it is present or not inputs_present = [ # Note: Primary inputs are always present, the mask is present in - # input mask mode - True, True, True, self.get_nodeattr("mask_mode") == "input", + # "input" or "const" mask mode + True, 
True, True, self.get_nodeattr("mask_mode") in { + "input", "const" + }, ] # Thresholds are present if the activation function is set to From 72f4792effa590f9fd48ed1294d8053e3677ab93 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 10 Jan 2024 14:06:28 +0100 Subject: [PATCH 51/88] [Attention] Add HLS code generation of constant attention masks --- src/finn/custom_op/fpgadataflow/attention.py | 59 ++++++++++++++++---- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 0785909dac..bd57f924c0 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -356,10 +356,13 @@ def act_av_matmul(x): raise NotImplementedError( f"Mask Mode {self.get_nodeattr('mask_mode')} is not implemented" ) - # Softmax-normalization of the attention weights followed by quantizing # activation function - a = act_a_softmax(softmax(dequant * qk + mask, axis=1)) + a = act_a_softmax( + # Note: Reshape after masking, as the mask might broadcast messing + # with the shape + softmax((dequant * qk + mask).reshape(qk.shape), axis=1) + ) # 2. Attention weights and values matmul followed by quantization # activation function out = act_av_matmul(np.matmul(a, v)) @@ -561,7 +564,7 @@ def get_folded_input_shape(self, ind=0): # If the mask is provided as input, it is folded along the second # sequence dimension - if ind == 3 and self.get_nodeattr("mask_mode") == "input": + if ind == 3 and self.get_nodeattr("mask_mode") in {"input", "const"}: # Note: Both dimensions are sequence dimension, the second # corresponds to the KVLen return ilen, seqfold, idim // seqfold @@ -903,14 +906,48 @@ def prepare_thresholds(ts, length, fold, dtype): f">" ]) - # For now, assume no attention mask as default - # TODO: Add all attention mask modes - attention_mask = "attention::mask::NONE" + # Assume no attention mask as a default: Generate C++ code of tag + # instance of "none" mask type + attention_mask = \ + "static const auto attention_mask = attention::mask::NONE" # If a causal mask is specified, set the appropriate tag dispatching # instance if self.get_nodeattr("mask_mode") == "causal": - attention_mask = "attention::mask::CAUSAL" + # Generate C++ code of tag instance of causal mask type + attention_mask = \ + "static const auto attention_mask = attention::mask::CAUSAL" + + # If a constant mask is specified, array code needs to be generated + if self.get_nodeattr("mask_mode") == "const": + # Attention mask type of folded constant mask array + mask_type = "attention::mask::Const" + # Get the constant mask values + mask = model.get_initializer(self.get_input_name_by_name("M")) + # Num should always be equal to QLen + num = mask.shape[-1] + # Partition the mask along the length into folds of parallel + # elements + mask = interleave_matrix_outer_dim_from_partitions( + mask, kvlen // seqfold + ) + # Reshape folded mask adding an outer dimension + mask = mask.reshape(num, kvlen // seqfold, seqfold).squeeze() + # Format the mask as C++ array code + # Note: no packing, no variable name/type declaration + mask = numpy_to_hls_code(mask, DataType["BINARY"], "_", False, True) + # Generate C++ code initializing the constant mask array + attention_mask = f"static const {mask_type} attention_mask = {mask}" + + # Of a mask is provided as input, no object parameters need to be + # generated here + if self.get_nodeattr("mask_mode") == "input": + # Attention mask type of input stream + 
mask_type = "attention::mask::Input" + # Generate C++ code creating an input stream instance for the mask + # Note: This is just a dummy, the real input stream will be part + # of the operator interface + attention_mask = f"static const {mask_type} attention_mask;" # Open a file to store the thresholds parameters as C++ code with open(f"{code_gen_dir}/params.hpp", "w") as file: @@ -921,7 +958,9 @@ def prepare_thresholds(ts, length, fold, dtype): f"static const float dequant_softmax =" f" {self.get_nodeattr('DequantSoftmax')};", # Attention mask parameters if "none", "causal" or "const" - f"static constexpr auto attention_mask = {attention_mask};", + f"{attention_mask};", + # Type alias to the generated attention mask for convenience + f"using AttentionMask = decltype(attention_mask);", # Add type definition and threshold initialization of the # query-key matmul activation f"using ActQKMatMul = {act_qk_matmul};", @@ -1119,9 +1158,9 @@ def docompute(self): f"q_{self.hls_sname()}, " f"k_{self.hls_sname()}, " f"v_{self.hls_sname()}, " - f"out_{self.hls_sname()}," + f"out_{self.hls_sname()}, " # TODO: Does not work for "input" mode mask - "attention_mask" + f"attention_mask" f");", ] From f8c04770ca92e4575a46a7263e861a206acfd616 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 28 Jan 2024 16:14:13 +0100 Subject: [PATCH 52/88] [Attention] Rework minimize_accumulator_width similar to MVAU --- src/finn/custom_op/fpgadataflow/attention.py | 102 +++++++++++++++---- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index bd57f924c0..f2d966a828 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -2,6 +2,9 @@ import os # Python warning subsystem import warnings +# Python builtin math functions: math.ceil returns int, while np.ceil returns +# float +import math # Numpy math and arrays import numpy as np @@ -14,8 +17,11 @@ # in setup.cfg as well as in fetch-repos.sh # QONNX wrapper to ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper # noqa -# Partitions tensor into folded/pe groups -from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions # noqa +# Some utils for working with tensors in qonnx +from qonnx.util.basic import ( # noqa + interleave_matrix_outer_dim_from_partitions, + calculate_matvec_accumulator_range +) # Softmax function on numpy arrays with overflow handling matching the HLS @@ -682,22 +688,82 @@ def minimize_accumulator_width(self, model): # noqa: model is unused KType = DataType[self.get_nodeattr("KType")] # noqa VType = DataType[self.get_nodeattr("VType")] # noqa AType = DataType[self.get_nodeattr("AType")] # noqa - # Minimal and maximal possible results of query-key multiplication - qk_min = self.get_nodeattr("QKDim") * QType.min() * KType.min() - qk_max = self.get_nodeattr("QKDim") * QType.max() * KType.max() - # Minimal and maximal possible results of attention-value multiplication - av_min = self.get_nodeattr("KVLen") * AType.min() * VType.min() - av_max = self.get_nodeattr("KVLen") * AType.max() * VType.max() - # Update the accumulator types to fit the min-max range - # TODO: Is this correct? 
- _qk_max = max(-qk_min, 1 + qk_max) - acc_bit_width = np.log2(_qk_max) + 1 - acc_bit_width = int(np.ceil(acc_bit_width)) - self.set_nodeattr("AccQKMatMul", f"UINT{acc_bit_width}") - _av_max = max(-av_min, 1 + av_max) - acc_bit_width = np.log2(_av_max) + 1 - acc_bit_width = int(np.ceil(acc_bit_width)) - self.set_nodeattr("AccAVMatMul", f"UINT{acc_bit_width}") + + # Compute the worst-case upper and lower bounds of the accumulator range + lower_worst = QType.min() * np.ones(self.get_normal_input_shape(0)) + lower_range = calculate_matvec_accumulator_range(lower_worst, KType) + upper_worst = QType.max() * np.ones(self.get_normal_input_shape(0)) + upper_range = calculate_matvec_accumulator_range( # noqa: Duplicate + upper_worst, KType + ) + # Minimum and maximum values of the range + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + # Unsigned accumulator range + if acc_min >= 0: + # Number of bits necessary to represent the maximum value of the + # range. Some values between 0 and acc_min might be unused. + bitwidth = math.ceil(np.log2(acc_max + 1)) + # New unsigned accumulator datatype of this bitwidth + AccQKMatMul = DataType[f"UINT{bitwidth}"] # noqa + # Signed accumulator range + else: + # Maximum absolute value which needs to be represented + acc_max = max(-acc_min, 1 + acc_max) + # Number of bits necessary to represent the maximum value of the + # range. Some values on one of the ends might remain unused. + bitwidth = math.ceil(np.log2(acc_max) + 1) + # New signed accumulator datatype of this bitwidth + AccQKMatMul = DataType[f"INT{bitwidth}"] # noqa + # Update the accumulator datatype attribute + self.set_nodeattr("AccQKMatMul", AccQKMatMul.name) + # If there is no activation function following the accumulator, the + # output type needs to be adjusted as well + if self.get_nodeattr("ActQKMatMul") == "none": + # Update the output datatype attribute to the same type as the + # accumulator + self.set_nodeattr("OutQKMatMul", AccQKMatMul.name) + + # Compute the worst-case upper and lower bounds of the accumulator range + lower_worst = AType.min() * np.ones(self.get_normal_attention_shape(0)) + lower_range = calculate_matvec_accumulator_range(lower_worst, VType) + upper_worst = AType.max() * np.ones(self.get_normal_attention_shape(0)) + upper_range = calculate_matvec_accumulator_range( # noqa: Duplicate + upper_worst, VType + ) + # Minimum and maximum values of the range + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + # Unsigned accumulator range + if acc_min >= 0: + # Number of bits necessary to represent the maximum value of the + # range. Some values between 0 and acc_min might be unused. + bitwidth = math.ceil(np.log2(acc_max + 1)) + # New unsigned accumulator datatype of this bitwidth + AccAVMatMul = DataType[f"UINT{bitwidth}"] # noqa + # Signed accumulator range + else: + # Maximum absolute value which needs to be represented + acc_max = max(-acc_min, 1 + acc_max) + # Number of bits necessary to represent the maximum value of the + # range. Some values on one of the ends might remain unused. 
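# Worked example with illustrative datatypes (assumed values, not from this
# patch): for AType = UINT4 (max 15), VType = INT4 (range [-8, 7]) and a
# reduction over 16 elements, the worst case is roughly
# [16 * 15 * (-8), 16 * 15 * 7] = [-1920, 1680], hence
# acc_max = max(1920, 1 + 1680) = 1920 and
# bitwidth = math.ceil(np.log2(1920) + 1) = 12, i.e. an INT12 accumulator.
# (Note: both acc_max assignments above take max(upper_range) twice;
# max(lower_range) is presumably intended for one of the two arguments.)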
+ bitwidth = math.ceil(np.log2(acc_max) + 1) + # New signed accumulator datatype of this bitwidth + AccAVMatMul = DataType[f"INT{bitwidth}"] # noqa + # Update the accumulator datatype attribute + self.set_nodeattr("AccAVMatMul", AccAVMatMul.name) + # If there is no activation function following the accumulator, the + # output type needs to be adjusted as well + if self.get_nodeattr("ActAVMatMul") == "none": + # Update the output datatype attribute to the same type as the + # accumulator + self.set_nodeattr("OutAVMatMul", AccQKMatMul.name) + # # The output type of the whole operator is the same as the output + # # type of the last MatMul + # TODO: This currently breaks MergeMultiHeads via + # MinimizeAccumulatorWidth, which re-infers datatypes after + # each custom op instead of once after traversing the whole graph. + # self.set_nodeattr("OType", AccQKMatMul.name) # Gets the number of expected output values, i.e. how many times read() # could/should be called on the output stream of this operator From 5311b9c90419494a403e6f0a4cac7451fe3bf4f2 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 28 Jan 2024 16:21:29 +0100 Subject: [PATCH 53/88] [Attention] Set HLS_CONSTEXPR_ENABLE in code generation templates This is needed by some components of the attention-hlslib. If it later turns out to interfere with other parts of finn, a workaround could possibly be found. --- src/finn/custom_op/fpgadataflow/templates.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 150ba3b578..6d9fcc288b 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -29,6 +29,7 @@ # template for single node execution docompute_template = """ +#define HLS_CONSTEXPR_ENABLE #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" @@ -62,6 +63,7 @@ # cpp file ipgen_template = """ +#define HLS_CONSTEXPR_ENABLE #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "bnn-library.h" From 0a95208314c15bf2f48dfaa3373da5a9cb55de0c Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sun, 28 Jan 2024 17:23:17 +0100 Subject: [PATCH 54/88] [Attention] Improve stability of softmax by subtracting row-wise maximum --- src/finn/custom_op/fpgadataflow/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index f2d966a828..20139aa981 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -33,7 +33,7 @@ def softmax(x, axis): # Count the occurrences of the maximum along the normalization axis max_counts = np.sum(max_ones, axis=axis, keepdims=True) # Exponential of the input - exp = np.exp(x) + exp = np.exp(x - np.max(x, axis=axis)) # Compute the total along axis total = np.sum(exp, axis=axis, keepdims=True) # Detect overflow of the summation From f84e5f4442b6af49343e3d9f58d90248f5951fa9 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 7 Feb 2024 17:45:44 +0100 Subject: [PATCH 55/88] [Attention] Fix threshold tensor code generation --- src/finn/custom_op/fpgadataflow/attention.py | 119 +++++++++++-------- 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 20139aa981..72840b2722 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ 
b/src/finn/custom_op/fpgadataflow/attention.py @@ -472,13 +472,6 @@ def get_input_datatype(self, ind=0): # activation maps from AccQKMatMul to OutQKMatMul inputs += ["AccQKMatMul"] - # If there is a thresholding activation for the second matmul, it will - # have a type as well - if self.get_nodeattr("ActAVMatMul") == "thresholds": - # The thresholds will always be of the accumulator type as the - # activation maps from AccAVMatMul to OutAVMatMul - inputs += ["AccAVMatMul"] - # If there is a thresholding activation for the softmax normalization, # it will have a type as well if self.get_nodeattr("ActASoftmax") == "thresholds": @@ -486,6 +479,13 @@ def get_input_datatype(self, ind=0): # threshold type of the softmax, these are currently always floats inputs += ["AccASoftmax"] + # If there is a thresholding activation for the second matmul, it will + # have a type as well + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # The thresholds will always be of the accumulator type as the + # activation maps from AccAVMatMul to OutAVMatMul + inputs += ["AccAVMatMul"] + # Look up datatype name in attributes and convert to DataType return DataType[self.get_nodeattr(f"{inputs[ind]}")] @@ -525,12 +525,6 @@ def get_normal_input_shape(self, ind=0): # TODO: This is just a dummy shape inputs_shapes += [(0, 0)] - # If there is a thresholding activation for the second matmul, these - # will be the next input index after the (optional) first thresholds - if self.get_nodeattr("ActAVMatMul") == "thresholds": - # TODO: This is just a dummy shape - inputs_shapes += [(0, 0)] - # If there is a thresholding activation for the softmax normalization, # these will be the next (and last) input index after the (optional) # second thresholds @@ -538,6 +532,12 @@ def get_normal_input_shape(self, ind=0): # TODO: This is just a dummy shape inputs_shapes += [(0, 0)] + # If there is a thresholding activation for the second matmul, these + # will be the next input index after the (optional) first thresholds + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # TODO: This is just a dummy shape + inputs_shapes += [(0, 0)] + # Get the shape by indexing into the ordered list of all inputs return inputs_shapes[ind] @@ -757,7 +757,7 @@ def minimize_accumulator_width(self, model): # noqa: model is unused if self.get_nodeattr("ActAVMatMul") == "none": # Update the output datatype attribute to the same type as the # accumulator - self.set_nodeattr("OutAVMatMul", AccQKMatMul.name) + self.set_nodeattr("OutAVMatMul", AccAVMatMul.name) # # The output type of the whole operator is the same as the output # # type of the last MatMul # TODO: This currently breaks MergeMultiHeads via @@ -789,8 +789,8 @@ def get_input_name_by_name(self, name): # Ordered names of the (optional) threshold inputs thresholds = [ "thresholds_qk_matmul", + "thresholds_a_softmax", "thresholds_av_matmul", - "thresholds_a_softmax" ] # Ordered names of primary query, key, value inputs and optional mask @@ -810,8 +810,8 @@ def get_input_name_by_name(self, name): # thresholds inputs_present.extend([ self.get_nodeattr("ActQKMatMul") == "thresholds", - self.get_nodeattr("ActAVMatMul") == "thresholds", - self.get_nodeattr("ActASoftmax") == "thresholds" + self.get_nodeattr("ActASoftmax") == "thresholds", + self.get_nodeattr("ActAVMatMul") == "thresholds" ]) # Filter the ordered list of input names for those which are actually @@ -888,6 +888,9 @@ def prepare_thresholds(ts, length, fold, dtype): thresholds_qk_matmul, num = prepare_thresholds( thresholds, kvlen, 
seqfold, thresholds_dtype ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() # Replace default pass-through activation by thresholding activation # Note: Relies on type and shape definitions generated by the # "defines" method @@ -898,77 +901,89 @@ def prepare_thresholds(ts, length, fold, dtype): f" {num}," f" AccQKMatMul," f" OutQKMatMul," - f" {bias}", + f" {bias}," + # Note: Not sure why the default comp::less does not work... + f" comp::less_equal<{dtype_str}, {dtype_str}>", f">" ]) - # Attention-value matmul can have an optional activation function set to - # thresholding activations via node attribute - if self.get_nodeattr("ActAVMatMul") == "thresholds": + # Softmax can have an optional activation function set to thresholding + # activations via node attribute + if self.get_nodeattr("ActASoftmax") == "thresholds": # In this case there will be a thresholds parameter initializer thresholds = model.get_initializer( - self.get_input_name_by_name("thresholds_av_matmul") + self.get_input_name_by_name("thresholds_a_softmax") ) # Get the datatype of the thresholds - thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] + thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] # Activation value, i.e., bias applied after thresholding activation - bias = self.get_nodeattr("BiasActAVMatMul") + bias = self.get_nodeattr("BiasActASoftmax") # No support for floating-point bias - assert int(bias) == bias, "BiasActAVMatMul must be integer" + assert int(bias) == bias, "BiasActASoftmax must be integer" # Convert the bias to integer representation, so it can be used as a # template argument bias = int(bias) - # Format the thresholds as C++ array code: AV matmul outputs fold - # along the value embedding dimension - thresholds_av_matmul, num = prepare_thresholds( - thresholds, vdim, embfold, thresholds_dtype + # Format the thresholds as C++ array code: Softmax outputs fold + # along the key-value sequence length dimension + thresholds_a_softmax, num = prepare_thresholds( + thresholds, kvlen, seqfold, thresholds_dtype ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() # Replace default pass-through activation by thresholding activation # Note: Relies on type and shape definitions generated by the # "defines" method - act_av_matmul = "\n".join([ + act_a_softmax = "\n".join([ f"ThresholdsActivation<", - f" EmbFold," - f" VDim/EmbFold," + f" SeqFold," + f" KVLen/SeqFold," f" {num}," - f" AccAVMatMul," - f" OutAVMatMul," - f" {bias}" + f" AccASoftmax," + f" AType," + f" {bias}," + # Note: Not sure why the default comp::less does not work... 
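# (Presumably because the QONNX multithreshold reference counts thresholds
# inclusively, i.e. x >= T, while a strict comparison excludes inputs that
# are exactly equal to a threshold and thus causes off-by-one mismatches
# against the python reference.)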
+ f" comp::less_equal<{dtype_str}, {dtype_str}>", f">" ]) - # Softmax can have an optional activation function set to thresholding - # activations via node attribute - if self.get_nodeattr("ActASoftmax") == "thresholds": + # Attention-value matmul can have an optional activation function set to + # thresholding activations via node attribute + if self.get_nodeattr("ActAVMatMul") == "thresholds": # In this case there will be a thresholds parameter initializer thresholds = model.get_initializer( - self.get_input_name_by_name("thresholds_a_softmax") + self.get_input_name_by_name("thresholds_av_matmul") ) # Get the datatype of the thresholds - thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] + thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] # Activation value, i.e., bias applied after thresholding activation - bias = self.get_nodeattr("BiasActASoftmax") + bias = self.get_nodeattr("BiasActAVMatMul") # No support for floating-point bias - assert int(bias) == bias, "BiasActASoftmax must be integer" + assert int(bias) == bias, "BiasActAVMatMul must be integer" # Convert the bias to integer representation, so it can be used as a # template argument bias = int(bias) - # Format the thresholds as C++ array code: Softmax outputs fold - # along the key-value sequence length dimension - thresholds_a_softmax, num = prepare_thresholds( - thresholds, kvlen, seqfold, thresholds_dtype + # Format the thresholds as C++ array code: AV matmul outputs fold + # along the value embedding dimension + thresholds_av_matmul, num = prepare_thresholds( + thresholds, vdim, embfold, thresholds_dtype ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() # Replace default pass-through activation by thresholding activation # Note: Relies on type and shape definitions generated by the # "defines" method - act_a_softmax = "\n".join([ + act_av_matmul = "\n".join([ f"ThresholdsActivation<", - f" SeqFold," - f" KVLen/SeqFold," + f" EmbFold," + f" VDim/EmbFold," f" {num}," - f" AccASoftmax," - f" AType," - f" {bias}", + f" AccAVMatMul," + f" OutAVMatMul," + f" {bias}," + # Note: Not sure why the default comp::less does not work... 
+ f" comp::less_equal<{dtype_str}, {dtype_str}>", f">" ]) From c4a456ea0ed22137af96686c03125c5c9e7a31ab Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 8 Feb 2024 10:58:13 +0100 Subject: [PATCH 56/88] Correct test function name fixing copy and paste error --- tests/fpgadataflow/test_fpgadataflow_replicate_stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py index 3f802d5303..89d439fd3b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py +++ b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py @@ -29,7 +29,7 @@ # Creates a model executing stream replication -def mock_split_multi_heads(num_inputs, num_elems, num, dtype): +def mock_replicate_streams(num_inputs, num_elems, num, dtype): # Create a node representing the stream replication operation node = oh.make_node( # Operator type from the name of the fpgadataflow hlscustomop @@ -91,7 +91,7 @@ def mock_split_multi_heads(num_inputs, num_elems, num, dtype): # execution def test_replicate_stream_python(num_inputs, num_elems, num, dtype): # Make dummy model for testing - model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, num, dtype) # Prepare the execution context context = { @@ -131,7 +131,7 @@ def test_replicate_stream_python(num_inputs, num_elems, num, dtype): # execution def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): # Make dummy model for testing - model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, num, dtype) # Prepare the execution context context = { @@ -174,7 +174,7 @@ def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): # execution def test_replicate_stream_rtlsim(num_inputs, num_elems, num, dtype): # Make dummy model for testing - model = mock_split_multi_heads(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, num, dtype) # Prepare the execution context context = { From dba89869ddf7c12676514e679b36459f5e67312f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 4 Apr 2024 10:27:05 +0200 Subject: [PATCH 57/88] [Refactor] Patch all attention ops to work with new class hierarchy Note: This is just a provisional solution, transplanting the existing code into the namespace/module of the HLS backend without actually disentangling common and backend specific code. There is also, of course, no RTL backend of the attention operator. 
--- src/finn/custom_op/fpgadataflow/attention.py | 12 ++++++++--- .../custom_op/fpgadataflow/attention_heads.py | 20 ++++++++++++++----- .../custom_op/fpgadataflow/hls/__init__.py | 16 +++++++++++++++ .../fpgadataflow/replicate_stream.py | 12 ++++++++--- 4 files changed, 49 insertions(+), 11 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 72840b2722..8587761adc 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -9,7 +9,9 @@ import numpy as np # Derive custom operators form the FINN base custom op -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +# Specialize the custom op as HLS backend implementation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend # Convert and pack (numpy) data for C++ code generation from finn.util.data_packing import numpy_to_hls_code # QONNX/FINN datatypes @@ -45,7 +47,7 @@ def softmax(x, axis): # Scaled Dot-Product Attention Custom Operator # Note: Single head attention -class ScaledDotProductAttention(HLSCustomOp): +class ScaledDotProductAttention(HWCustomOp, HLSBackend): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -55,9 +57,13 @@ def __init__(self, onnx_node, **kwargs): # in another repository right now. def get_nodeattr_types(self): # Start from parent operator class attributes - attrs = super().get_nodeattr_types() + attrs = HWCustomOp.get_nodeattr_types(self) + attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ + # Force implementation style to HLS backend + "preferred_impl_style": ("s", False, "hls", {"", "hls"}), + # Embedding dimension of queries and keys "QKDim": ("i", True, 0), # Length of the query sequence diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 168ee1207e..6a738e0b13 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -19,11 +19,13 @@ # Converts inputs/outputs to/from RTL simulation format from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # Derive custom operators form the FINN base custom op -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +# Specialize the custom op as HLS backend implementation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend # Splitting of attention heads (after input projections) custom operator -class SplitMultiHeads(HLSCustomOp): +class SplitMultiHeads(HWCustomOp, HLSBackend): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -38,9 +40,13 @@ def __init__(self, onnx_node, **kwargs): # Defines attributes which must be present on this node def get_nodeattr_types(self): # Start from parent operator class attributes - attrs = super().get_nodeattr_types() + attrs = HWCustomOp.get_nodeattr_types(self) + attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ + # Force implementation style to HLS backend + "preferred_impl_style": ("s", False, "hls", {"", "hls"}), + # Number of 
attention heads "heads": ("i", True, 1), # Specifies whether the output is packed as a single output tensor @@ -545,7 +551,7 @@ def get_verilog_top_module_intf_names(self): # Merging of attention heads (before output projections) custom operator -class MergeMultiHeads(HLSCustomOp): +class MergeMultiHeads(HWCustomOp, HLSBackend): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -560,9 +566,13 @@ def __init__(self, onnx_node, **kwargs): # Defines attributes which must be present on this node def get_nodeattr_types(self): # Start from parent operator class attributes - attrs = super().get_nodeattr_types() + attrs = HWCustomOp.get_nodeattr_types(self) + attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ + # Force implementation style to HLS backend + "preferred_impl_style": ("s", False, "hls", {"", "hls"}), + # Number of attention heads "heads": ("i", True, 1), # Specifies whether the output is packed as a single output tensor diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..9eb6793bbc 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -79,3 +79,19 @@ custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls custom_op["MVAU_hls"] = MVAU_hls custom_op["VVAU_hls"] = VVAU_hls + +from finn.custom_op.fpgadataflow.attention import ( + ScaledDotProductAttention as ScaledDotProductAttention_hls +) +from finn.custom_op.fpgadataflow.attention_heads import ( + SplitMultiHeads as SplitMultiHeads_hls, + MergeMultiHeads as MergeMultiHeads_hls +) +from finn.custom_op.fpgadataflow.replicate_stream import ( + ReplicateStream as ReplicateStream_hls +) + +custom_op["ScaledDotProductAttention_hls"] = ScaledDotProductAttention_hls +custom_op["SplitMultiHeads_hls"] = SplitMultiHeads_hls +custom_op["MergeMultiHeads_hls"] = MergeMultiHeads_hls +custom_op["ReplicateStream_hls"] = ReplicateStream_hls diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index 4190897b4a..aaba72757b 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -17,12 +17,14 @@ # Converts inputs/outputs to/from RTL simulation format from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # Derive custom operators form the FINN base custom op -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +# Specialize the custom op as HLS backend implementation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend # Replicates an input stream to arbitrary many output streams # See DuplicateStreams_Batch for feeding exactly two streams -class ReplicateStream(HLSCustomOp): +class ReplicateStream(HWCustomOp, HLSBackend): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -37,9 +39,13 @@ def __init__(self, onnx_node, **kwargs): # Defines attributes which must be present on this node def get_nodeattr_types(self): # Start from parent operator class attributes - attrs = super().get_nodeattr_types() + attrs = HWCustomOp.get_nodeattr_types(self) + 
attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ + # Force implementation style to HLS backend + "preferred_impl_style": ("s", False, "hls", {"", "hls"}), + # Number of replicas to produce "num": ("i", True, 1), # Data type of input and output elements From fa13f467fad4aa2cdd43c637265f63b5f46b08f0 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 4 Apr 2024 17:45:02 +0200 Subject: [PATCH 58/88] [Refactor] Disentangle HWCustomOp and HLSBackend parts of attention ops --- .isort.cfg | 3 + src/finn/custom_op/fpgadataflow/attention.py | 671 +----------------- .../custom_op/fpgadataflow/attention_heads.py | 488 +------------ .../custom_op/fpgadataflow/hls/__init__.py | 19 +- .../fpgadataflow/hls/attention_heads_hls.py | 490 +++++++++++++ .../fpgadataflow/hls/attention_hls.py | 669 +++++++++++++++++ .../fpgadataflow/hls/replicate_stream_hls.py | 249 +++++++ .../fpgadataflow/replicate_stream.py | 251 +------ 8 files changed, 1480 insertions(+), 1360 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/attention_heads_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/attention_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py diff --git a/.isort.cfg b/.isort.cfg index 5378b88fad..efb7a4a352 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -9,3 +9,6 @@ sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER default_section=THIRDPARTY multi_line_output=3 profile=black +ignore_comments=true +ignore_whitespace=true +honor_noqa=true diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 8587761adc..293b442bc2 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -1,29 +1,24 @@ -# Operating system stuff, e.g. paths -import os -# Python warning subsystem -import warnings +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. 
+ # Python builtin math functions: math.ceil returns int, while np.ceil returns # float import math # Numpy math and arrays import numpy as np +# Python warning subsystem +import warnings -# Derive custom operators form the FINN base custom op -from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -# Specialize the custom op as HLS backend implementation -from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend -# Convert and pack (numpy) data for C++ code generation -from finn.util.data_packing import numpy_to_hls_code # QONNX/FINN datatypes -from qonnx.core.datatype import DataType # noqa qonnx dependency is specified -# in setup.cfg as well as in fetch-repos.sh -# QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa +from qonnx.core.datatype import DataType +# Multithreshold activations +from qonnx.custom_op.general.multithreshold import multithreshold # Some utils for working with tensors in qonnx -from qonnx.util.basic import ( # noqa - interleave_matrix_outer_dim_from_partitions, - calculate_matvec_accumulator_range -) +from qonnx.util.basic import calculate_matvec_accumulator_range + +# Derive custom operators form the FINN base custom op +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # Softmax function on numpy arrays with overflow handling matching the HLS @@ -47,23 +42,18 @@ def softmax(x, axis): # Scaled Dot-Product Attention Custom Operator # Note: Single head attention -class ScaledDotProductAttention(HWCustomOp, HLSBackend): +class ScaledDotProductAttention(HWCustomOp): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base super().__init__(onnx_node, **kwargs) - # WIP: Refactor the node attributes matching the HLS operator which is WIP - # in another repository right now. 
+ # Node attributes matching the HLS operator def get_nodeattr_types(self): # Start from parent operator class attributes attrs = HWCustomOp.get_nodeattr_types(self) - attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ - # Force implementation style to HLS backend - "preferred_impl_style": ("s", False, "hls", {"", "hls"}), - # Embedding dimension of queries and keys "QKDim": ("i", True, 0), # Length of the query sequence @@ -259,10 +249,6 @@ def infer_node_datatype(self, model): # Executes the attention operator in python mode simulation def _execute_node_python(self, context, graph): # noqa: graph unused - # Multithreshold activations - from qonnx.custom_op.general.multithreshold import \ - multithreshold # noqa - # Get the node wrapped by this custom op node = self.onnx_node @@ -386,57 +372,16 @@ def act_av_matmul(x): # Executes the attention operator in C++ mode simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op - node = self.onnx_node - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - - # By convention, inputs 0, 1 and 2 correspond to named inputs q, k and v - - # Read the input from the execution context and reshape to match the - # expected folding - q = context[node.input[0]].reshape(self.get_folded_input_shape(ind=0)) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"q.npy"), q) - - # Read the input from the execution context and reshape to match the - # expected folding - k = context[node.input[1]].reshape(self.get_folded_input_shape(ind=1)) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"k.npy"), k) - - # Read the input from the execution context and reshape to match the - # expected folding - v = context[node.input[2]].reshape(self.get_folded_input_shape(ind=2)) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"v.npy"), v) - - # Optionally, the mask may be provided as an input as well - if self.get_nodeattr("mask_mode") == "input": - # Read the input from the execution context and reshape to match the - # expected folding - m = context[node.input[3]].reshape( - self.get_folded_input_shape(ind=3) - ) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"m.npy"), m) - - # Execute the precompiled model - super().exec_precompiled_singlenode_model() - - # Load the output numpy file generated by the C++ simulation - out = np.load(os.path.join(code_gen_dir, f"out.npy")) - # Reshape the folded output and insert into the execution context - context[self.onnx_node.output[0]] = out.reshape( - self.get_normal_output_shape(ind=0) + # C++ Simulation needs to be implemented in HLS backend specialization + raise NotImplementedError( + f"exec_mode cppsim of {self.__class__.__name__} is not implemented!" ) # Executes the attention operator in RTL mode simulation def _execute_node_rtlsim(self, context, graph): # noqa: graph unused - # TODO: Implement rtlsim mode - # Note: Cannot even compile this right now due to missing float ips + # RTL Simulation needs to be implemented in backend specialization raise NotImplementedError( - "exec_mode rtlsim is not implemented yet!" + f"exec_mode rtlsim of {self.__class__.__name__} is not implemented!" 
) # Executes the attention operator in simulation (either python, c++ or rtl) @@ -632,61 +577,6 @@ def get_outstream_width(self, ind=0): # Width of a stream producing output elements in parallel return elems * o_bits - # Maximum width of any ap_int used in this operator - def get_ap_int_max_w(self): - # Find the widths of the widest input - i_bits_max = max((self.get_instream_width(ind) for ind in range(3))) - # Find the widths of the widest output - o_bits_max = max((self.get_outstream_width(ind) for ind in range(1))) - # Assume no bits to represent the mask, if there is no mask - m_bits = 0 - # A mask received as input has a bit-width as well - if self.get_nodeattr("mask_mode") in {"input", "const"}: - # Parallelism is the number of elements in the last dimension of the - # folded mask input - _, _, elems = self.get_folded_input_shape(ind=3) - # Get width of the mask datatype - m_bits = elems * DataType[self.get_nodeattr("MType")].bitwidth() - - # Elements per folded key input (second input) - _, _, i_elems = self.get_folded_input_shape(ind=1) - # Elements per folded value input (third input), same as the number of - # output elements - _, _, o_elems = self.get_folded_input_shape(ind=2) - - # Parallelism is the number of elements in the last dimension of the - # folded attention weights - _, _, s_elems = self.get_folded_attention_shape() - # Number of bits used for the attention weights stream - a_bits = s_elems * DataType[self.get_nodeattr("AType")].bitwidth() - - # Maximum bits per tile of the key and value matrix streams - tile_bits_max = max([ - i_elems * s_elems * DataType[self.get_nodeattr("KType")].bitwidth(), - o_elems * s_elems * DataType[self.get_nodeattr("VType")].bitwidth(), - ]) - # Maximum bits per matmul accumulators - acc_bits_max = max([ - # These are not streamed, thus single element width is counted - DataType[self.get_nodeattr("AccQKMatMul")].bitwidth(), - DataType[self.get_nodeattr("AccAVMatMul")].bitwidth(), - ]) - # Maximum bits per matmul outputs - out_bits_max = max([ - # These are the stream widths, which are always >= than individual - # elements - s_elems * DataType[self.get_nodeattr("OutQKMatMul")].bitwidth(), - o_elems * DataType[self.get_nodeattr("OutAVMatMul")].bitwidth(), - ]) - # Aggregate the maximum bit width in both matmul operators over all - # inputs, intermediates and outputs - matmul_bits_max = max([ - tile_bits_max, acc_bits_max, out_bits_max - ]) - - # Find maximum of all (maximal) bit-widths - return max([i_bits_max, o_bits_max, m_bits, a_bits, matmul_bits_max]) - # Minimize the accumulator bit width def minimize_accumulator_width(self, model): # noqa: model is unused # Get the query, key, value and attention weights type @@ -778,14 +668,6 @@ def get_number_output_values(self): # the embedding dimension return np.prod(self.get_folded_output_shape()[:-1]) - # Generates list of C++ includes to be placed at the top of the generated - # code - def global_includes(self): - # FINN HLSLIB activation functions: e.g. PassThroughActivation - self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - # Attention operator HLS code - self.code_gen_dict["$GLOBALS$"] += ['#include "attention.hpp"'] - # Converts names of optional inputs to the node input index and from there # to the ONNX node input name if the input is present. 
# Note: This mapping is required as the ONNX graph/node may provide @@ -827,516 +709,3 @@ def get_input_name_by_name(self, name): # Find the position of the requested input name and look up the # corresponding input name of the ONNX node return self.onnx_node.input[inputs.index(name)] - - # Generates C++ parameters file, i.e. activation function thresholds - def generate_params(self, model: ModelWrapper, path): - # The code generation directory is specified as an argument, so this - # will work for both RTL and C++ simulation - code_gen_dir = path - - # Note: The attention operator itself has no weights to be generated as - # a parameter file - - # Start all three activations defaulting to pass-through of the - # accumulator type. - # Note: This might allow type-casts to the output types if they are - # not the same as the accumulators. - act_qk_matmul = "PassThroughActivation" - act_av_matmul = "PassThroughActivation" - act_a_softmax = "PassThroughActivation" - - # Start all thresholds defaulting to empty default initializer braces - thresholds_qk_matmul = "{}" - thresholds_av_matmul = "{}" - thresholds_a_softmax = "{}" - - # Prepares a threshold tensor as C++ string for code generation - def prepare_thresholds(ts, length, fold, dtype): - # Number of thresholds is given as the last dimension of the - # threshold tensor, first dimension is covering all output elements - num = ts.shape[-1] # noqa - # Partition the thresholds along the length into folds of parallel - # elements - ts = interleave_matrix_outer_dim_from_partitions(ts, length // fold) - # Reshape folded thresholds adding an outer dimension - # TODO: Why? MVAU does this, just copied the behavior. This is - # probably to generate the outer C++ initializer braces {} for - # object construction. Isn't it weird to rely on an artificial - # dimension just to have the code generator produce the correct - # string? - ts = ts.reshape(1, length // fold, fold, num) - # Format the thresholds as C++ array code - # Note: no packing, no variable name/type declaration - return numpy_to_hls_code(ts, dtype, "_", False, True), num - - # Get shape and folding configuration. 
None of the activations fold - # along the query-key embedding dimension or the query sequence length - (_, _, vdim, kvlen), (embfold, seqfold) = self.shapes, self.folds - - # Query-key matmul can have an optional activation function set to - # thresholding activations via node attribute - if self.get_nodeattr("ActQKMatMul") == "thresholds": - # In this case there will be a thresholds parameter initializer - thresholds = model.get_initializer( - self.get_input_name_by_name("thresholds_qk_matmul") - ) - # Get the datatype of the thresholds - thresholds_dtype = DataType[self.get_nodeattr("AccQKMatMul")] - # Activation value, i.e., bias applied after thresholding activation - bias = self.get_nodeattr("BiasActQKMatMul") - # No support for floating-point bias - assert int(bias) == bias, "BiasActQKMatMul must be integer" - # Convert the bias to integer representation, so it can be used as a - # template argument - bias = int(bias) - # Format the thresholds as C++ array code: QK matmul outputs fold - # along the key-value sequence length dimension - thresholds_qk_matmul, num = prepare_thresholds( - thresholds, kvlen, seqfold, thresholds_dtype - ) - # Get the HLS datatype string corresponding to the thresholds - # datatype for C++ code generation - dtype_str = thresholds_dtype.get_hls_datatype_str() - # Replace default pass-through activation by thresholding activation - # Note: Relies on type and shape definitions generated by the - # "defines" method - act_qk_matmul = "\n".join([ - f"ThresholdsActivation<", - f" SeqFold," - f" KVLen/SeqFold," - f" {num}," - f" AccQKMatMul," - f" OutQKMatMul," - f" {bias}," - # Note: Not sure why the default comp::less does not work... - f" comp::less_equal<{dtype_str}, {dtype_str}>", - f">" - ]) - - # Softmax can have an optional activation function set to thresholding - # activations via node attribute - if self.get_nodeattr("ActASoftmax") == "thresholds": - # In this case there will be a thresholds parameter initializer - thresholds = model.get_initializer( - self.get_input_name_by_name("thresholds_a_softmax") - ) - # Get the datatype of the thresholds - thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] - # Activation value, i.e., bias applied after thresholding activation - bias = self.get_nodeattr("BiasActASoftmax") - # No support for floating-point bias - assert int(bias) == bias, "BiasActASoftmax must be integer" - # Convert the bias to integer representation, so it can be used as a - # template argument - bias = int(bias) - # Format the thresholds as C++ array code: Softmax outputs fold - # along the key-value sequence length dimension - thresholds_a_softmax, num = prepare_thresholds( - thresholds, kvlen, seqfold, thresholds_dtype - ) - # Get the HLS datatype string corresponding to the thresholds - # datatype for C++ code generation - dtype_str = thresholds_dtype.get_hls_datatype_str() - # Replace default pass-through activation by thresholding activation - # Note: Relies on type and shape definitions generated by the - # "defines" method - act_a_softmax = "\n".join([ - f"ThresholdsActivation<", - f" SeqFold," - f" KVLen/SeqFold," - f" {num}," - f" AccASoftmax," - f" AType," - f" {bias}," - # Note: Not sure why the default comp::less does not work... 
- f" comp::less_equal<{dtype_str}, {dtype_str}>", - f">" - ]) - - # Attention-value matmul can have an optional activation function set to - # thresholding activations via node attribute - if self.get_nodeattr("ActAVMatMul") == "thresholds": - # In this case there will be a thresholds parameter initializer - thresholds = model.get_initializer( - self.get_input_name_by_name("thresholds_av_matmul") - ) - # Get the datatype of the thresholds - thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] - # Activation value, i.e., bias applied after thresholding activation - bias = self.get_nodeattr("BiasActAVMatMul") - # No support for floating-point bias - assert int(bias) == bias, "BiasActAVMatMul must be integer" - # Convert the bias to integer representation, so it can be used as a - # template argument - bias = int(bias) - # Format the thresholds as C++ array code: AV matmul outputs fold - # along the value embedding dimension - thresholds_av_matmul, num = prepare_thresholds( - thresholds, vdim, embfold, thresholds_dtype - ) - # Get the HLS datatype string corresponding to the thresholds - # datatype for C++ code generation - dtype_str = thresholds_dtype.get_hls_datatype_str() - # Replace default pass-through activation by thresholding activation - # Note: Relies on type and shape definitions generated by the - # "defines" method - act_av_matmul = "\n".join([ - f"ThresholdsActivation<", - f" EmbFold," - f" VDim/EmbFold," - f" {num}," - f" AccAVMatMul," - f" OutAVMatMul," - f" {bias}," - # Note: Not sure why the default comp::less does not work... - f" comp::less_equal<{dtype_str}, {dtype_str}>", - f">" - ]) - - # Assume no attention mask as a default: Generate C++ code of tag - # instance of "none" mask type - attention_mask = \ - "static const auto attention_mask = attention::mask::NONE" - - # If a causal mask is specified, set the appropriate tag dispatching - # instance - if self.get_nodeattr("mask_mode") == "causal": - # Generate C++ code of tag instance of causal mask type - attention_mask = \ - "static const auto attention_mask = attention::mask::CAUSAL" - - # If a constant mask is specified, array code needs to be generated - if self.get_nodeattr("mask_mode") == "const": - # Attention mask type of folded constant mask array - mask_type = "attention::mask::Const" - # Get the constant mask values - mask = model.get_initializer(self.get_input_name_by_name("M")) - # Num should always be equal to QLen - num = mask.shape[-1] - # Partition the mask along the length into folds of parallel - # elements - mask = interleave_matrix_outer_dim_from_partitions( - mask, kvlen // seqfold - ) - # Reshape folded mask adding an outer dimension - mask = mask.reshape(num, kvlen // seqfold, seqfold).squeeze() - # Format the mask as C++ array code - # Note: no packing, no variable name/type declaration - mask = numpy_to_hls_code(mask, DataType["BINARY"], "_", False, True) - # Generate C++ code initializing the constant mask array - attention_mask = f"static const {mask_type} attention_mask = {mask}" - - # Of a mask is provided as input, no object parameters need to be - # generated here - if self.get_nodeattr("mask_mode") == "input": - # Attention mask type of input stream - mask_type = "attention::mask::Input" - # Generate C++ code creating an input stream instance for the mask - # Note: This is just a dummy, the real input stream will be part - # of the operator interface - attention_mask = f"static const {mask_type} attention_mask;" - - # Open a file to store the thresholds parameters as C++ code - 
with open(f"{code_gen_dir}/params.hpp", "w") as file: - # Write lines of C++ code separated by newlines to the file - file.write("\n".join([ - # Scale factor preceding the softmax activation function to - # dequantize the input to floating-point representation - f"static const float dequant_softmax =" - f" {self.get_nodeattr('DequantSoftmax')};", - # Attention mask parameters if "none", "causal" or "const" - f"{attention_mask};", - # Type alias to the generated attention mask for convenience - f"using AttentionMask = decltype(attention_mask);", - # Add type definition and threshold initialization of the - # query-key matmul activation - f"using ActQKMatMul = {act_qk_matmul};", - f"ActQKMatMul act_qk_matmul = {thresholds_qk_matmul};", - # Add type definition and threshold initialization of the - # attention-value matmul activation - f"using ActAVMatMul = {act_av_matmul};", - f"ActAVMatMul act_av_matmul = {thresholds_av_matmul};", - # Add type definition and threshold initialization of the - # softmax activation - f"using ActASoftmax = {act_a_softmax};", - f"ActASoftmax act_a_softmax = {thresholds_a_softmax};", - # Append a newline at the end of the file (to avoid problems - # when including, required by C standard?) - "\n" - ])) - - # Generates C++ code of type alias, global constant and macro definitions - def defines(self, var): - # Generate shape definitions from attributes to C++ constant definitions - def shapedefs(*names): - # C++ qualified type to be used for shape constants - shape = "static constexpr std::size_t" - # Generate a C++ constant definition for each of the attributes - # given by argument list names - return ( - f"{shape} {name} = {self.get_nodeattr(name)};" for name in names - ) - - # Generate datatype definitions mapping from QONNX DataType to HLS type - def typedefs(*names): - # Gets the HLS type string for the datatype specified by the named - # attribute - def hls_type(name): - # Looks up the datatype specified for the attribute and - # translates from QONNX to HLS type - return DataType[self.get_nodeattr(name)].get_hls_datatype_str() - - # Generate a C++ type alias definition for each of the attributes - # given by argument list names - return (f"using {name} = {hls_type(name)};" for name in names) - - # Insert constants and type aliases into the dictionary - self.code_gen_dict["$DEFINES$"] = [ - # Shape constant definitions of attention inputs (query, key and - # value) and folding configuration - *shapedefs( - "QKDim", - "QLen", - "VDim", - "KVLen", - "EmbFold", - "SeqFold" - ), - # Type alias definitions for all input, output and intermediate - # datatypes - *typedefs( - "QType", - "KType", - "VType", - "MType", - "AType", - "OType" - ), - # Type alias definitions for the matmul accumulators and output - # datatypes - *typedefs( - "AccQKMatMul", - "OutQKMatMul", - "AccAVMatMul", - "OutAVMatMul", - "AccASoftmax" - ), - # Include the activation function type definitions and parameters - # Note: The typedefs in this header require the typedefs above, - # thus adding this to the global includes is not possible. 
- f'#include "params.hpp"', - # Type alias of the properly configured attention operator class - f"using Attention = ScaledDotProductAttention<", - f" QKDim,", - f" QLen,", - f" VDim,", - f" KVLen,", - f" EmbFold,", - f" SeqFold,", - f" QType,", - f" KType,", - f" VType,", - f" MType,", - f" AType,", - f" OType,", # Note: OType and last MatMul out must match - f" AccQKMatMul,", - f" OutQKMatMul,", - f" ActQKMatMul,", - f" AccAVMatMul,", - f" OType,", # Note: OType and last MatMul out must match - f" ActAVMatMul,", - f" ActASoftmax", - f">;", - # Short type aliases of attention input and output streams - f"using QStream = Attention::QStream;", - f"using KStream = Attention::KStream;", - f"using VStream = Attention::VStream;", - f"using OStream = Attention::OStream;", - f"using MStream = Attention::MStream;", - ] - - # Generates C++ code for reading data from .npy (numpy format) for testing - # in C++ simulation - def read_npy_data(self): - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - - # Generate function calls for reading the input files into the input - # streams - self.code_gen_dict["$READNPYDATA$"] = [ - # Deduce the datatype of elements packed into the query input stream - # TODO: Maybe these type-deductions can be removed by changing the - # order of the template arguments of the npy2apintstream, such - # that type-deduction is handled there? - f'using QPacked = decltype(QStream{{}}.read());', - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f' "{code_gen_dir}/q.npy", q_{self.hls_sname()}, false', - ');', - - # Deduce the datatype of elements packed into the key input stream - f'using KPacked = decltype(KStream{{}}.read());', - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f' "{code_gen_dir}/k.npy", k_{self.hls_sname()}, false', - ');', - - # Deduce the datatype of elements packed into the value input stream - f'using VPacked = decltype(VStream{{}}.read());', - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f' "{code_gen_dir}/v.npy", v_{self.hls_sname()}, false', - ');', - ] - - # If the mask is provided as an input, it needs to be read as well - if self.get_nodeattr("mask_mode") == "input": - # Generate function call for reading the mask file into the input - # stream - self.code_gen_dict["$READNPYDATA$"] += [ - # Deduce the datatype of elements packed into the mask input - # stream - f'using MPacked = decltype(MStream{{}}.read());', - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f' "{code_gen_dir}/m.npy", m_{self.hls_sname()}, false', - ');', - ] - - # Generates C++ code for declaring all streams involved in C++ simulation - # for testing - def strm_decl(self): - # Declare input (query, key, value) and output streams - self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # Note: Assumes stream type aliases to be set in defines - f"QStream q_{self.hls_sname()};", - f"KStream k_{self.hls_sname()};", - f"VStream v_{self.hls_sname()};", - f"OStream out_{self.hls_sname()};" - ] - # If the mask is provided as an input, it needs a stream declaration as - # well - if self.get_nodeattr("mask_mode") == "input": - # 
Append the mask stream to the declaration list - self.code_gen_dict["$STREAMDECLARATIONS$"] += [ - # Note: Assumes stream type aliases to be set in defines - f"MStream m_{self.hls_sname()};", - ] - - # Generates C++ code for calling the computation part of the operator - def docompute(self): - # Write the body of the attention top-level function - self.code_gen_dict["$DOCOMPUTE$"] = [ - # Instantiate the attention operator and connect to the generated - # threshold parameters - # Note: Assumes "Attention" to be aliased appropriate configuration - # in defines with. - # Note: Assumes parameters to be generated in 'generate_params' and - # made available via include/defines before. - f"Attention attention {{", - f" act_qk_matmul, act_av_matmul, act_a_softmax, dequant_softmax", - f"}};", - # Connect the attention operator to the input and output streams - f"attention(" - f"q_{self.hls_sname()}, " - f"k_{self.hls_sname()}, " - f"v_{self.hls_sname()}, " - f"out_{self.hls_sname()}, " - # TODO: Does not work for "input" mode mask - f"attention_mask" - f");", - ] - - # Generates C++ code for reading the output stream and converting back to - # numpy format for testing in C** simulation - def dataoutstrm(self): - # Output data will be stored in numpy files in the code generation - # dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the expected shape of the folded output array formatted as a C++ - # vector initializer - # Note: Valid formatting relies on correct placement of curly braces - # and line breaks: Open/close all three braces on the same line of code - # to avoid '\n' to be inserted into the string - shape = f"""{{{ - ','.join((str(i) for i in self.get_folded_output_shape())) - }}}""" - # Generate function call for reading from the output stream into the - # output file - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - # Deduce the datatype of elements packed into the output stream - f'using OPacked = decltype(OStream{{}}.read());', - # Generate function call reading from stream into the output file - # Note: Outputs are always represented as numpy floats - f'apintstream2npy(', - f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', - ');', - ] - - # Generates C++ code for saving the output of C++ simulation to a file in - # numpy format - def save_as_npy(self): - # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used - # for something before, which is now integrated into dataoutstrm()? - self.code_gen_dict["$SAVEASCNPY$"] = [] - - # Generates essentially the head of the C++ function from which the IP block - # will be generated during ipgen, i.e. 
actual synthesis - def blackboxfunction(self): - # Insert function head describing the top level interface of the - # attention operator - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - # Note: Assumes stream type aliases to be set in defines - f"void {self.onnx_node.name} (", - f" QStream &q_{self.hls_sname()}," - f" KStream &k_{self.hls_sname()}," - f" VStream &v_{self.hls_sname()}," - f" OStream &out_{self.hls_sname()}", - f")", - ] - - # Generates C++ pragmas to be inserted into the main function of the C++ - # simulation and the ipgen-blackboxfunction as well - def pragmas(self): - # Add HLS interface directives specifying how to create RTL ports for - # the top-level function arguments - self.code_gen_dict["$PRAGMAS$"] = [ - # Connect the query input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=q_{self.hls_sname()}", - # Connect the key input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=k_{self.hls_sname()}", - # Connect the value input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=v_{self.hls_sname()}", - # Connect the output stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}", - ] - # No block-level I/O protocol for the function return value - self.code_gen_dict["$PRAGMAS$"].append( - f"#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - # Returns the names of input and output interfaces grouped by protocol - def get_verilog_top_module_intf_names(self): - # Start collecting interface names in a dictionary starting with clock - # and reset - intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa - # AXI stream input interfaces - intf_names["s_axis"] = [ - (f"q_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), - (f"k_{self.hls_sname()}", self.get_instream_width_padded(ind=1)), - (f"v_{self.hls_sname()}", self.get_instream_width_padded(ind=2)) - ] - # AXI stream output interfaces - intf_names["m_axis"] = [ - (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)) - ] - # No AXI-MM, AXI-Lite or protocol-less interfaces - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - # Return the interface name dictionary - return intf_names diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 6a738e0b13..ef8bd652c6 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -1,31 +1,29 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np # Operating system stuff, e.g. 
paths import os # Python warning subsystem import warnings -# Numpy math and arrays -import numpy as np -# Protobuf onnx graph node type -from onnx import NodeProto # noqa # Helper for creating ONNX nodes -from onnx import helper as oh # noqa +from onnx import helper as oh # QONNX/FINN datatypes -from qonnx.core.datatype import DataType # noqa qonnx dependency is specified -# in setup.cfg as well as in fetch-repos.sh +from qonnx.core.datatype import DataType # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa - -# Converts inputs/outputs to/from RTL simulation format -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from qonnx.core.modelwrapper import ModelWrapper # Derive custom operators form the FINN base custom op from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -# Specialize the custom op as HLS backend implementation -from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +# Converts inputs/outputs to/from RTL simulation format +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # Splitting of attention heads (after input projections) custom operator -class SplitMultiHeads(HWCustomOp, HLSBackend): +class SplitMultiHeads(HWCustomOp): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -41,12 +39,8 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): # Start from parent operator class attributes attrs = HWCustomOp.get_nodeattr_types(self) - attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ - # Force implementation style to HLS backend - "preferred_impl_style": ("s", False, "hls", {"", "hls"}), - # Number of attention heads "heads": ("i", True, 1), # Specifies whether the output is packed as a single output tensor @@ -183,32 +177,12 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # Write the output into the execution context context[o] = out - # Executes multi-head slicing in C++ simulation + # Executes multi-head splitting in C++ simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op # noqa Duplicate - node = self.onnx_node - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the input out of the execution context - # Note: Shape must be either seq x 1 x dim or seq x dim - inp = context[node.input[0]] - # Validate the shape of the input - assert inp.shape == self.get_normal_input_shape(ind=0), \ - f"Input shape mismatch for {node.input[0]}" - # Reshape the input into folded form - inp = inp.reshape(self.get_folded_input_shape(ind=0)) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"in.npy"), inp) - - # Execute the precompiled model - super().exec_precompiled_singlenode_model() - - # Enumerate the node outputs - for i, name in enumerate(node.output): - # Load the output numpy file generated by the C++ simulation - out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) - # Reshape the folded output and insert into the execution context - context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + # C++ Simulation needs to be implemented in HLS backend specialization + raise NotImplementedError( + f"exec_mode cppsim of 
{self.__class__.__name__} is not implemented!" + ) # Executes multi-head slicing in RTL simulation def _execute_node_rtlsim(self, context, graph): # noqa: graph unused @@ -284,7 +258,7 @@ def verify_node(self): # TODO: Implement return [] - # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff # Gets the datatype of input at index ind def get_input_datatype(self, ind=0): @@ -341,19 +315,6 @@ def get_outstream_width(self, ind=0): # Width of a stream producing output elements in parallel return elems * o_bits - # Maximum width of any ap_int used in this operator - def get_ap_int_max_w(self): - # Find the widths of the widest input - # Note: There is just one input. - i_bits_max = self.get_instream_width(ind=0) - # Find the widths of the widest output - # Note: there is one output per head - o_bits_max = max( - (self.get_outstream_width(ind) for ind in range(self.heads)) - ) - # Find the biggest of the inputs/outputs - return max([i_bits_max, o_bits_max]) - # Gets the number of expected output values, i.e. how many times read() # could/should be called on any output stream of this operator def get_number_output_values(self): @@ -364,194 +325,9 @@ def get_number_output_values(self): # N outputs per cycle... return np.prod(self.get_folded_output_shape()[:-1]) * self.heads - # Note: End of shape and datatype utilities - - # Generates list of C++ includes to be placed at the top of the generated - # code - def global_includes(self): - # Currently nothing to include - self.code_gen_dict["$GLOBALS$"] = [] - - # Generates C++ code of type alias, global constant and macro definitions - def defines(self, var): - # Insert constants and type aliases into the dictionary - self.code_gen_dict["$DEFINES$"] = [ - # Input and output element datatypes - f"using IType = {self.dtype.get_hls_datatype_str()};", - f"using OType = {self.dtype.get_hls_datatype_str()};", - # Datatype of elements packed into the input stream - f"using IPacked = ap_uint<{self.get_instream_width()}>;", - # Datatype of elements packed into the output stream - f"using OPacked = ap_uint<{self.get_outstream_width()}>;", - # Input and output HLS stream datatypes - f"using IStream = hls::stream<" - f" ap_uint<{self.get_instream_width()}>" - f">;", - f"using OStream = hls::stream<" - f" ap_uint<{self.get_outstream_width()}>" - f">;", - ] - - # Generates C++ code for reading data from .npy (numpy format) for testing - # in C++ simulation - def read_npy_data(self): - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Generate function calls for reading the input files into the input - # streams - self.code_gen_dict["$READNPYDATA$"] = [ - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', - f');' - ] - - # Generates C++ code for declaring all streams involved in C++ simulation - # for testing - def strm_decl(self): - # Declare input and output streams - # Note: Assumes stream type aliases to be set in defines - self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # There is one input datastream - f"IStream in_{self.hls_sname()};", - # There is one output datastream per head - *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.heads)) - ] - - # Generates C++ code for calling the computation part of the operator - def 
docompute(self): - # Generates the bit-slicing indices string for the ith split of the - # input - def split(i): - # Assemble a C++ indexing/bit-slicing string - return f"({i + 1} * OPacked::width - 1, {i} * OPacked::width)" - - # Generates the name of the ith output stream - def out(i): - return f"out{i}_{self.hls_sname()}" - - # Write the body of the head-splitting top-level function - self.code_gen_dict["$DOCOMPUTE$"] = [ - # Repeat for the number of inputs - # Note: Repeat for all num_inputs dimensions - f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", - # Pipeline the steps of this loop - f"#pragma HLS pipeline II=1 style=flp", - # Read the next input element from the stream - f"const auto x = in_{self.hls_sname()}.read();", - # Split the next element from the input stream into the number of - # output elements per head and write into the corresponding stream - *(f"{out(i)}.write(x{split(i)});" for i in range(self.heads)), - # End of for-loop over repetitions body - f"}}" - ] - - # Generates C++ code for reading the output stream and converting back to - # numpy format for testing in C++ simulation - def dataoutstrm(self): - # Output data will be stored in numpy files in the # noqa Duplicate - # code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the expected shape of the folded output array formatted as a C++ - # vector initializer - # Note: Valid formatting relies on correct placement of curly braces - # and line breaks: Open/close all three braces on the same line of code - # to avoid '\n' to be inserted into the string - shape = f"""{{{ - ','.join((str(i) for i in self.get_folded_output_shape())) - }}}""" - # Start collecting function calls to write the output data stream - self.code_gen_dict["$DATAOUTSTREAM$"] = [] - - # Generates the name of the ith output stream - def out(i): - return f"out{i}_{self.hls_sname()}" - - # Generate code for each output stream - for i in range(self.heads): - # Append each reading/writing function call - self.code_gen_dict["$DATAOUTSTREAM$"] += [ - # Generate function call reading from stream into the output - # file - # Note: Outputs are always represented as numpy floats - f'apintstream2npy(', - f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', - f');' - ] - - # Generates C++ code for saving the output of C++ simulation to a file in - # numpy format - def save_as_npy(self): - # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used - # for something before, which is now integrated into dataoutstrm()? - self.code_gen_dict["$SAVEASCNPY$"] = [] - - # Generates essentially the head of the C++ function from which the IP block - # will be generated during ipgen, i.e. 
actual synthesis - def blackboxfunction(self): - # Insert function head describing the top level interface of the head - # splitting operator - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - # @formatter:off Prevent Python formatter from messing with C++ - # formatting - # Note: Assumes stream type aliases to be set in defines - f"void {self.onnx_node.name} (", - # Input HLS stream - f" IStream &in_{self.hls_sname()}, ", ",".join([ - # One output HLS stream per head # noqa: Formatting - f" OStream &out{i}_{self.hls_sname()}" for i in range(self.heads) - ]), - f")", - # @formatter:off - ] - - # Generates C++ pragmas to be inserted into the main function of the C++ - # simulation and the ipgen-blackboxfunction as well - def pragmas(self): - # Add HLS interface directives specifying how to create RTL ports for - # the top-level function arguments - self.code_gen_dict["$PRAGMAS$"] = [ - # Connect the input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" - ] - # Connect each output stream with an axi stream interface - for i in range(self.heads): - # Add new interface directive for the output stream - self.code_gen_dict["$PRAGMAS$"] += [ - f"#pragma HLS INTERFACE axis port=out{i}_{self.hls_sname()}" - ] - # No block-level I/O protocol for the function return value - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - # Returns the names of input and output interfaces grouped by protocol - def get_verilog_top_module_intf_names(self): - # Start collecting interface names in a dictionary # noqa Duplicate - # starting with clock and reset - intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa - # AXI stream input interfaces - intf_names["s_axis"] = [ - # Just one input stream - (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), - ] - # AXI stream output interfaces - intf_names["m_axis"] = [ - # One output stream per head - (f"out{i}_{self.hls_sname()}", - self.get_outstream_width_padded(ind=i)) for i in range(self.heads) - ] - # No AXI-MM, AXI-Lite or protocol-less interfaces - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - # Return the interface name dictionary - return intf_names - # Merging of attention heads (before output projections) custom operator -class MergeMultiHeads(HWCustomOp, HLSBackend): +class MergeMultiHeads(HWCustomOp): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -567,12 +343,8 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): # Start from parent operator class attributes attrs = HWCustomOp.get_nodeattr_types(self) - attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ - # Force implementation style to HLS backend - "preferred_impl_style": ("s", False, "hls", {"", "hls"}), - # Number of attention heads "heads": ("i", True, 1), # Specifies whether the output is packed as a single output tensor @@ -657,7 +429,7 @@ def make_shape_compatible_op(self, model: ModelWrapper): # noqa # Infers the datatype of the node output def infer_node_datatype(self, model: ModelWrapper): # noqa # Get the node wrapped by this custom op - node = self.onnx_node # noqa Duplicate + node = self.onnx_node # noqa Duplicate # Test for changing input datatype if model.get_tensor_datatype(node.input[0]) != self.dtype: # Get the new datatype @@ -705,34 
+477,11 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # which might be squeezed context[node.output[0]] = out - # Executes multi-head slicing in C++ simulation + # Executes multi-head merging in C++ simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op - node = self.onnx_node - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - - # Enumerate the node outputs - for i, name in enumerate(node.input): - # Get the input out of the execution context - # Note: Shape must be either 1 x seq x dim or seq x dim - inp = context[name] - # Validate the shape of the input - assert inp.shape == self.get_normal_input_shape(ind=i), \ - f"Input shape mismatch for {name}" - # Reshape the input into folded form - inp = inp.reshape(self.get_folded_input_shape(ind=i)) - # Save the folded inputs to file to be used by simulation - np.save(os.path.join(code_gen_dir, f"in{i}.npy"), inp) - - # Execute the precompiled model - super().exec_precompiled_singlenode_model() - - # Load the output numpy file generated by the C++ simulation - out = np.load(os.path.join(code_gen_dir, f"out.npy")) - # Reshape the folded output and insert into the execution context - context[node.output[0]] = out.reshape( - self.get_normal_output_shape(ind=0) + # C++ Simulation needs to be implemented in HLS backend specialization + raise NotImplementedError( + f"exec_mode cppsim of {self.__class__.__name__} is not implemented!" ) # Executes multi-head slicing in RTL simulation @@ -815,7 +564,7 @@ def verify_node(self): # TODO: Implement return [] - # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff # Gets the datatype of input at index ind def get_input_datatype(self, ind=0): @@ -872,196 +621,9 @@ def get_outstream_width(self, ind=0): # Width of a stream producing output elements in parallel return elems * o_bits - # Maximum width of any ap_int used in this operator - def get_ap_int_max_w(self): - # Find the widths of the widest input - # Note: There is just one input. - i_bits_max = self.get_instream_width(ind=0) - # Find the widths of the widest output - # Note: there is one output per head - o_bits_max = max( - (self.get_outstream_width(ind) for ind in range(self.heads)) - ) - # Find the biggest of the inputs/outputs - return max([i_bits_max, o_bits_max]) - # Gets the number of expected output values, i.e. 
how many times read() # could/should be called on any output stream of this operator def get_number_output_values(self): # Elements over all but the last dimension of the output folded along # the embedding dimension return np.prod(self.get_folded_output_shape()[:-1]) - - # Note: End of shape and datatype utilities - - # Generates list of C++ includes to be placed at the top of the generated - # code - def global_includes(self): - # Currently nothing to include - self.code_gen_dict["$GLOBALS$"] = [] - - # Generates C++ code of type alias, global constant and macro definitions - def defines(self, var): - # Insert constants and type aliases into the dictionary - self.code_gen_dict["$DEFINES$"] = [ - # Input and output element datatypes - f"using IType = {self.dtype.get_hls_datatype_str()};", - f"using OType = {self.dtype.get_hls_datatype_str()};", - # Datatype of elements packed into the input stream - f"using IPacked = ap_uint<{self.get_instream_width()}>;", - # Datatype of elements packed into the output stream - f"using OPacked = ap_uint<{self.get_outstream_width()}>;", - # Input and output HLS stream datatypes - f"using IStream = hls::stream<" - f" ap_uint<{self.get_instream_width()}>" - f">;", - f"using OStream = hls::stream<" - f" ap_uint<{self.get_outstream_width()}>" - f">;", - ] - - # Generates C++ code for reading data from .npy (numpy format) for testing - # in C++ simulation - def read_npy_data(self): - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Generate function calls for reading the input files into the input - # streams - self.code_gen_dict["$READNPYDATA$"] = [] - # Generate code for each input stream - for i in range(self.heads): - # Append each reading/writing function call - self.code_gen_dict["$READNPYDATA$"] += [ - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f'"{code_gen_dir}/in{i}.npy", in{i}_{self.hls_sname()}, false', - f');' - ] - - # Generates C++ code for declaring all streams involved in C++ simulation - # for testing - def strm_decl(self): - # Declare input and output streams - # Note: Assumes stream type aliases to be set in defines - self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # There is one output stream - f"OStream out_{self.hls_sname()};", - # There is one input stream per head - *(f"IStream in{i}_{self.hls_sname()};" for i in range(self.heads)) - ] - - # Generates C++ code for calling the computation part of the operator - def docompute(self): - reversed_reads = ", ".join([ - f"in{i}_{self.hls_sname()}.read()" - for i in reversed(range(self.heads)) - ]) - - # Write the body of the head-splitting top-level function - self.code_gen_dict["$DOCOMPUTE$"] = [ - # Repeat for the number of inputs - # Note: Repeat for all num_inputs dimensions - f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", - # Pipeline the steps of this loop - f"#pragma HLS pipeline II=1 style=flp", - # Read the next input element from each input stream and concatenate - # using the comma operator overload of ap_uint, writing into the - # output stream - f"out_{self.hls_sname()}.write(({reversed_reads}));" - # End of for-loop over repetitions body - f"}}" - ] - - # Generates C++ code for reading the output stream and converting back to - # numpy format for testing in C** simulation - def dataoutstrm(self): - # Output data will be stored in numpy files in the code generation - # 
dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the expected shape of the folded output array formatted as a C++ - # vector initializer - # Note: Valid formatting relies on correct placement of curly braces - # and line breaks: Open/close all three braces on the same line of code - # to avoid '\n' to be inserted into the string - shape = f"""{{{ - ','.join((str(i) for i in self.get_folded_output_shape())) - }}}""" - # Generate function call for reading from the output stream into the - # output file - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - # Generate function call reading from stream into the output file - # Note: Outputs are always represented as numpy floats - f'apintstream2npy(', - f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', - f');', - ] - - # Generates C++ code for saving the output of C++ simulation to a file in - # numpy format - def save_as_npy(self): - # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used - # for something before, which is now integrated into dataoutstrm()? - self.code_gen_dict["$SAVEASCNPY$"] = [] - - # Generates essentially the head of the C++ function from which the IP block - # will be generated during ipgen, i.e. actual synthesis - def blackboxfunction(self): - # Insert function head describing the top level interface of the head - # splitting operator - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - # @formatter:off Prevent Python formatter from messing with C++ - # formatting - # Note: Assumes stream type aliases to be set in defines - f"void {self.onnx_node.name} (", - # Output HLS stream - f" OStream &out_{self.hls_sname()}, ", ",".join([ - # One input HLS stream per head # noqa: Formatting - f" IStream &in{i}_{self.hls_sname()}" for i in range(self.heads) - ]), - f")", - # @formatter:off - ] - - # Generates C++ pragmas to be inserted into the main function of the C++ - # simulation and the ipgen-blackboxfunction as well - def pragmas(self): - # Add HLS interface directives specifying how to create RTL ports for - # the top-level function arguments - self.code_gen_dict["$PRAGMAS$"] = [ - # Connect the output stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}" - ] - # Connect each input stream with an axi stream interface - for i in range(self.heads): - # Add new interface directive for the input stream - self.code_gen_dict["$PRAGMAS$"] += [ - f"#pragma HLS INTERFACE axis port=in{i}_{self.hls_sname()}" - ] - # No block-level I/O protocol for the function return value - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - # Returns the names of input and output interfaces grouped by protocol - def get_verilog_top_module_intf_names(self): - # Start collecting interface names in a dictionary starting with clock - # and reset - intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa - # AXI stream input interfaces - intf_names["s_axis"] = [ - # One input stream per head - (f"in{i}_{self.hls_sname()}", - self.get_instream_width_padded(ind=i)) for i in range(self.heads) - ] - # AXI stream output interfaces - intf_names["m_axis"] = [ - # Just one output stream - (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)), - ] - # No AXI-MM, AXI-Lite or protocol-less interfaces - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - # Return the interface name dictionary - return intf_names diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py 
b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 9eb6793bbc..6ce421a9fb 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -27,6 +27,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls +from finn.custom_op.fpgadataflow.hls.attention_heads_hls import ( + MergeMultiHeads_hls, + SplitMultiHeads_hls, +) +from finn.custom_op.fpgadataflow.hls.attention_hls import ScaledDotProductAttention_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls @@ -43,6 +48,7 @@ from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls +from finn.custom_op.fpgadataflow.hls.replicate_stream_hls import ReplicateStream_hls from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( StreamingDataWidthConverter_hls, ) @@ -80,18 +86,7 @@ custom_op["MVAU_hls"] = MVAU_hls custom_op["VVAU_hls"] = VVAU_hls -from finn.custom_op.fpgadataflow.attention import ( - ScaledDotProductAttention as ScaledDotProductAttention_hls -) -from finn.custom_op.fpgadataflow.attention_heads import ( - SplitMultiHeads as SplitMultiHeads_hls, - MergeMultiHeads as MergeMultiHeads_hls -) -from finn.custom_op.fpgadataflow.replicate_stream import ( - ReplicateStream as ReplicateStream_hls -) - -custom_op["ScaledDotProductAttention_hls"] = ScaledDotProductAttention_hls +custom_op["ScaledDotProductAttention_hls"] = ScaledDotProductAttention_hls custom_op["SplitMultiHeads_hls"] = SplitMultiHeads_hls custom_op["MergeMultiHeads_hls"] = MergeMultiHeads_hls custom_op["ReplicateStream_hls"] = ReplicateStream_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_heads_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_heads_hls.py new file mode 100644 index 0000000000..fe67d04c96 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/attention_heads_hls.py @@ -0,0 +1,490 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np + +# Operating system stuff, e.g. paths +import os + +# Base class for specializing HW operators as implemented via HLS +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +# The generic HW custom operator version of the operator as a base class +from finn.custom_op.fpgadataflow.attention_heads import ( # noqa + MergeMultiHeads, SplitMultiHeads +) + + +# HLS Backend specialization of the multi-head attention splitting operator +class SplitMultiHeads_hls( # noqa: Class name does not follow + # CapWords convention + SplitMultiHeads, HLSBackend +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = SplitMultiHeads.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... 
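To make the specialization pattern above easier to follow, here is a minimal, self-contained sketch of the same idea. The class and attribute names are invented for illustration and are not the real FINN classes: the HLS variant inherits from both the generic HW operator and the HLS backend mix-in, merges their node attribute dictionaries, and is then registered in a custom_op dictionary, mirroring the entries added to hls/__init__.py above.

    class DummyHWOp:
        # Generic HW-level attributes of the (hypothetical) operator
        def get_nodeattr_types(self):
            return {"heads": ("i", True, 1)}

    class DummyHLSBackend:
        # Attributes common to all HLS backend specializations (invented here)
        def get_nodeattr_types(self):
            return {"code_gen_dir_cppsim": ("s", False, "")}

    class DummySplitMultiHeads_hls(DummyHWOp, DummyHLSBackend):
        def get_nodeattr_types(self):
            # Start from the generic operator attributes ...
            attrs = DummyHWOp.get_nodeattr_types(self)
            # ... and add the HLS backend defaults on top
            attrs.update(DummyHLSBackend.get_nodeattr_types(self))
            return attrs

    # Registration mirrors the custom_op dictionary entries in hls/__init__.py
    custom_op = {}
    custom_op["DummySplitMultiHeads_hls"] = DummySplitMultiHeads_hls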
+ # Return the updated attributes dictionary + return attrs + + # Executes multi-head splitting in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node # noqa Duplicate + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the input out of the execution context + # Note: Shape must be either seq x 1 x dim or seq x dim + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "in.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) + # Reshape the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. + i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per head + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.heads)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + self.code_gen_dict["$GLOBALS$"] = [] + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", + # Input and output HLS stream datatypes + "using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + ">;", + "using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + ">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', + ');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input and output 
streams + # Note: Assumes stream type aliases to be set in defines + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # There is one input datastream + f"IStream in_{self.hls_sname()};", + # There is one output datastream per head + *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.heads)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Generates the bit-slicing indices string for the ith split of the + # input + def split(i): + # Assemble a C++ indexing/bit-slicing string + return f"({i + 1} * OPacked::width - 1, {i} * OPacked::width)" + + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Write the body of the head-splitting top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + "#pragma HLS pipeline II=1 style=flp", + # Read the next input element from the stream + f"const auto x = in_{self.hls_sname()}.read();", + # Split the next element from the input stream into the number of + # output elements per head and write into the corresponding stream + *(f"{out(i)}.write(x{split(i)});" for i in range(self.heads)), + # End of for-loop over repetitions body + f"}}" # noqa: f-string symmetry + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C++ simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the # noqa Duplicate + # code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Start collecting function calls to write the output data stream + self.code_gen_dict["$DATAOUTSTREAM$"] = [] + + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Generate code for each output stream + for i in range(self.heads): + # Append each reading/writing function call + self.code_gen_dict["$DATAOUTSTREAM$"] += [ + # Generate function call reading from stream into the output + # file + # Note: Outputs are always represented as numpy floats + 'apintstream2npy(', + f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', + ');' + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. 
actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the head + # splitting operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Input HLS stream + f"IStream &in_{self.hls_sname()}, ", ",".join([ + # One output HLS stream per head # noqa: Formatting + f"OStream &out{i}_{self.hls_sname()}" for i in range(self.heads) + ]), + ")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" + ] + # Connect each output stream with an axi stream interface + for i in range(self.heads): + # Add new interface directive for the output stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=out{i}_{self.hls_sname()}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary # noqa Duplicate + # starting with clock and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # Just one input stream + (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # One output stream per head + (f"out{i}_{self.hls_sname()}", + self.get_outstream_width_padded(ind=i)) for i in range(self.heads) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names + + +# HLS Backend specialization of the multi-head attention merging operator +class MergeMultiHeads_hls( # noqa: Class name does not follow + # CapWords convention + MergeMultiHeads, HLSBackend +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = MergeMultiHeads.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... 
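The split loop generated by docompute above, and the corresponding merge loop of the HLS merging operator further below, operate purely on packed bit vectors. The following standalone Python sketch (widths and values chosen arbitrarily, not taken from the patch) mimics what the generated HLS code does: splitting slices equal-width fields out of one packed input word, merging concatenates one field per head back into a single word, with head 0 occupying the least significant bits.

    def split_packed(x, heads, width):
        # Output i receives bits [i * width, (i + 1) * width) of the packed input
        return [(x >> (i * width)) & ((1 << width) - 1) for i in range(heads)]

    def merge_packed(fields, width):
        # Concatenate the per-head fields again, field 0 in the low bits
        out = 0
        for i, field in enumerate(fields):
            out |= (field & ((1 << width) - 1)) << (i * width)
        return out

    packed = 0xDEADBEEF
    assert split_packed(packed, heads=4, width=8) == [0xEF, 0xBE, 0xAD, 0xDE]
    assert merge_packed(split_packed(packed, 4, 8), width=8) == packed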
+ # Return the updated attributes dictionary + return attrs + + # Executes multi-head slicing in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + + # Enumerate the node outputs + for i, name in enumerate(node.input): + # Get the input out of the execution context + # Note: Shape must be either 1 x seq x dim or seq x dim + inp = context[name] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=i), \ + f"Input shape mismatch for {name}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=i)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, f"in{i}.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, "out.npy")) + # Reshape the folded output and insert into the execution context + context[node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. + i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per head + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.heads)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + +# Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + self.code_gen_dict["$GLOBALS$"] = [] + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", + # Input and output HLS stream datatypes + "using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + ">;", + "using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + ">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [] + # Generate code for each input stream + for i in range(self.heads): + # Append each reading/writing function call + self.code_gen_dict["$READNPYDATA$"] += [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f'"{code_gen_dir}/in{i}.npy", in{i}_{self.hls_sname()}, false', + ');' + ] + + # Generates 
C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input and output streams + # Note: Assumes stream type aliases to be set in defines + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # There is one output stream + f"OStream out_{self.hls_sname()};", + # There is one input stream per head + *(f"IStream in{i}_{self.hls_sname()};" for i in range(self.heads)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + reversed_reads = ", ".join([ + f"in{i}_{self.hls_sname()}.read()" + for i in reversed(range(self.heads)) + ]) + + # Write the body of the head-splitting top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + "#pragma HLS pipeline II=1 style=flp", + # Read the next input element from each input stream and concatenate + # using the comma operator overload of ap_uint, writing into the + # output stream + f"out_{self.hls_sname()}.write(({reversed_reads}));" + # End of for-loop over repetitions body + f"}}" # noqa: f-string symmetry + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Generate function call reading from stream into the output file + # Note: Outputs are always represented as numpy floats + 'apintstream2npy(', + f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', + ');', + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. 
actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the head + # splitting operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Output HLS stream + f"OStream &out_{self.hls_sname()}, ", ",".join([ + # One input HLS stream per head # noqa: Formatting + f"IStream &in{i}_{self.hls_sname()}" for i in range(self.heads) + ]), + ")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the output stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}" + ] + # Connect each input stream with an axi stream interface + for i in range(self.heads): + # Add new interface directive for the input stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=in{i}_{self.hls_sname()}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # One input stream per head + (f"in{i}_{self.hls_sname()}", + self.get_instream_width_padded(ind=i)) for i in range(self.heads) + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # Just one output stream + (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)), + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py new file mode 100644 index 0000000000..43e4c4d7b1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -0,0 +1,669 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np +# Operating system stuff, e.g. 
paths +import os + +# QONNX/FINN datatypes +from qonnx.core.datatype import DataType +# QONNX wrapper to ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper +# Some utils for working with tensors in qonnx +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions + +# The generic HW custom operator version of the operator as a base class +from finn.custom_op.fpgadataflow.attention import ScaledDotProductAttention +# Base class for specializing HW operators as implemented via HLS +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +# Convert and pack (numpy) data for C++ code generation +from finn.util.data_packing import numpy_to_hls_code + + +# HLS Backend specialization of the Scale Dot-product Attention Operator +class ScaledDotProductAttention_hls( # noqa: Class name does not follow + # CapWords convention + ScaledDotProductAttention, HLSBackend +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = ScaledDotProductAttention.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... + # Return the updated attributes dictionary + return attrs + + # Executes the attention operator in C++ mode simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op + node = self.onnx_node + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + + # By convention, inputs 0, 1 and 2 correspond to named inputs q, k and v + + # Read the input from the execution context and reshape to match the + # expected folding + q = context[node.input[0]].reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "q.npy"), q) + + # Read the input from the execution context and reshape to match the + # expected folding + k = context[node.input[1]].reshape(self.get_folded_input_shape(ind=1)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "k.npy"), k) + + # Read the input from the execution context and reshape to match the + # expected folding + v = context[node.input[2]].reshape(self.get_folded_input_shape(ind=2)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "v.npy"), v) + + # Optionally, the mask may be provided as an input as well + if self.get_nodeattr("mask_mode") == "input": + # Read the input from the execution context and reshape to match the + # expected folding + m = context[node.input[3]].reshape( + self.get_folded_input_shape(ind=3) + ) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "m.npy"), m) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, "out.npy")) + # Reshape the folded output and insert into the execution context + context[self.onnx_node.output[0]] = out.reshape( + self.get_normal_output_shape(ind=0) + ) + + # Executes the attention operator in RTL mode simulation + def _execute_node_rtlsim(self, context, graph): # noqa: graph unused + # TODO: Implement rtlsim mode + # Note: Cannot even compile this right now due to missing float ips + raise 
NotImplementedError( + "exec_mode rtlsim is not implemented yet!" + ) + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + i_bits_max = max((self.get_instream_width(ind) for ind in range(3))) + # Find the widths of the widest output + o_bits_max = max((self.get_outstream_width(ind) for ind in range(1))) + # Assume no bits to represent the mask, if there is no mask + m_bits = 0 + # A mask received as input has a bit-width as well + if self.get_nodeattr("mask_mode") in {"input", "const"}: + # Parallelism is the number of elements in the last dimension of the + # folded mask input + _, _, elems = self.get_folded_input_shape(ind=3) + # Get width of the mask datatype + m_bits = elems * DataType[self.get_nodeattr("MType")].bitwidth() + + # Elements per folded key input (second input) + _, _, i_elems = self.get_folded_input_shape(ind=1) + # Elements per folded value input (third input), same as the number of + # output elements + _, _, o_elems = self.get_folded_input_shape(ind=2) + + # Parallelism is the number of elements in the last dimension of the + # folded attention weights + _, _, s_elems = self.get_folded_attention_shape() + # Number of bits used for the attention weights stream + a_bits = s_elems * DataType[self.get_nodeattr("AType")].bitwidth() + + # Maximum bits per tile of the key and value matrix streams + tile_bits_max = max([ + i_elems * s_elems * DataType[self.get_nodeattr("KType")].bitwidth(), + o_elems * s_elems * DataType[self.get_nodeattr("VType")].bitwidth(), + ]) + # Maximum bits per matmul accumulators + acc_bits_max = max([ + # These are not streamed, thus single element width is counted + DataType[self.get_nodeattr("AccQKMatMul")].bitwidth(), + DataType[self.get_nodeattr("AccAVMatMul")].bitwidth(), + ]) + # Maximum bits per matmul outputs + out_bits_max = max([ + # These are the stream widths, which are always >= than individual + # elements + s_elems * DataType[self.get_nodeattr("OutQKMatMul")].bitwidth(), + o_elems * DataType[self.get_nodeattr("OutAVMatMul")].bitwidth(), + ]) + # Aggregate the maximum bit width in both matmul operators over all + # inputs, intermediates and outputs + matmul_bits_max = max([ + tile_bits_max, acc_bits_max, out_bits_max + ]) + + # Find maximum of all (maximal) bit-widths + return max([i_bits_max, o_bits_max, m_bits, a_bits, matmul_bits_max]) + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # FINN HLSLIB activation functions: e.g. PassThroughActivation + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + # Attention operator HLS code + self.code_gen_dict["$GLOBALS$"] += ['#include "attention.hpp"'] + + # Generates C++ parameters file, i.e. activation function thresholds + def generate_params(self, model: ModelWrapper, path): + # The code generation directory is specified as an argument, so this + # will work for both RTL and C++ simulation + code_gen_dir = path + + # Note: The attention operator itself has no weights to be generated as + # a parameter file + + # Start all three activations defaulting to pass-through of the + # accumulator type. + # Note: This might allow type-casts to the output types if they are + # not the same as the accumulators. 
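+        # Note: The activation type strings and threshold initializer strings
+        # set up below are spliced into params.hpp at the end of this method;
+        # they are only replaced by ThresholdsActivation variants if the
+        # corresponding Act* node attribute requests thresholding.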
+ act_qk_matmul = "PassThroughActivation" + act_av_matmul = "PassThroughActivation" + act_a_softmax = "PassThroughActivation" + + # Start all thresholds defaulting to empty default initializer braces + thresholds_qk_matmul = "{}" + thresholds_av_matmul = "{}" + thresholds_a_softmax = "{}" + + # Prepares a threshold tensor as C++ string for code generation + def prepare_thresholds(ts, length, fold, dtype): + # Number of thresholds is given as the last dimension of the + # threshold tensor, first dimension is covering all output elements + num = ts.shape[-1] # noqa + # Partition the thresholds along the length into folds of parallel + # elements + ts = interleave_matrix_outer_dim_from_partitions(ts, length // fold) + # Reshape folded thresholds adding an outer dimension + # TODO: Why? MVAU does this, just copied the behavior. This is + # probably to generate the outer C++ initializer braces {} for + # object construction. Isn't it weird to rely on an artificial + # dimension just to have the code generator produce the correct + # string? + ts = ts.reshape(1, length // fold, fold, num) + # Format the thresholds as C++ array code + # Note: no packing, no variable name/type declaration + return numpy_to_hls_code(ts, dtype, "_", False, True), num + + # Get shape and folding configuration. None of the activations fold + # along the query-key embedding dimension or the query sequence length + (_, _, vdim, kvlen), (embfold, seqfold) = self.shapes, self.folds + + # Query-key matmul can have an optional activation function set to + # thresholding activations via node attribute + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # In this case there will be a thresholds parameter initializer + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_qk_matmul") + ) + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccQKMatMul")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActQKMatMul") + # No support for floating-point bias + assert int(bias) == bias, "BiasActQKMatMul must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) + # Format the thresholds as C++ array code: QK matmul outputs fold + # along the key-value sequence length dimension + thresholds_qk_matmul, num = prepare_thresholds( + thresholds, kvlen, seqfold, thresholds_dtype + ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() + # Replace default pass-through activation by thresholding activation + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_qk_matmul = "\n".join([ + "ThresholdsActivation<", + " SeqFold," + " KVLen/SeqFold," + f" {num}," + " AccQKMatMul," + " OutQKMatMul," + f" {bias}," + # Note: Not sure why the default comp::less does not work... 
+ f" comp::less_equal<{dtype_str}, {dtype_str}>", + ">" + ]) + + # Softmax can have an optional activation function set to thresholding + # activations via node attribute + if self.get_nodeattr("ActASoftmax") == "thresholds": + # In this case there will be a thresholds parameter initializer + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_a_softmax") + ) + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccASoftmax")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActASoftmax") + # No support for floating-point bias + assert int(bias) == bias, "BiasActASoftmax must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) + # Format the thresholds as C++ array code: Softmax outputs fold + # along the key-value sequence length dimension + thresholds_a_softmax, num = prepare_thresholds( + thresholds, kvlen, seqfold, thresholds_dtype + ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() + # Replace default pass-through activation by thresholding activation + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_a_softmax = "\n".join([ + "ThresholdsActivation<", + " SeqFold," + " KVLen/SeqFold," + f" {num}," + " AccASoftmax," + " AType," + f" {bias}," + # Note: Not sure why the default comp::less does not work... + f" comp::less_equal<{dtype_str}, {dtype_str}>", + ">" + ]) + + # Attention-value matmul can have an optional activation function set to + # thresholding activations via node attribute + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # In this case there will be a thresholds parameter initializer + thresholds = model.get_initializer( + self.get_input_name_by_name("thresholds_av_matmul") + ) + # Get the datatype of the thresholds + thresholds_dtype = DataType[self.get_nodeattr("AccAVMatMul")] + # Activation value, i.e., bias applied after thresholding activation + bias = self.get_nodeattr("BiasActAVMatMul") + # No support for floating-point bias + assert int(bias) == bias, "BiasActAVMatMul must be integer" + # Convert the bias to integer representation, so it can be used as a + # template argument + bias = int(bias) + # Format the thresholds as C++ array code: AV matmul outputs fold + # along the value embedding dimension + thresholds_av_matmul, num = prepare_thresholds( + thresholds, vdim, embfold, thresholds_dtype + ) + # Get the HLS datatype string corresponding to the thresholds + # datatype for C++ code generation + dtype_str = thresholds_dtype.get_hls_datatype_str() + # Replace default pass-through activation by thresholding activation + # Note: Relies on type and shape definitions generated by the + # "defines" method + act_av_matmul = "\n".join([ + "ThresholdsActivation<", + " EmbFold," + " VDim/EmbFold," + f" {num}," + " AccAVMatMul," + " OutAVMatMul," + f" {bias}," + # Note: Not sure why the default comp::less does not work... 
+ f" comp::less_equal<{dtype_str}, {dtype_str}>", + ">" + ]) + + # Assume no attention mask as a default: Generate C++ code of tag + # instance of "none" mask type + attention_mask = \ + "static const auto attention_mask = attention::mask::NONE" + + # If a causal mask is specified, set the appropriate tag dispatching + # instance + if self.get_nodeattr("mask_mode") == "causal": + # Generate C++ code of tag instance of causal mask type + attention_mask = \ + "static const auto attention_mask = attention::mask::CAUSAL" + + # If a constant mask is specified, array code needs to be generated + if self.get_nodeattr("mask_mode") == "const": + # Attention mask type of folded constant mask array + mask_type = "attention::mask::Const" + # Get the constant mask values + mask = model.get_initializer(self.get_input_name_by_name("M")) + # Num should always be equal to QLen + num = mask.shape[-1] + # Partition the mask along the length into folds of parallel + # elements + mask = interleave_matrix_outer_dim_from_partitions( + mask, kvlen // seqfold + ) + # Reshape folded mask adding an outer dimension + mask = mask.reshape(num, kvlen // seqfold, seqfold).squeeze() + # Format the mask as C++ array code + # Note: no packing, no variable name/type declaration + mask = numpy_to_hls_code(mask, DataType["BINARY"], "_", False, True) + # Generate C++ code initializing the constant mask array + attention_mask = f"static const {mask_type} attention_mask = {mask}" + + # Of a mask is provided as input, no object parameters need to be + # generated here + if self.get_nodeattr("mask_mode") == "input": + # Attention mask type of input stream + mask_type = "attention::mask::Input" + # Generate C++ code creating an input stream instance for the mask + # Note: This is just a dummy, the real input stream will be part + # of the operator interface + attention_mask = f"static const {mask_type} attention_mask;" + + # Open a file to store the thresholds parameters as C++ code + with open(f"{code_gen_dir}/params.hpp", "w") as file: + # Write lines of C++ code separated by newlines to the file + file.write("\n".join([ + # Scale factor preceding the softmax activation function to + # dequantize the input to floating-point representation + "static const float dequant_softmax =" + f" {self.get_nodeattr('DequantSoftmax')};", + # Attention mask parameters if "none", "causal" or "const" + f"{attention_mask};", + # Type alias to the generated attention mask for convenience + "using AttentionMask = decltype(attention_mask);", + # Add type definition and threshold initialization of the + # query-key matmul activation + f"using ActQKMatMul = {act_qk_matmul};", + f"ActQKMatMul act_qk_matmul = {thresholds_qk_matmul};", + # Add type definition and threshold initialization of the + # attention-value matmul activation + f"using ActAVMatMul = {act_av_matmul};", + f"ActAVMatMul act_av_matmul = {thresholds_av_matmul};", + # Add type definition and threshold initialization of the + # softmax activation + f"using ActASoftmax = {act_a_softmax};", + f"ActASoftmax act_a_softmax = {thresholds_a_softmax};", + # Append a newline at the end of the file (to avoid problems + # when including, required by C standard?) 
+ "\n" + ])) + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Generate shape definitions from attributes to C++ constant definitions + def shapedefs(*names): + # C++ qualified type to be used for shape constants + shape = "static constexpr std::size_t" + # Generate a C++ constant definition for each of the attributes + # given by argument list names + return ( + f"{shape} {name} = {self.get_nodeattr(name)};" for name in names + ) + + # Generate datatype definitions mapping from QONNX DataType to HLS type + def typedefs(*names): + # Gets the HLS type string for the datatype specified by the named + # attribute + def hls_type(name): + # Looks up the datatype specified for the attribute and + # translates from QONNX to HLS type + return DataType[self.get_nodeattr(name)].get_hls_datatype_str() + + # Generate a C++ type alias definition for each of the attributes + # given by argument list names + return (f"using {name} = {hls_type(name)};" for name in names) + + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Shape constant definitions of attention inputs (query, key and + # value) and folding configuration + *shapedefs( + "QKDim", + "QLen", + "VDim", + "KVLen", + "EmbFold", + "SeqFold" + ), + # Type alias definitions for all input, output and intermediate + # datatypes + *typedefs( + "QType", + "KType", + "VType", + "MType", + "AType", + "OType" + ), + # Type alias definitions for the matmul accumulators and output + # datatypes + *typedefs( + "AccQKMatMul", + "OutQKMatMul", + "AccAVMatMul", + "OutAVMatMul", + "AccASoftmax" + ), + # Include the activation function type definitions and parameters + # Note: The typedefs in this header require the typedefs above, + # thus adding this to the global includes is not possible. + '#include "params.hpp"', + # Type alias of the properly configured attention operator class + "using Attention = ScaledDotProductAttention<", + " QKDim,", + " QLen,", + " VDim,", + " KVLen,", + " EmbFold,", + " SeqFold,", + " QType,", + " KType,", + " VType,", + " MType,", + " AType,", + " OType,", # Note: OType and last MatMul out must match + " AccQKMatMul,", + " OutQKMatMul,", + " ActQKMatMul,", + " AccAVMatMul,", + " OType,", # Note: OType and last MatMul out must match + " ActAVMatMul,", + " ActASoftmax", + ">;", + # Short type aliases of attention input and output streams + "using QStream = Attention::QStream;", + "using KStream = Attention::KStream;", + "using VStream = Attention::VStream;", + "using OStream = Attention::OStream;", + "using MStream = Attention::MStream;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Deduce the datatype of elements packed into the query input stream + # TODO: Maybe these type-deductions can be removed by changing the + # order of the template arguments of the npy2apintstream, such + # that type-deduction is handled there? 
+ 'using QPacked = decltype(QStream{}.read());', + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f' "{code_gen_dir}/q.npy", q_{self.hls_sname()}, false', + ');', + + # Deduce the datatype of elements packed into the key input stream + 'using KPacked = decltype(KStream{}.read());', + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f' "{code_gen_dir}/k.npy", k_{self.hls_sname()}, false', + ');', + + # Deduce the datatype of elements packed into the value input stream + 'using VPacked = decltype(VStream{}.read());', + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f' "{code_gen_dir}/v.npy", v_{self.hls_sname()}, false', + ');', + ] + + # If the mask is provided as an input, it needs to be read as well + if self.get_nodeattr("mask_mode") == "input": + # Generate function call for reading the mask file into the input + # stream + self.code_gen_dict["$READNPYDATA$"] += [ + # Deduce the datatype of elements packed into the mask input + # stream + 'using MPacked = decltype(MStream{}.read());', + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f' "{code_gen_dir}/m.npy", m_{self.hls_sname()}, false', + ');', + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input (query, key, value) and output streams + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # Note: Assumes stream type aliases to be set in defines + f"QStream q_{self.hls_sname()};", + f"KStream k_{self.hls_sname()};", + f"VStream v_{self.hls_sname()};", + f"OStream out_{self.hls_sname()};" + ] + # If the mask is provided as an input, it needs a stream declaration as + # well + if self.get_nodeattr("mask_mode") == "input": + # Append the mask stream to the declaration list + self.code_gen_dict["$STREAMDECLARATIONS$"] += [ + # Note: Assumes stream type aliases to be set in defines + f"MStream m_{self.hls_sname()};", + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Write the body of the attention top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Instantiate the attention operator and connect to the generated + # threshold parameters + # Note: Assumes "Attention" to be aliased appropriate configuration + # in defines with. + # Note: Assumes parameters to be generated in 'generate_params' and + # made available via include/defines before. 
+ "Attention attention {", + " act_qk_matmul, act_av_matmul, act_a_softmax, dequant_softmax", + "};", + # Connect the attention operator to the input and output streams + "attention(" + f"q_{self.hls_sname()}, " + f"k_{self.hls_sname()}, " + f"v_{self.hls_sname()}, " + f"out_{self.hls_sname()}, " + # TODO: Does not work for "input" mode mask + "attention_mask" + ");", + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C** simulation + def dataoutstrm(self): + # Output data will be stored in numpy files in the code generation + # dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Generate function call for reading from the output stream into the + # output file + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + # Deduce the datatype of elements packed into the output stream + 'using OPacked = decltype(OStream{}.read());', + # Generate function call reading from stream into the output file + # Note: Outputs are always represented as numpy floats + 'apintstream2npy(', + f'out_{self.hls_sname()}, {shape}, "{code_gen_dir}/out.npy", false', + ');', + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. 
actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the + # attention operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + f" QStream &q_{self.hls_sname()}," + f" KStream &k_{self.hls_sname()}," + f" VStream &v_{self.hls_sname()}," + f" OStream &out_{self.hls_sname()}", + ")", + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the query input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=q_{self.hls_sname()}", + # Connect the key input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=k_{self.hls_sname()}", + # Connect the value input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=v_{self.hls_sname()}", + # Connect the output stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=out_{self.hls_sname()}", + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary starting with clock + # and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + (f"q_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + (f"k_{self.hls_sname()}", self.get_instream_width_padded(ind=1)), + (f"v_{self.hls_sname()}", self.get_instream_width_padded(ind=2)) + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + (f"out_{self.hls_sname()}", self.get_outstream_width_padded(ind=0)) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py b/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py new file mode 100644 index 0000000000..8a86e6aebc --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py @@ -0,0 +1,249 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np +# Operating system stuff, e.g. 
paths +import os + +# Base class for specializing HW operators as implemented via HLS +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +# The generic HW custom operator version of the operator as a base class +from finn.custom_op.fpgadataflow.replicate_stream import ReplicateStream + + +# HLS Backend specialization of the stream-replication operator +class ReplicateStream_hls( # noqa: Class name does not follow + # CapWords convention + ReplicateStream, HLSBackend +): + # Node attributes matching the HLS operator + def get_nodeattr_types(self): + # Start from parent operator class attributes + attrs = ReplicateStream.get_nodeattr_types(self) + # Add the HLSBackend default attributes on top + attrs.update(HLSBackend.get_nodeattr_types(self)) + # Add/Specialize implementation specific attributes here... + # Return the updated attributes dictionary + return attrs + + # Executes replicating inputs in C++ simulation + def _execute_node_cppsim(self, context, graph): # noqa: graph unused + # Get the node wrapped by this custom op # noqa Duplicate + node = self.onnx_node # noqa Duplicate + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the input out of the execution context + inp = context[node.input[0]] + # Validate the shape of the input + assert inp.shape == self.get_normal_input_shape(ind=0), \ + f"Input shape mismatch for {node.input[0]}" + # Reshape the input into folded form + inp = inp.reshape(self.get_folded_input_shape(ind=0)) + # Save the folded inputs to file to be used by simulation + np.save(os.path.join(code_gen_dir, "in.npy"), inp) + + # Execute the precompiled model + super().exec_precompiled_singlenode_model() + + # Enumerate the node outputs + for i, name in enumerate(node.output): + # Load the output numpy file generated by the C++ simulation + out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) + # Reshape the folded output and insert into the execution context + context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + + # Maximum width of any ap_int used in this operator + def get_ap_int_max_w(self): + # Find the widths of the widest input + # Note: There is just one input. 
+ i_bits_max = self.get_instream_width(ind=0) + # Find the widths of the widest output + # Note: there is one output per replica + o_bits_max = max( + (self.get_outstream_width(ind) for ind in range(self.num)) + ) + # Find the biggest of the inputs/outputs + return max([i_bits_max, o_bits_max]) + + # Note: End of shape and datatype utilities + + # Generates list of C++ includes to be placed at the top of the generated + # code + def global_includes(self): + # Currently nothing to include + self.code_gen_dict["$GLOBALS$"] = [] + + # Generates C++ code of type alias, global constant and macro definitions + def defines(self, var): + # Insert constants and type aliases into the dictionary + self.code_gen_dict["$DEFINES$"] = [ + # Input and output element datatypes + f"using IType = {self.dtype.get_hls_datatype_str()};", + f"using OType = {self.dtype.get_hls_datatype_str()};", + # Width of single elements to avoid using ::width attribute which is + # not present for datatype float + f"static constexpr auto ElemWidth = {self.dtype.bitwidth()};" + # Datatype of elements packed into the input stream + f"using IPacked = ap_uint<{self.get_instream_width()}>;", + # Datatype of elements packed into the output stream + f"using OPacked = ap_uint<{self.get_outstream_width()}>;", + # Input and output HLS stream datatypes + "using IStream = hls::stream<" + f" ap_uint<{self.get_instream_width()}>" + ">;", + "using OStream = hls::stream<" + f" ap_uint<{self.get_outstream_width()}>" + ">;", + ] + + # Generates C++ code for reading data from .npy (numpy format) for testing + # in C++ simulation + def read_npy_data(self): + # Input data is stored in numpy files in the code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Generate function calls for reading the input files into the input + # streams + self.code_gen_dict["$READNPYDATA$"] = [ + # Generate function call reading from file into the input stream + # Note: Inputs are always represented as numpy floats + 'npy2apintstream(', + f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', + ');' + ] + + # Generates C++ code for declaring all streams involved in C++ simulation + # for testing + def strm_decl(self): + # Declare input and output streams + # Note: Assumes stream type aliases to be set in defines + self.code_gen_dict["$STREAMDECLARATIONS$"] = [ + # There is one input datastream + f"IStream in_{self.hls_sname()};", + # There is one output datastream per replica + *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.num)) + ] + + # Generates C++ code for calling the computation part of the operator + def docompute(self): + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Write the body of the stream replicating top-level function + self.code_gen_dict["$DOCOMPUTE$"] = [ + # Repeat for the number of inputs + # Note: Repeat for all num_inputs dimensions + f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + # Pipeline the steps of this loop + "#pragma HLS pipeline II=1 style=flp", + # Read the next input element from the stream + f"const auto x = in_{self.hls_sname()}.read();", + # Write the same input element into each output stream + *(f"{out(i)}.write(x);" for i in range(self.num)), + # End of for-loop over repetitions body + f"}}" # noqa: f-string symmetry + ] + + # Generates C++ code for reading the output stream and converting back to + # numpy format for testing in C++ simulation + def dataoutstrm(self): + # Output data will be stored 
in numpy files in the # noqa Duplicate + # code generation dictionary + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # Get the expected shape of the folded output array formatted as a C++ + # vector initializer + # Note: Valid formatting relies on correct placement of curly braces + # and line breaks: Open/close all three braces on the same line of code + # to avoid '\n' to be inserted into the string + shape = f"""{{{ + ','.join((str(i) for i in self.get_folded_output_shape())) + }}}""" + # Start collecting function calls to write the output data stream + self.code_gen_dict["$DATAOUTSTREAM$"] = [] + + # Generates the name of the ith output stream + def out(i): + return f"out{i}_{self.hls_sname()}" + + # Generate code for each output stream + for i in range(self.num): + # Append each reading/writing function call + self.code_gen_dict["$DATAOUTSTREAM$"] += [ + # Generate function call reading from stream into the output + # file + # Note: Outputs are always represented as numpy floats + 'apintstream2npy(', + f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', + ');' + ] + + # Generates C++ code for saving the output of C++ simulation to a file in + # numpy format + def save_as_npy(self): + # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used + # for something before, which is now integrated into dataoutstrm()? + self.code_gen_dict["$SAVEASCNPY$"] = [] + + # Generates essentially the head of the C++ function from which the IP block + # will be generated during ipgen, i.e. actual synthesis + def blackboxfunction(self): + # Insert function head describing the top level interface of the stream + # replicating operator + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + # @formatter:off Prevent Python formatter from messing with C++ + # formatting + # Note: Assumes stream type aliases to be set in defines + f"void {self.onnx_node.name} (", + # Input HLS stream + f"IStream &in_{self.hls_sname()}, ", ",".join([ + # One output HLS stream per replica # noqa: Formatting + f"OStream &out{i}_{self.hls_sname()}" for i in range(self.num) + ]), + ")", + # @formatter:off + ] + + # Generates C++ pragmas to be inserted into the main function of the C++ + # simulation and the ipgen-blackboxfunction as well + def pragmas(self): + # Add HLS interface directives specifying how to create RTL ports for + # the top-level function arguments + self.code_gen_dict["$PRAGMAS$"] = [ + # Connect the input stream with an axi stream interface + f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" + ] + # Connect each output stream with an axi stream interface + for i in range(self.num): + # Add new interface directive for the output stream + self.code_gen_dict["$PRAGMAS$"] += [ + f"#pragma HLS INTERFACE axis port=out{i}_{self.hls_sname()}" + ] + # No block-level I/O protocol for the function return value + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE ap_ctrl_none port=return" + ) + + # Returns the names of input and output interfaces grouped by protocol + def get_verilog_top_module_intf_names(self): + # Start collecting interface names in a dictionary # noqa Duplicate + # starting with clock and reset + intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa + # AXI stream input interfaces + intf_names["s_axis"] = [ + # Just one input stream + (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), + ] + # AXI stream output interfaces + intf_names["m_axis"] = [ + # One output stream per replica + (f"out{i}_{self.hls_sname()}", + 
self.get_outstream_width_padded(ind=i)) for i in range(self.num) + ] + # No AXI-MM, AXI-Lite or protocol-less interfaces + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + # Return the interface name dictionary + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index aaba72757b..37709ca24b 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -1,30 +1,30 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Numpy math and arrays +import numpy as np # Operating system stuff, e.g. paths import os # Python warning subsystem import warnings -# Numpy math and arrays -import numpy as np # Helper for creating ONNX nodes -from onnx import helper as oh # noqa +from onnx import helper as oh # QONNX/FINN datatypes -from qonnx.core.datatype import DataType # noqa qonnx dependency is specified -# in setup.cfg as well as in fetch-repos.sh +from qonnx.core.datatype import DataType # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa qonnx - -# Converts inputs/outputs to/from RTL simulation format -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from qonnx.core.modelwrapper import ModelWrapper # Derive custom operators form the FINN base custom op from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -# Specialize the custom op as HLS backend implementation -from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +# Converts inputs/outputs to/from RTL simulation format +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # Replicates an input stream to arbitrary many output streams # See DuplicateStreams_Batch for feeding exactly two streams -class ReplicateStream(HWCustomOp, HLSBackend): +class ReplicateStream(HWCustomOp): # Initializes the operator given an onnx graph node def __init__(self, onnx_node, **kwargs): # Just forward all arguments to the init method of the CustomOp base @@ -40,12 +40,8 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): # Start from parent operator class attributes attrs = HWCustomOp.get_nodeattr_types(self) - attrs.update(HLSBackend.get_nodeattr_types(self)) # Update attributes dictionary for new custom operator attrs.update({ - # Force implementation style to HLS backend - "preferred_impl_style": ("s", False, "hls", {"", "hls"}), - # Number of replicas to produce "num": ("i", True, 1), # Data type of input and output elements @@ -138,29 +134,10 @@ def _execute_node_python(self, context, graph): # noqa: graph unused # Executes replicating inputs in C++ simulation def _execute_node_cppsim(self, context, graph): # noqa: graph unused - # Get the node wrapped by this custom op # noqa Duplicate - node = self.onnx_node - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the input out of the execution context - inp = context[node.input[0]] - # Validate the shape of the input - assert inp.shape == self.get_normal_input_shape(ind=0), \ - f"Input shape mismatch for {node.input[0]}" - # Reshape the input into folded form - inp = inp.reshape(self.get_folded_input_shape(ind=0)) - # Save the folded inputs to file to be used by simulation - 
np.save(os.path.join(code_gen_dir, f"in.npy"), inp) - - # Execute the precompiled model - super().exec_precompiled_singlenode_model() - - # Enumerate the node outputs - for i, name in enumerate(node.output): - # Load the output numpy file generated by the C++ simulation - out = np.load(os.path.join(code_gen_dir, f"out{i}.npy")) - # Reshape the folded output and insert into the execution context - context[name] = out.reshape(self.get_normal_output_shape(ind=i)) + # C++ Simulation needs to be implemented in HLS backend specialization + raise NotImplementedError( + f"exec_mode cppsim of {self.__class__.__name__} is not implemented!" + ) # Executes replicating inputs in RTL simulation def _execute_node_rtlsim(self, context, graph): # noqa: graph unused @@ -235,7 +212,7 @@ def verify_node(self): # TODO: Implement return [] - # Note: End of QONNX CustomOp region, below is FINN HLSCustomOp stuff + # Note: End of QONNX CustomOp region, below is FINN HWCustomOp stuff # Gets the datatype of input at index ind def get_input_datatype(self, ind=0): @@ -289,19 +266,6 @@ def get_outstream_width(self, ind=0): # Width of a stream producing output elements in parallel return elems * o_bits - # Maximum width of any ap_int used in this operator - def get_ap_int_max_w(self): - # Find the widths of the widest input - # Note: There is just one input. - i_bits_max = self.get_instream_width(ind=0) - # Find the widths of the widest output - # Note: there is one output per replica - o_bits_max = max( - (self.get_outstream_width(ind) for ind in range(self.num)) - ) - # Find the biggest of the inputs/outputs - return max([i_bits_max, o_bits_max]) - # Gets the number of expected output values, i.e. how many times read() # could/should be called on any output stream of this operator def get_number_output_values(self): @@ -311,184 +275,3 @@ def get_number_output_values(self): # outputs, i.e., producing N replica outputs per cycle in parallel, # count N outputs per cycle... 
return np.prod(self.get_folded_output_shape()[:-1]) * self.num - - # Note: End of shape and datatype utilities - - # Generates list of C++ includes to be placed at the top of the generated - # code - def global_includes(self): - # Currently nothing to include - self.code_gen_dict["$GLOBALS$"] = [] - - # Generates C++ code of type alias, global constant and macro definitions - def defines(self, var): - # Insert constants and type aliases into the dictionary - self.code_gen_dict["$DEFINES$"] = [ - # Input and output element datatypes - f"using IType = {self.dtype.get_hls_datatype_str()};", - f"using OType = {self.dtype.get_hls_datatype_str()};", - # Width of single elements to avoid using ::width attribute which is - # not present for datatype float - f"static constexpr auto ElemWidth = {self.dtype.bitwidth()};" - # Datatype of elements packed into the input stream - f"using IPacked = ap_uint<{self.get_instream_width()}>;", - # Datatype of elements packed into the output stream - f"using OPacked = ap_uint<{self.get_outstream_width()}>;", - # Input and output HLS stream datatypes - f"using IStream = hls::stream<" - f" ap_uint<{self.get_instream_width()}>" - f">;", - f"using OStream = hls::stream<" - f" ap_uint<{self.get_outstream_width()}>" - f">;", - ] - - # Generates C++ code for reading data from .npy (numpy format) for testing - # in C++ simulation - def read_npy_data(self): - # Input data is stored in numpy files in the code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Generate function calls for reading the input files into the input - # streams - self.code_gen_dict["$READNPYDATA$"] = [ - # Generate function call reading from file into the input stream - # Note: Inputs are always represented as numpy floats - f'npy2apintstream(', - f'"{code_gen_dir}/in.npy", in_{self.hls_sname()}, false', - f');' - ] - - # Generates C++ code for declaring all streams involved in C++ simulation - # for testing - def strm_decl(self): - # Declare input and output streams - # Note: Assumes stream type aliases to be set in defines - self.code_gen_dict["$STREAMDECLARATIONS$"] = [ - # There is one input datastream - f"IStream in_{self.hls_sname()};", - # There is one output datastream per replica - *(f"OStream out{i}_{self.hls_sname()};" for i in range(self.num)) - ] - - # Generates C++ code for calling the computation part of the operator - def docompute(self): - # Generates the name of the ith output stream - def out(i): - return f"out{i}_{self.hls_sname()}" - - # Write the body of the stream replicating top-level function - self.code_gen_dict["$DOCOMPUTE$"] = [ - # Repeat for the number of inputs - # Note: Repeat for all num_inputs dimensions - f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", - # Pipeline the steps of this loop - f"#pragma HLS pipeline II=1 style=flp", - # Read the next input element from the stream - f"const auto x = in_{self.hls_sname()}.read();", - # Write the same input element into each output stream - *(f"{out(i)}.write(x);" for i in range(self.num)), - # End of for-loop over repetitions body - f"}}" - ] - - # Generates C++ code for reading the output stream and converting back to - # numpy format for testing in C++ simulation - def dataoutstrm(self): - # Output data will be stored in numpy files in the # noqa Duplicate - # code generation dictionary - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - # Get the expected shape of the folded output array formatted as a C++ - # vector initializer - # Note: Valid formatting 
relies on correct placement of curly braces - # and line breaks: Open/close all three braces on the same line of code - # to avoid '\n' to be inserted into the string - shape = f"""{{{ - ','.join((str(i) for i in self.get_folded_output_shape())) - }}}""" - # Start collecting function calls to write the output data stream - self.code_gen_dict["$DATAOUTSTREAM$"] = [] - - # Generates the name of the ith output stream - def out(i): - return f"out{i}_{self.hls_sname()}" - - # Generate code for each output stream - for i in range(self.num): - # Append each reading/writing function call - self.code_gen_dict["$DATAOUTSTREAM$"] += [ - # Generate function call reading from stream into the output - # file - # Note: Outputs are always represented as numpy floats - f'apintstream2npy(', - f'{out(i)}, {shape}, "{code_gen_dir}/out{i}.npy", false', - f');' - ] - - # Generates C++ code for saving the output of C++ simulation to a file in - # numpy format - def save_as_npy(self): - # Note: This seems to be empty in ALL HLSCustomOps. Probably it was used - # for something before, which is now integrated into dataoutstrm()? - self.code_gen_dict["$SAVEASCNPY$"] = [] - - # Generates essentially the head of the C++ function from which the IP block - # will be generated during ipgen, i.e. actual synthesis - def blackboxfunction(self): - # Insert function head describing the top level interface of the stream - # replicating operator - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - # @formatter:off Prevent Python formatter from messing with C++ - # formatting - # Note: Assumes stream type aliases to be set in defines - f"void {self.onnx_node.name} (", - # Input HLS stream - f" IStream &in_{self.hls_sname()}, ", ",".join([ - # One output HLS stream per replica # noqa: Formatting - f" OStream &out{i}_{self.hls_sname()}" for i in range(self.num) - ]), - f")", - # @formatter:off - ] - - # Generates C++ pragmas to be inserted into the main function of the C++ - # simulation and the ipgen-blackboxfunction as well - def pragmas(self): - # Add HLS interface directives specifying how to create RTL ports for - # the top-level function arguments - self.code_gen_dict["$PRAGMAS$"] = [ - # Connect the input stream with an axi stream interface - f"#pragma HLS INTERFACE axis port=in_{self.hls_sname()}" - ] - # Connect each output stream with an axi stream interface - for i in range(self.num): - # Add new interface directive for the output stream - self.code_gen_dict["$PRAGMAS$"] += [ - f"#pragma HLS INTERFACE axis port=out{i}_{self.hls_sname()}" - ] - # No block-level I/O protocol for the function return value - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - # Returns the names of input and output interfaces grouped by protocol - def get_verilog_top_module_intf_names(self): - # Start collecting interface names in a dictionary # noqa Duplicate - # starting with clock and reset - intf_names = {"clk": ["ap_clk"], "rst": ["ap_rst_n"]} # noqa - # AXI stream input interfaces - intf_names["s_axis"] = [ - # Just one input stream - (f"in_{self.hls_sname()}", self.get_instream_width_padded(ind=0)), - ] - # AXI stream output interfaces - intf_names["m_axis"] = [ - # One output stream per replica - (f"out{i}_{self.hls_sname()}", - self.get_outstream_width_padded(ind=i)) for i in range(self.num) - ] - # No AXI-MM, AXI-Lite or protocol-less interfaces - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - # Return the interface name dictionary - return intf_names 
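
Minimal usage sketch (not part of the patch series): with the HLS-specific code
generation moved into the *_hls backend classes above, a model built from the
generic HW operators has to be specialized before C++ simulation. The
hypothetical helper below mirrors the test changes in the following patch; the
FPGA part string is the one used by those tests, and `model` is assumed to be a
QONNX ModelWrapper containing only HW-layer custom ops.

from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.general import GiveUniqueNodeNames

from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers


def specialize_and_prepare_cppsim(model, part="xczu7ev-ffvc1156-2-e"):
    # Request the HLS implementation style for every node so that
    # SpecializeLayers replaces it by the corresponding *_hls operator
    for node in model.graph.node:
        getCustomOp(node).set_nodeattr("preferred_impl_style", "hls")
    # Specialize the HW layers to their HLS backend implementations
    model = model.transform(SpecializeLayers(part))
    # Prepare and compile the C++ simulation of the specialized model
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(SetExecMode("cppsim"))
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    return model
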
From 5f1ee21a8f353b9597118c984c5e1961578554c5 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 4 Apr 2024 18:04:41 +0200 Subject: [PATCH 59/88] [Refactor] Fix ScaledDotProductAttention C++ simulation test --- .../test_fpgadataflow_attention.py | 89 +++++++++++-------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_attention.py b/tests/fpgadataflow/test_fpgadataflow_attention.py index 38b3737f04..a5c7ac0d15 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention.py @@ -1,43 +1,47 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + # Testing framework -import pytest # noqa pytest dependecy is listed in setup.cfg +import pytest +# Use numpy for python execution / computing the ground truth expected values +import numpy as np + # Automatically generate init, repr, ... for classes containing a lot of # attributes from dataclasses import dataclass -# Use numpy for python execution / computing the ground truth expected values -import numpy as np - # Utility types and function for creating onnx nodes and graphs from onnx import TensorProto, helper -# QONNX utility for generating random input data for testing and for creating -# models -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model # noqa qonnx -# dependency is specified in setup.cfg as well as in fetch-repos.sh # QONNX datatypes -from qonnx.core.datatype import ( # noqa - DataType, IntType, FloatType, BaseDataType -) +from qonnx.core.datatype import BaseDataType, DataType, FloatType, IntType # Wrapper around ONNX model with some graph manipulation utility -from qonnx.core.modelwrapper import ModelWrapper # noqa +from qonnx.core.modelwrapper import ModelWrapper # Execute onnx model graphs -from qonnx.core.onnx_exec import execute_onnx # noqa -# Graph transformation giving unique names to each node in a QONNX model graph -from qonnx.transformation.general import GiveUniqueNodeNames # noqa +from qonnx.core.onnx_exec import execute_onnx # Multithreshold activations -from qonnx.custom_op.general.multithreshold import multithreshold # noqa +from qonnx.custom_op.general.multithreshold import multithreshold +# Registry of all QONNX CustomOps +from qonnx.custom_op.registry import getCustomOp +# Graph transformation giving unique names to each node in a QONNX model graph +from qonnx.transformation.general import GiveUniqueNodeNames +# QONNX utility for generating random input data for testing and for creating +# models +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model -# FINN graph transformations for preparing simulation (cppsim or rtlsim) -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +# Softmax function on numpy arrays with overflow handling matching the HLS +# operator +from finn.custom_op.fpgadataflow.attention import softmax from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -# Softmax function on numpy arrays with 
overflow handling matching the HLS -# operator -from finn.custom_op.fpgadataflow.attention import softmax +# FINN graph transformations for preparing simulation (cppsim or rtlsim) +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers # Python/Numpy model of the scaled dot-product attention operator as it is (will @@ -222,8 +226,8 @@ def make_modelwrapper(self): # Note: Order matters... thresholds = [ "thresholds_qk_matmul", + "thresholds_a_softmax", "thresholds_av_matmul", - "thresholds_a_softmax" ] # Build up the node attribute dictionary kwargs = { @@ -368,9 +372,9 @@ def test_attention_cppsim( AType=AType, OType=OType, # Accumulator type configuration - AccQKMatMul=DataType["UINT22"], + AccQKMatMul=DataType["UINT32"], OutQKMatMul=DataType["UINT8"], - AccAVMatMul=DataType["UINT22"], + AccAVMatMul=DataType["UINT32"], OutAVMatMul=OType, # Dequantizer scale, factor to convert the whole UINT8 range to floats # in range 0.0 to 1.0 @@ -385,6 +389,17 @@ def test_attention_cppsim( context = { "Q": q, "K": k, "V": v, "mask": mask } + + # Mark all nodes to be specialized as HLS backend implementations + for node in model.graph.node: + # Get the CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") + # Turn all HWCustomOp layers into HLS specializations + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) + # Set model execution mode to C++ simulation model = model.transform(SetExecMode("cppsim")) # Generates the C++ source and compiles the C++ simulation @@ -407,13 +422,6 @@ def test_attention_cppsim( assert np.allclose(o_produced, o_expected), "cppsim exec failed" -# This is a fpgadataflow type of test -@pytest.mark.fpgadataflow -# Tests rtl simulation of single scaled dot-product attention head -def test_fpgadataflow_attention_rtlsim(): - pass - - # Size of query and key embedding dimension @pytest.mark.parametrize("QKDim", [4]) # noqa: Duplicated code fragment # Size of value embedding dimension @@ -478,9 +486,9 @@ def test_attention_rtlsim( AType=AType, OType=OType, # Accumulator type configuration - AccQKMatMul=DataType["UINT22"], + AccQKMatMul=DataType["UINT32"], OutQKMatMul=DataType["UINT8"], - AccAVMatMul=DataType["UINT22"], + AccAVMatMul=DataType["UINT32"], OutAVMatMul=OType, # Dequantizer scale, factor to convert the whole UINT8 range to floats # in range 0.0 to 1.0 @@ -495,6 +503,17 @@ def test_attention_rtlsim( context = { "Q": q, "K": k, "V": v, "mask": mask } + + # Mark all nodes to be specialized as HLS backend implementations + for node in model.graph.node: + # Get the CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") + # Turn all HWCustomOp layers into HLS specializations + model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) + # Set model execution mode to RTL simulation model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation From d139c17e8bcbe1821b008ec06ca421308cdb27db Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 4 Apr 2024 18:06:23 +0200 Subject: [PATCH 60/88] [Attention] Fix broadcasting of maximum subtraction in numpy softmax --- src/finn/custom_op/fpgadataflow/attention.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 293b442bc2..c42c6e60d9 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -30,7 +30,7 @@ def softmax(x, axis): # Count the occurrences of the maximum along the normalization axis max_counts = np.sum(max_ones, axis=axis, keepdims=True) # Exponential of the input - exp = np.exp(x - np.max(x, axis=axis)) + exp = np.exp(x - np.max(x, axis=axis)[:, np.newaxis]) # Compute the total along axis total = np.sum(exp, axis=axis, keepdims=True) # Detect overflow of the summation From f77ba1dbcdf65467d2e663e0e3e023e216ed8221 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 8 Apr 2024 13:41:30 +0200 Subject: [PATCH 61/88] [Refactor] Fix Split/MergeMultiHeads and ReplicateStream C++/RTL tests --- .../custom_op/fpgadataflow/attention_heads.py | 2 +- .../test_fpgadataflow_attention_heads.py | 37 +++++++++++++++---- .../test_fpgadataflow_replicate_stream.py | 33 +++++++++++++---- 3 files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index ef8bd652c6..381776202b 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -497,7 +497,7 @@ def _execute_node_rtlsim(self, context, graph): # noqa: graph unused "inputs": {}, "outputs": {"out": []} } - # Enumerate the node outputs + # Enumerate the node inputs for i, name in enumerate(node.input): # Get the input out of the execution context # Note: Shape must be either 1 x seq x dim or seq x dim diff --git a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py index 4155d11543..097d4a63f5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_attention_heads.py +++ b/tests/fpgadataflow/test_fpgadataflow_attention_heads.py @@ -1,5 +1,5 @@ # Testing framework -import pytest # noqa pytest dependecy is listed in setup.cfg +import pytest # Use numpy for python execution / computing the ground truth expected values import numpy as np @@ -10,17 +10,18 @@ from onnx import helper as oh # QONNX/FINN datatypes -from qonnx.core.datatype import DataType # noqa qonnx dependency is specified -# in setup.cfg as well as in fetch-repos.sh +from qonnx.core.datatype import DataType # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa: qonnx +from qonnx.core.modelwrapper import ModelWrapper # Execute onnx model graphs -from qonnx.core.onnx_exec import execute_onnx # noqa: qonnx +from qonnx.core.onnx_exec import execute_onnx +# Registry of all QONNX CustomOps +from qonnx.custom_op.registry import getCustomOp # Utility for wrapping onnx graphs and generating tensor of FINN datatypes -from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # noqa: qonnx +from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # Graph transformation giving unique names to each node in a QONNX model graph -from qonnx.transformation.general import GiveUniqueNodeNames # noqa: qonnx +from qonnx.transformation.general import GiveUniqueNodeNames # FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode @@ -29,6 +30,20 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from 
finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + + +# Specializes all nodes to be implemented as HLS backend +def specialize_hls(model: ModelWrapper): + # Mark all nodes to be specialized as HLS backend implementations + for node in model.graph.node: # noqa: Duplicate test setup code + # Get the CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") + # Turn all HWCustomOp layers into HLS specializations + return model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) # Creates a model executing mult-head splitting @@ -184,6 +199,8 @@ def test_attention_heads_split_cppsim(seq, dim, heads, dtype): # Prepare the execution context context = {"inp": gen_finn_dt_tensor(DataType[dtype], (seq, dim))} + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to Python simulation model = model.transform(SetExecMode("cppsim")) # Generates the C++ source and compiles the C++ simulation @@ -224,6 +241,8 @@ def test_attention_heads_split_rtlsim(seq, dim, heads, dtype): # Prepare the execution context context = {"inp": gen_finn_dt_tensor(DataType[dtype], (seq, dim))} + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to Python simulation model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation @@ -320,6 +339,8 @@ def make_inp_tensor(): f"inp{i}": make_inp_tensor() for i in range(heads) } + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to C++ simulation model = model.transform(SetExecMode("cppsim")) # Generates the C++ source and compiles the C++ simulation @@ -370,6 +391,8 @@ def make_inp_tensor(): f"inp{i}": make_inp_tensor() for i in range(heads) } + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to RTL simulation model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation diff --git a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py index 89d439fd3b..f896ccae39 100644 --- a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py +++ b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py @@ -1,5 +1,5 @@ # Testing framework -import pytest # noqa pytest dependecy is listed in setup.cfg +import pytest # Protobuf onnx graph node type from onnx import TensorProto @@ -7,17 +7,18 @@ from onnx import helper as oh # QONNX/FINN datatypes -from qonnx.core.datatype import DataType # noqa qonnx dependency is specified -# in setup.cfg as well as in fetch-repos.sh +from qonnx.core.datatype import DataType # QONNX wrapper to ONNX model graphs -from qonnx.core.modelwrapper import ModelWrapper # noqa: qonnx +from qonnx.core.modelwrapper import ModelWrapper # Execute onnx model graphs -from qonnx.core.onnx_exec import execute_onnx # noqa: qonnx +from qonnx.core.onnx_exec import execute_onnx +# Registry of all QONNX CustomOps +from qonnx.custom_op.registry import getCustomOp # Utility for wrapping onnx graphs and generating tensor of FINN datatypes -from qonnx.util.basic import 
qonnx_make_model, gen_finn_dt_tensor # noqa +from qonnx.util.basic import qonnx_make_model, gen_finn_dt_tensor # Graph transformation giving unique names to each node in a QONNX model graph -from qonnx.transformation.general import GiveUniqueNodeNames # noqa: qonnx +from qonnx.transformation.general import GiveUniqueNodeNames # FINN graph transformations for preparing simulation (cppsim or rtlsim) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode @@ -26,6 +27,20 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + + +# Specializes all nodes to be implemented as HLS backend +def specialize_hls(model: ModelWrapper): + # Mark all nodes to be specialized as HLS backend implementations + for node in model.graph.node: # noqa: Duplicate test setup code + # Get the CustomOp instance of the node to get access to the node + # attributes + inst = getCustomOp(node) + # Note: only HLS-based layers execute C++ Simulation + inst.set_nodeattr("preferred_impl_style", "hls") + # Turn all HWCustomOp layers into HLS specializations + return model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) # Creates a model executing stream replication @@ -138,6 +153,8 @@ def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): "inp": gen_finn_dt_tensor(DataType[dtype], (*num_inputs, num_elems)) } + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to C++ simulation model = model.transform(SetExecMode("cppsim")) # Generates the C++ source and compiles the C++ simulation @@ -181,6 +198,8 @@ def test_replicate_stream_rtlsim(num_inputs, num_elems, num, dtype): "inp": gen_finn_dt_tensor(DataType[dtype], (*num_inputs, num_elems)) } + # Specializes all nodes to be implemented as HLS backend + model = specialize_hls(model) # Set model execution mode to RTL simulation model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation From 22891f51136aca00d561277b7b51cc29b76f9b69 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 8 Apr 2024 15:09:28 +0200 Subject: [PATCH 62/88] [ReplicateStream] Introduce configurable PE parallelism for streams --- .../fpgadataflow/hls/replicate_stream_hls.py | 6 +++- .../fpgadataflow/replicate_stream.py | 23 ++++++++---- .../test_fpgadataflow_replicate_stream.py | 36 +++++++++++-------- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py b/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py index 8a86e6aebc..84631f5dde 100644 --- a/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/replicate_stream_hls.py @@ -131,11 +131,15 @@ def docompute(self): def out(i): return f"out{i}_{self.hls_sname()}" + # Number of iterations required to process the whole folded input stream + # Note: This is all but the PE (last) dimension + num_iter = np.prod(self.get_folded_output_shape()[:-1]) + # Write the body of the stream replicating top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ # Repeat for the number of inputs # Note: Repeat for all num_inputs dimensions - f"for(std::size_t i = 0; i < {np.prod(self.num_inputs)}; ++i) {{", + f"for(std::size_t i = 0; i < {num_iter}; ++i) {{", # Pipeline 
the steps of this loop "#pragma HLS pipeline II=1 style=flp", # Read the next input element from the stream diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index 37709ca24b..a2ca666e2a 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -46,8 +46,10 @@ def get_nodeattr_types(self): "num": ("i", True, 1), # Data type of input and output elements "dtype": ("s", True, ""), - # Number of input elements received in parallel + # Number of input elements in the last dimension "num_elems": ("i", True, 1), + # Number of elements in the last dimensions processed in parallel + "PE": ("i", True, 1), # Number of inputs to be processed sequentially "num_inputs": ("ints", True, [1]), # Possible execution modes for simulating this node @@ -79,6 +81,11 @@ def dtype(self): def num_elems(self): return self.get_nodeattr("num_elems") + # Number of parallel processed elements as property for convenience + @property + def pe(self): + return self.get_nodeattr("PE") + # Number of inputs attribute as property for convenience @property def num_inputs(self): @@ -232,19 +239,23 @@ def get_normal_input_shape(self, ind=0): # Gets the shape of the output at index ind without folding def get_normal_output_shape(self, ind=0): - # All output have the same shape, which is the same as the input + # All outputs have the same shape, which is the same as the input # Unpack multi-axis inputs list to yield a flat tuple as shape return *self.num_inputs, self.num_elems # Gets the shape of the input at index ind with folding def get_folded_input_shape(self, ind=0): - # No folding for now, normal and folded shape are the same - return self.get_normal_input_shape(ind=ind) + # Valid folding requires the PE to divides the number of elements + assert self.num_elems % self.pe == 0, "PE must divide num_elems" + # Folding along the last dimension + return *self.num_inputs, self.num_elems // self.pe, self.pe # Gets the shape of the output at index ind with folding def get_folded_output_shape(self, ind=0): - # No folding for now, normal and folded shape are the same - return self.get_normal_output_shape(ind=ind) + # Valid folding requires the PE to divides the number of elements + assert self.num_elems % self.pe == 0, "PE must divide num_elems" + # Folding along the last dimension + return *self.num_inputs, self.num_elems // self.pe, self.pe # Widths of the input data stream of the input at index ind def get_instream_width(self, ind=0): diff --git a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py index f896ccae39..643a46461c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py +++ b/tests/fpgadataflow/test_fpgadataflow_replicate_stream.py @@ -44,7 +44,7 @@ def specialize_hls(model: ModelWrapper): # Creates a model executing stream replication -def mock_replicate_streams(num_inputs, num_elems, num, dtype): +def mock_replicate_streams(num_inputs, num_elems, pe, num, dtype): # Create a node representing the stream replication operation node = oh.make_node( # Operator type from the name of the fpgadataflow hlscustomop @@ -62,8 +62,10 @@ def mock_replicate_streams(num_inputs, num_elems, num, dtype): num=num, # Datatype of inputs and outputs dtype=dtype, - # Number of input elements received in parallel + # Number of input elements in the last dimension num_elems=num_elems, + # Number of elements to process in parallel + 
PE=pe, # Number of inputs to be processed sequentially num_inputs=num_inputs ) @@ -92,10 +94,12 @@ def mock_replicate_streams(num_inputs, num_elems, num, dtype): # Number of inputs to be processed sequentially @pytest.mark.parametrize( # noqa Duplicate - "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] + "num_inputs", [[8], [1, 8], [2, 8], [2, 2, 8]] ) -# Number of input elements received in parallel +# Number of input elements in the last dimension @pytest.mark.parametrize("num_elems", [32]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4, 8]) # Number of replicas to produce @pytest.mark.parametrize("num", [1, 2, 4, 8]) # Datatypes to simulate @@ -104,9 +108,9 @@ def mock_replicate_streams(num_inputs, num_elems, num, dtype): @pytest.mark.fpgadataflow # Tests replicating of tensors/streams to multiple outputs using python mode # execution -def test_replicate_stream_python(num_inputs, num_elems, num, dtype): +def test_replicate_stream_python(num_inputs, num_elems, pe, num, dtype): # Make dummy model for testing - model = mock_replicate_streams(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, pe, num, dtype) # Prepare the execution context context = { @@ -130,10 +134,12 @@ def test_replicate_stream_python(num_inputs, num_elems, num, dtype): # Number of inputs to be processed sequentially @pytest.mark.parametrize( # noqa Duplicate - "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] + "num_inputs", [[8], [1, 8], [2, 8], [2, 2, 8]] ) -# Number of input elements received in parallel +# Number of input elements in the last dimension @pytest.mark.parametrize("num_elems", [32]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4, 8]) # Number of replicas to produce @pytest.mark.parametrize("num", [1, 2, 4, 8]) # Datatypes to simulate @@ -144,9 +150,9 @@ def test_replicate_stream_python(num_inputs, num_elems, num, dtype): @pytest.mark.vivado # Tests replicating of tensors/streams to multiple outputs using C++ mode # execution -def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): +def test_replicate_stream_cppsim(num_inputs, num_elems, pe, num, dtype): # Make dummy model for testing - model = mock_replicate_streams(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, pe, num, dtype) # Prepare the execution context context = { @@ -175,10 +181,12 @@ def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): # Number of inputs to be processed sequentially @pytest.mark.parametrize( # noqa Duplicate - "num_inputs", [[64], [1, 64], [2, 64], [2, 2, 64]] + "num_inputs", [[8], [1, 8], [2, 8], [2, 2, 8]] ) -# Number of input elements received in parallel +# Number of input elements in the last dimension @pytest.mark.parametrize("num_elems", [32]) +# Number of elements to process in parallel +@pytest.mark.parametrize("pe", [1, 2, 4, 8]) # Number of replicas to produce @pytest.mark.parametrize("num", [1, 2, 4, 8]) # Datatypes to simulate @@ -189,9 +197,9 @@ def test_replicate_stream_cppsim(num_inputs, num_elems, num, dtype): @pytest.mark.vivado # Tests replicating of tensors/streams to multiple outputs using RTL mode # execution -def test_replicate_stream_rtlsim(num_inputs, num_elems, num, dtype): +def test_replicate_stream_rtlsim(num_inputs, num_elems, pe, num, dtype): # Make dummy model for testing - model = mock_replicate_streams(num_inputs, num_elems, num, dtype) + model = mock_replicate_streams(num_inputs, num_elems, pe, num, 
dtype) # Prepare the execution context context = { From 117a1b39af6cc5bca5ed52f17b5d4043340c55b4 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 14:15:35 +0200 Subject: [PATCH 63/88] [Streamline] Add CollapseRepeatedTranspose transformation This transformation merges repeated transpose operations into a single transpose operation having the same effect. --- .../streamline/collapse_repeated.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index d297110186..796087ffc0 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -26,11 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Helper for creating ONNX nodes from onnx import helper as oh + +# QONNX arbitrary precision data types from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class from qonnx.transformation.base import Transformation + +# QONNX graph transformations for inferring datatypes and shapes from qonnx.transformation.infer_shapes import InferShapes +# Gets items from protobuf by name +from qonnx.util.basic import get_by_name + class CollapseRepeatedOp(Transformation): """Collapse repeated consecutive operations with constant parameters into @@ -106,3 +119,88 @@ class CollapseRepeatedMul(CollapseRepeatedOp): def __init__(self): super().__init__("Mul", lambda x, y: y * x) + + +# Collapses repeated transpose operations into a single transpose operation +# having the same effect +class CollapseRepeatedTranspose(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Transpose operation types + if node.op_type == "Transpose": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node)[0] + # If Transpose is the final operation in the graph, there might + # be no successor + if successor is None or successor.op_type != "Transpose": + # Softly skip this node + continue + + # Get the (optional) permutation indices of the first transpose + # in case it is a multi-axis transpose + perm1 = get_by_name(node.attribute, "perm") + # Convert permutation indices to list of integers + perm1 = perm1.ints if perm1 is not None else None + + # Get the (optional) permutation indices of the second transpose + # in case it is a multi-axis transpose + perm2 = get_by_name(successor.attribute, "perm") + # Convert permutation indices to list of integers + perm2 = perm2.ints if perm2 is not None else None + + # Get the shape of the input tensor + shape = model.get_tensor_shape( + # fmt: off + node.input[0], fix_missing_init_shape=True + # fmt: on + ) + # List of dimension indices in order + dims = range(len(shape)) + + # Substitute the permutation indices by the 
reversed index list + # if they are not given: This is default behavior, see the docs: + # https://onnx.ai/onnx/operators/onnx__Transpose.html + perm1 = list(reversed(dims)) if perm1 is None else perm1 + perm2 = list(reversed(dims)) if perm2 is None else perm2 + + # Combined permutation permutes the first permutation of the + # dimensions according to the second permutation + perm = [perm1[i] for i in perm2] + + # Create a new Transpose operator replacing the other two + transpose = oh.make_node( + # Name of the operator type + "Transpose", + # Connect to the inputs to the first transpose + inputs=node.input, + # Connect to the outputs of the second transpose + outputs=successor.output, + # Insert the new permutation indices + perm=perm, + ) + # Insert the collapsed transpose operator + graph.node.insert(index + 2, transpose) + # Remove the two original transpose operators + graph.node.remove(node) + graph.node.remove(successor) + # Track whether the graph has been modified, never resets to + # False + graph_modified = True + # Break the loop after adding and removing nodes to start over + # with a clean index + break + # Need to redo the shape inference after potentially removing nodes + model = model.transform(InferShapes()) # noqa: Shadows model + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified From 649087ae9ae4cf69e7ac72acc433f7680785193c Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 14:17:36 +0200 Subject: [PATCH 64/88] [Streamline] Add MoveTransposePastEltwise transformation This is currently configured to handle constant elementwise Add and Mul operations, but could be extended to other constant elementwise operations later. --- src/finn/transformation/streamline/reorder.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 8ac2d7dad6..293f633df5 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -1244,3 +1244,86 @@ def apply(self, model): class MoveTransposePastJoinAdd(MoveIdenticalOpPastJoinOp): def __init__(self): super().__init__(["Transpose"], ["Add"]) + + +# Moves a transpose operator past elementwise addition or multiplication +class MoveTransposePastEltwise(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Transpose operation types + if node.op_type == "Transpose": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node)[0] + # If Transpose is the final operation in the graph, there might + # be no successor + if successor is None: + # Softly skip this node + continue + # Applies to elementwise add and mul operations + if successor.op_type in {"Add", "Mul"}: + # Get names of all tensors involved in connecting the nodes + inp = node.input[0] + mid = node.output[0] + out = successor.output[0] + + # y = x^T + a <=> y = (x + a^T)^T + + # Assume left-to-right 
order of input to the Add operator + xt, a = successor.input + # Check whether the assumption holds true + if xt != mid: + # Leaves only the option of a and xt commuting + xt, a = a, xt + # If this assumption still does not hold true, something is + # wrong with the graph + assert xt == mid, f"Messed up graph pattern at {node.name}" + + # Get the (optional) permutation indices of the transpose in + # case it is a multi-axis transpose + perm = get_by_name(node.attribute, "perm") + # Convert permutation indices to list of integers + perm = perm.ints if perm is not None else None + + # This transformation does only apply to Add nodes where the + # second input is a constant initializer + if (value := model.get_initializer(a)) is not None: + # Transpose the initializer and re-insert into the model + model.set_initializer(a, value.transpose(perm)) + # Rewire the graph to feed original input and the + # transposed initializer into the Add node first + successor.input[:] = [inp, a] + # Repurpose the middle tensor for the output of the + # addition + successor.output[0] = mid + # The Transpose operator now gets the middle tensor as + # its input + node.input[0] = mid + # Transpose now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(inp, None) + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Need to redo the shape inference after potentially removing nodes + model = model.transform(InferShapes()) # noqa: Shadows model + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified From 2b29bfc6a55cb6d2a1b07ade2904d99cfe25dc66 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 14:22:28 +0200 Subject: [PATCH 65/88] [Streamline] Add RemoveIdentityReshape and RemoveIdentityTranspose --- src/finn/transformation/streamline/remove.py | 101 +++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 src/finn/transformation/streamline/remove.py diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py new file mode 100644 index 0000000000..005425bacd --- /dev/null +++ b/src/finn/transformation/streamline/remove.py @@ -0,0 +1,101 @@ +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class +from qonnx.transformation.base import Transformation + +# QONNX graph transformations for inferring datatypes and shapes +from qonnx.transformation.infer_shapes import InferShapes + +# Reuse node removal and rewiring from qonnx +from qonnx.transformation.remove import remove_node_and_rewire + +# Gets items from protobuf by name +from qonnx.util.basic import get_by_name + + +# Removes identity reshape operations, i.e., Reshape where input shape is the +# same as the target shape +class RemoveIdentityReshape(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for 
index, node in enumerate(graph.node): + # Applies to Reshape operation types + if node.op_type == "Reshape": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # Second input to the reshape operation is the target shape + shape = model.get_initializer(node.input[1]) + # If the initializer is present, this is a constant shape + # reshape which can be removed if it does not reshape + if shape is not None: + # Get the shape of the input to the reshape + inp = model.get_tensor_shape(node.input[0]) + # If input and target shape are the same, this is an + # identity operation + if len(shape) == len(inp) and (shape == inp).all(): # noqa + # Remove and rewire this node + remove_node_and_rewire(model, node) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Need to redo the shape inference after potentially removing nodes + model = model.transform(InferShapes()) # noqa: Shadows from outer scope + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified + + +# Removes identity transpose operations, i.e., Transpose where input order is +# the same as the target permutation +class RemoveIdentityTranspose(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Transpose operation types + if node.op_type == "Transpose": + # Currently does not handle fork- or join-nodes + if model.is_join_node(node): + # Softly skip this node + continue + # Get the (optional) permutation indices of the transpose in + # case it is a multi-axis transpose + perm = get_by_name(node.attribute, "perm") + # If the permutation indices are given, we can check whether + # they are in order making this an identity transpose + # Note: Without perm attribute, this is implicitly reversing the + # axes, i.e., not an identity transpose + if perm is not None: + # Convert permutation indices to list of integers + perm = perm.ints + # Get the shape of the input tensor + shape = model.get_tensor_shape( + # fmt: off + node.input[0], fix_missing_init_shape=True + # fmt: on + ) + # If the permutation indices cover the input shape in order, + # this transpose does nothing + if perm == [i for i in range(len(shape))]: + # Remove and rewire this node + remove_node_and_rewire(model, node) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Need to redo the shape inference after potentially removing nodes + model = model.transform(InferShapes()) # noqa: Shadows model + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified From 28f84a5ef5b57a2a0cc9a0c050295a6f3e34d990 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 14:34:01 +0200 Subject: [PATCH 66/88] [Attention] Add Squeeze "cleanup" transformation This is required to work around the rather rudimentary support for data layouts other than 2d or 4d layouts - transformer models after export typically have 3d layouts. 
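For illustration only (not part of the committed transformation code), a
minimal numpy sketch of the intended layout change, using assumed example
sizes seq_len=128 and embedding_dim=384:

    # Hypothetical example: shows the layout change the Squeeze transformation
    # targets, i.e., dropping size-1 dimensions from exported 3d layouts
    import numpy as np

    # Exported transformer tensors typically carry a leading batch dim of 1
    x = np.zeros((1, 128, 384))  # (batch=1, seq_len, embedding_dim)

    # Squeezing removes all dimensions of size 1, leaving a 2d layout where
    # the sequence dimension takes the role of the batch dimension
    assert x.squeeze().shape == (128, 384)

    # A permutation over the 3d layout, e.g. perm=(0, 2, 1), corresponds to
    # the remapped 2d permutation (1, 0) once the size-1 axis is removed,
    # which mirrors how the transformation rewrites Transpose "perm" attributes
    assert np.array_equal(
        x.transpose(0, 2, 1).squeeze(), x.squeeze().transpose(1, 0)
    )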
--- src/finn/transformation/squeeze.py | 185 +++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 src/finn/transformation/squeeze.py diff --git a/src/finn/transformation/squeeze.py b/src/finn/transformation/squeeze.py new file mode 100644 index 0000000000..a34d48c06c --- /dev/null +++ b/src/finn/transformation/squeeze.py @@ -0,0 +1,185 @@ +# QONNX wrapper of ONNX model graphs +# For array handling +import numpy as np + +# Helper for creating ONNX nodes +from onnx import helper as oh + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class +from qonnx.transformation.base import Transformation + +# Gets items from protobuf by name +from qonnx.util.basic import get_by_name, remove_by_name + + +# Squeezes, i.e., removes, dimensions of size 1 +# Note: Use this transformation with great care, it currently serves only the +# purpose of turning the not well-supported 3d data layouts encountered in +# transformer models with batch dimension of size 1 into 2d data layouts where +# the sequence dimension is treated as a batch dimension. Everything else is +# not tested, it might break the model or simply lack support for certain node +# op-types. +class Squeeze(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # There should not be any squeeze or unsqueeze operations in the + # graph as these would interfere with this transformation + assert node.op_type not in { + "Squeeze", + "Unsqueeze", + }, f"Squeezing graph containing {node.op_type}" + + # Validate slice not slicing along squeezed dimension + if node.op_type == "Slice": + # Axes to slice along is supplied as the 4th input to the node + axes = model.get_initializer(node.input[3]) + # If this is an initializer, there are constant axes to slice + if axes is not None: + # Get the shape of the input, assuming the input from + # upstream to be the 1st input + shape = model.get_tensor_shape(node.input[0]) + # Slice might operate on multiple axes + for axis in axes: + # Axis must not refer to a dimension of size 1 + # fmt: off + assert shape[axis] > 1, \ + f"Slice along dimension to be squeezed: {node.name}" + # fmt: on + + # Need to adapt reshape operations to drop dimensions of size 1 + if node.op_type == "Reshape": + # Second input to the reshape operation is the target shape + shape = model.get_initializer(node.input[1]) + # If the initializer is present, this is a constant shape + # reshape which can be replaced by the squeezed shape + if shape is not None: + # Squeeze the shape by removing all dimensions with size 1 + # fmt: off + new_shape = np.asarray([ + size for size in shape if size != 1 + ]) + # fmt: on + # Reassign the squeezed tensor + model.set_initializer(node.input[1], new_shape) + # Track whether the shape actually changed + if len(new_shape) != len(shape): + # Is never reset back to False during iteration + graph_modified = True + + # Need to drop dimensions of size 1 from transpose permutation list + if node.op_type == "Transpose": + # Get the (optional) permutation indices of the transpose in + # case it is a multi-axis transpose + perm = get_by_name(node.attribute, "perm") + # If the permutation indices 
are given, we need to remove all + # dimension of size 1 from these + if perm is not None: + # Convert permutation indices to list of integers + perm = perm.ints + # Get the shape of the input tensor to seek for input + # dimensions of size 1 + shape = model.get_tensor_shape( + # fmt: off + node.input[0], fix_missing_init_shape=True + # fmt: on + ) + # Keep track of new axis enumeration, skipping dimensions of + # size 1 + mapping, new_axis = {}, 0 + # Enumerate the sizes per axis + for axis, size in enumerate(shape): + # Insert mapping from old to new axis + mapping[axis] = new_axis + # Only advance the new axis index for dimensions not to + # be squeezed + new_axis += size > 1 + # Filter and remap the axis enumeration of the permutation + new_perm = [ + # fmt: off + mapping[axis] for axis in perm if shape[axis] > 1 + # fmt: on + ] + # Track whether the permutations actually changed + if len(new_perm) != len(perm) or new_perm != perm: + # Is never reset back to False during iteration + graph_modified = True + # Remove the permutation attribute before setting the new + # permutation + remove_by_name(node.attribute, "perm") + # Insert new permutation attribute + node.attribute.append(oh.make_attribute("perm", new_perm)) + + # Need to squeeze the number of inputs to multi-head splitting + if node.op_type == "SplitMultiHeads": + # Get number of input feature maps to the merging operation + num_inputs = get_by_name(node.attribute, "num_inputs") # noqa + # Squeeze all dimensions of size 1 + new_num_inputs = [size for size in num_inputs.ints if size != 1] + # Update the attribute by removing and reinserting + remove_by_name(node.attribute, "num_inputs") + node.attribute.append( + # fmt: off + oh.make_attribute("num_inputs", new_num_inputs) + # fmt: on + ) + # Track whether the number of inputs actually changed + if len(new_num_inputs) != len(num_inputs.ints): + # Is never reset back to False during iteration + graph_modified = True + + # Need to set the squeezed output mode of multi-head merging + if node.op_type == "MergeMultiHeads": + # Remove the squeezed attribute + remove_by_name(node.attribute, "squeezed") + # Set squeezed mode attribute + node.attribute.append(oh.make_attribute("squeezed", True)) + # Get number of input feature maps to the merging operation + num_inputs = get_by_name(node.attribute, "num_inputs") # noqa + # Squeeze all dimensions of size 1 + new_num_inputs = [size for size in num_inputs.ints if size != 1] + # Update the attribute by removing and reinserting + remove_by_name(node.attribute, "num_inputs") + node.attribute.append( + # fmt: off + oh.make_attribute("num_inputs", new_num_inputs) + # fmt: on + ) + # Track whether the number of inputs actually changed + if len(new_num_inputs) != len(num_inputs.ints): + # Is never reset back to False during iteration + graph_modified = True + + # Iterate all tensors in the graph keeping track of the index + for index, name in enumerate(model.get_all_tensor_names()): + # Query the shape of the tensor adding annotations for initializers + # if missing + shape = model.get_tensor_shape(name, fix_missing_init_shape=True) + # Skip squeezing 0d or 1d tensors + if len(shape) <= 1: + continue + # Squeeze the shape by removing all dimensions with size 1 + new_shape = [size for size in shape if size != 1] + # Try to get the initializer of the tensor + initializer = model.get_initializer(name) + # If an initializer is present replace by the squeezed tensor + if initializer is not None: + # Reassign the squeezed tensor + 
model.set_initializer(name, initializer.squeeze()) + # Set new shape annotation + model.set_tensor_shape(name, new_shape) + # Track whether the shape actually changed + if len(new_shape) != len(shape): + # Is never reset back to False during iteration + graph_modified = True + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified From 69dc954b863685d90e064feae2f95f474d753a0f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 14:50:31 +0200 Subject: [PATCH 67/88] [Attention] Add graph transformations for handling attention operators These are the new Infer* transformations for detecting the scaled dot-product attention and multi-head attention patterns to convert these to the hardware custom operations, as well as the InferReplicateStream transformation, corresponding to the new ReplicateStream operation. This also adds a few reordering transformations for optimizing the attention pattern and eventually unrolling the attention heads within the ONNX graph to achieve a fully-parallel hardware implementation of multi-head attention, which might look like streamline transformations, but must be applied when/after converting the attention pattern to hardware operators. --- .../transformation/fpgadataflow/attention.py | 658 +++++++++++++++ .../fpgadataflow/attention_heads.py | 782 ++++++++++++++++++ .../fpgadataflow/replicate_stream.py | 110 +++ src/finn/transformation/util.py | 124 +++ 4 files changed, 1674 insertions(+) create mode 100644 src/finn/transformation/fpgadataflow/attention.py create mode 100644 src/finn/transformation/fpgadataflow/attention_heads.py create mode 100644 src/finn/transformation/fpgadataflow/replicate_stream.py create mode 100644 src/finn/transformation/util.py diff --git a/src/finn/transformation/fpgadataflow/attention.py b/src/finn/transformation/fpgadataflow/attention.py new file mode 100644 index 0000000000..0b68d05884 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/attention.py @@ -0,0 +1,658 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. 
+ +# Standard math functions +import math + +# Need numpy for modifying the onnx graph tensors, which are numpy style arrays +import numpy as np + +# Output warning messages +import warnings + +# Utility for handling ONNX nodes and tensors +from onnx import NodeProto +from onnx import helper as oh + +# QONNX datatypes +from qonnx.core.datatype import BaseDataType, DataType + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Convert ONNX nodes to QONNX custom ops +from qonnx.custom_op.registry import getCustomOp + +# QONNX graph transformation base class +from qonnx.transformation.base import Transformation + +# Transformations running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Transformation running onnx shape inference +from qonnx.transformation.infer_shapes import InferShapes + +# Gets items from protobuf by name +from qonnx.util.basic import get_by_name, remove_by_name + +# Utility function for transforming ONNX graphs +from finn.transformation.util import ( + all_upstream_to_matmul, + is_add, + is_join_matmul, + is_matmul, + is_mul, + is_softmax, + op_types, +) + + +# Convert the operator pattern corresponding to scaled dot-product attention to +# the hardware custom operator node +class InferScaledDotProductAttention(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # This transformation is triggered by finding a join-node MatMul + if is_join_matmul(node, model): + # If there are more than two branches feeding the MatMul, this + # is probably not attention, softly skip the node + if len(node.input) != 2: + continue + # Follow both branches upstream looking for the next MatMul + lhs, rhs = all_upstream_to_matmul(node, model) + # Exactly one of the branches is supposed to contain a Softmax + # operation + if ("Softmax" in op_types(lhs)) == ("Softmax" in op_types(rhs)): + # TODO: Near match. But what is this? just skip? + continue + # By convention and following the equation, the left hand side + # of attention is the attention matrix, i.e., the one containing + # Softmax and terminating in a join-node MatMul + if "Softmax" not in op_types(lhs): + # Softmax must currently be on the right hand side, swap the + # order + lhs, rhs = rhs, lhs + # The left hand side, i.e, attention matrix must terminate in a + # join-node MatMul involving the query and key input + if not is_join_matmul(lhs[-1], model): + # TODO: Near match. But what is this? just skip? + continue + # Get shapes of input tensors, expect the second inputs, i.e., + # the keys to be transposed + qh, ql, qe = model.get_tensor_shape(lhs[-1].input[0]) + kh, ke, kl = model.get_tensor_shape(lhs[-1].input[1]) + # The input shapes of the two matmul inputs must be compatible, + # i.e., they must have matching embedding dimension + if (qh, True, qe) != (kh, True, ke): + # Issue a warning of near match of the supported attention + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Mismatch in head or embedding dim at {lhs[-1].name}: " + f" {(qh, ql, qe)} vs. 
{(kh, kl, ke)}" + ) + # @formatter:on + # Skip transforming this instance + continue + # There must be a Transpose feeding the key input + transpose = model.find_producer(lhs[-1].input[1]) + # The transform applies only to transpose with exactly one input + if transpose is None or len(transpose.input) != 1: + # Issue a warning of near match of the supported attention + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Missing Transpose near {lhs[-1].name}: " + f" {op_types([transpose])[0]}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Skip this node if the transpose output forks into multiple + # branches + if model.is_fork_node(transpose): + # Issue a warning of near match of the supported attention + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Fork Transpose near {node.name}: {transpose.name}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # The input shape of the transpose must match the transpose + # of the key matrix + # @formatter:off + assert model.get_tensor_shape(transpose.input[0]) == [ + kh, kl, ke + ] + # @formatter:on + # Collect the input tensors to the attention operation, i.e., + # the query, key and value tensors + q, k, v = lhs[-1].input[0], transpose.input[0], rhs[0].output[0] + # Validate that the values are actually consumed by the final + # matmul. For queries and keys this should all be given, as we + # just walked upwards the graph. + assert node in model.find_consumers(v) + + # Get the (optional) Softmax activation function + act_a_softmax = lhs[0] if is_softmax(lhs[1]) else None + # Get the (optional) query-key matmul activation function + act_qk_matmul = lhs[-2] if is_matmul(lhs[-1]) else None + + # There might be no activation function between qk matmul and + # softmax normalization + if is_mul(act_qk_matmul) or is_softmax(act_qk_matmul): + # Remove the detected activation function node from the + # pattern candidates + act_qk_matmul = None + + # Check whether the node is a supported type of activation + def is_supported_activation(n: NodeProto): # noqa: Shadows name + # Currently, only none-type and MultiThreshold activations + # are supported + return n is None or n.op_type in {"MultiThreshold"} + + # Get the (optional) output matmul activation function + act_av_matmul = model.find_direct_successors(node) + # If the final matmul is a fork node, this needs to be handled + # separately + if act_av_matmul is not None and len(act_av_matmul) > 1: + # Assume no activation in this case + act_av_matmul = [None] + # Unwrap the output activation from the list + act_av_matmul, = act_av_matmul + # The final activation can be omitted if it is not supported as + # it might just be part of the next operator pattern + if not is_supported_activation(act_av_matmul): + # Remove by setting to None (will be ignored by the next + # steps) + act_av_matmul = None + # List all activations for validation and further processing + # Note: Order matters! 
+ acts = [act_qk_matmul, act_a_softmax, act_av_matmul] + # Skip this node if any activation is not supported + if not all(is_supported_activation(act) for act in acts): + # Issue a warning of near match of the supported attention + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported activation near {node.name}: " + f" One of {', '.join(op_types(acts))}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Check whether there is a de-quantizer scale factor preceding + # the Softmax operator + dequant_softmax = lhs[2] if is_softmax(lhs[1]) else None + + # If there is no dequant softmax yet, check alternative pattern + if dequant_softmax is None: + # Alternatively, there might not be a quantizer following + # the softmax + dequant_softmax = lhs[1] if is_softmax(lhs[0]) else None + + # Assume no attention mask by default + mask, mask_mode, mask_dtype = [], 'none', DataType["BINARY"] + # If there is an elementwise add operation where we have + # expected the dequantizer, this might be an attention mask + if is_add(dequant_softmax): + # Remember the candidate of the masking operation + maybe_mask = dequant_softmax + # If there is a mask candidate, the dequantizer, must be + # right before + dequant_softmax = model.find_direct_predecessors( + dequant_softmax + ) + # The attention mask may not have multiple producers + if len(dequant_softmax) != 1: + # Issue a warning of near match of the supported + # attention pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported de-quantizer near {maybe_mask.name}: " + f" {op_types(dequant_softmax)}" + ) + # @formatter:on + # Skip transforming this instance + continue + # There is a single producer, which is probably the + # dequantizer + dequant_softmax, = dequant_softmax + + # The mask can be an initializer or provided as an input. If + # it is given as an initializer, it can either be a causal + # mask or some arbitrary pattern. 
+ + # Check whether a tensor is a valid mask tensor + def valid_mask(tensor): + # Valid masks contain only two types of values, i.e., + # zero for not masked and -inf for masked slots + return all( + x in {0.0, -np.inf} for x in np.unique(tensor) + ) + + # Check whether a tensor describes a causal attention mask + def is_causal(tensor): + # Generate a causal mask of the same size + causal = np.triu(-np.inf * np.ones_like(tensor), 1) + # Compare candidate against the causal mask + return (tensor == causal).all() # noqa: 'all' + + # Try to get the initializer of the masking operation + mask_tensor = model.get_initializer(maybe_mask.input[1]) + # Check whether this is constant mask known at export time + if mask_tensor is not None: + # We have a constant mask and need to validated that it + # only contains valid values + if not valid_mask(mask_tensor): + # Issue a warning of near match of the supported + # attention pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near" + f" match: Invalid values in mask near" + f" {maybe_mask.name}: {np.unique(mask_tensor)}" + ) + # @formatter:on + # Skip transforming this instance + continue + # If this is a causal mask, just set the flag and drop + # the input as the behavior can be generated on the fly + if is_causal(mask_tensor): + # Set the mode flag + mask_mode = "causal" + # This is a constant but non-causal mask which needs to + # be kept as an input to the operator + else: + # Keep the input and set the mode flag + mask, mask_mode = [maybe_mask.input[1]], "const" + # Convert the mask to a binary mask getting rid of + # explicitly storing the infinities + mask_tensor = (mask_tensor == -np.inf) + # Set the initializer to the binary mask still using + # float as the container type + model.set_initializer( + *mask, mask_tensor.astype(np.float32) + ) + # Set the quantization type annotation to binary + model.set_tensor_datatype(*mask, DataType["BINARY"]) + # Dynamic input mask, cannot be validated beforehand + else: + # # Keep the input and set the corresponding mode flag + # mask, mask_mode = [maybe_mask.input[1]], "input" + # # Keep track of the datatype of the mask + # mask_dtype = model.get_tensor_datatype(*mask) + + # Handling dynamic masks is more difficult and there is + # no solution for now. 
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Unsupported dynamic mask near {maybe_mask.name}: "
+ f" {mask}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+
+ # Currently, only elementwise Mul is supported as de-quantizer
+ if not is_mul(dequant_softmax):
+ # Issue a warning of near match of the supported attention
+ # pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Unsupported de-quantizer near {lhs[1].name}: "
+ f" {dequant_softmax.op_type}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+
+ # If there is a dequant scale factor, try to lift it from
+ # initializer to node attribute
+ if dequant_softmax is not None:
+ # Get the initializer tensor
+ scale = model.get_initializer(dequant_softmax.input[1])
+ # This must be an initializer, the attention operator
+ # currently does not handle any dynamically produced scale
+ # factors
+ if scale is None:
+ # Issue a warning of near match of the supported
+ # attention pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Non-constant dequantizer near {node.name}: "
+ f" {dequant_softmax.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+ # Currently, only scalar dequantizer scale factors are
+ # supported
+ if not all(x == 1 for x in scale.shape):
+ # Issue a warning of near match of the supported
+ # attention pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Non-scalar dequantizer near {node.name}: "
+ f" {dequant_softmax.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+ # Extract the single float value of the tensor
+ dequant_softmax = float(scale.item())
+ # Insert default scale if there is no dequantizer present
+ else:
+ # Default is identity scale
+ dequant_softmax = 1.0
+
+ # The last node of the attention operator is either the detected
+ # matmul or the following, optional activation function
+ last = act_av_matmul if act_av_matmul is not None else node
+
+ # Tensor names of the threshold inputs
+ # Note: order matters
+ thresholds = [
+ # TODO: Fix condition once more activation types are
+ # supported, currently there are only none and thresholds
+ act.input[1] for act in acts if act is not None
+ ]
+
+ # Convert activation function types to string representation
+ def act_op_type_str(act):
+ # Only MultiThreshold is supported currently
+ if act is not None and act.op_type == "MultiThreshold":
+ # The attention custom op uses "thresholds" to identify
+ return "thresholds"
+ # All other types are not supported
+ return "none"
+
+ # The value tensor shape must be compatible with the attention
+ # matrix
+ assert model.get_tensor_shape(v)[:2] == [qh, kl]
+
+ # Output type of the first matmul
+ out_qk_matmul = lhs[-1].output[0]
+ # Extend the output type to include the optional thresholding
+ # activation
+ if act_qk_matmul is not None:
+ # Single output tensor of the activation function
+ out_qk_matmul = act_qk_matmul.output[0]
+
+ # Extract output bias of the thresholding activation functions
+ def out_bias(act):
+ # Only applies to thresholding activations
+ if act is not None and act.op_type == "MultiThreshold":
+ # Extract via interpreting the node as QONNX custom op
+ return getCustomOp(act).get_nodeattr("out_bias")
+ # Default bias if no bias
+ return 0.0
+
+ # Fixed node attributes and extracted input/output/initializer
+ # tensor names
+ kwargs = {
+ # Refer to this operator type by its name
+ "op_type": "ScaledDotProductAttention",
+ # Execution will try to look up the implementation in the
+ # package referred to by the domain
+ "domain": "finn.custom_op.fpgadataflow",
+ # Execution backend: Required attribute inherited from
+ # HLSCustomOp
+ "backend": "fpgadataflow",
+ # Named inputs and activation thresholds extracted from the
+ # graph pattern
+ "inputs": [q, k, v, *mask, *thresholds],
+ # Named model output extracted from the graph pattern
+ "outputs": last.output,
+ # Set the attribute specifying how to handle the optional
+ # attention mask
+ "mask_mode": mask_mode,
+ # Give node name derived from the operator type and the name
+ # of the triggering node to be removed
+ "name": f"ScaledDotProductAttention_{node.name}"
+ }
+
+ # Extract the node attributes of the attention operator from
+ # all constituent nodes
+ node_attrs = {
+ # Number of attention heads
+ "Heads": qh,
+ # Embedding dimension of queries and keys
+ "QKDim": qe,
+ # Length of the query sequence
+ "QLen": ql,
+ # Embedding dimension of the values
+ "VDim": model.get_tensor_shape(v)[2],
+ # Length of the key and value sequence
+ "KVLen": kl,
+
+ # Folding along the embedding dimensions
+ # Note: Assume biggest folding possible fitting both
+ # embedding dimensions
+ "EmbFold": math.gcd(qe, model.get_tensor_shape(v)[2]),
+ # Folding along the sequence dimensions
+ # Note: Assume biggest folding possible fitting both
+ # sequence dimensions
+ "SeqFold": math.gcd(ql, kl),
+
+ # Datatype of query matrix elements
+ "QType": model.get_tensor_datatype(q),
+ # Datatype of key matrix elements
+ "KType": model.get_tensor_datatype(k),
+ # Datatype of value matrix elements
+ "VType": model.get_tensor_datatype(v),
+ # Datatype of mask matrix elements
+ "MType": mask_dtype.name,
+ # Datatype of attention weights elements
+ "AType": model.get_tensor_datatype(lhs[0].output[0]),
+ # Datatype of output elements
+ "OType": model.get_tensor_datatype(last.output[0]),
+
+ # Datatype of accumulator elements of the first matmul
+ "AccQKMatMul": model.get_tensor_datatype(lhs[-1].output[0]),
+ # Datatype of output elements of the first matmul
+ # Note: Can be extracted from the left hand side
+ # intermediate outputs
+ "OutQKMatMul": model.get_tensor_datatype(out_qk_matmul),
+ # Activation function type following the first matmul
+ "ActQKMatMul": act_op_type_str(act_qk_matmul),
+ # Output bias to be applied to the thresholding activation
+ # following the Query x Key multiplication
+ "BiasActQKMatMul": out_bias(act_qk_matmul),
+
+ # Datatype of accumulator elements of the second matmul
+ "AccAVMatMul": model.get_tensor_datatype(node.output[0]),
+ # Datatype of output elements of the second matmul
+ # Note: Always the same as the OType
+ "OutAVMatMul": model.get_tensor_datatype(last.output[0]),
+ # Activation function type following the second matmul
+ "ActAVMatMul": act_op_type_str(act_av_matmul),
+ # Output bias to be applied to the thresholding activation
+ # following the Attention x Value multiplication
+ "BiasActAVMatMul": out_bias(act_av_matmul),
+
+ # Softmax may be preceded by a de-quantizer scalar
+ # multiplication
+ "DequantSoftmax": dequant_softmax,
+ # Datatype of softmax normalization before applying
+ # activation or type cast. This is called Acc to stick to
+ # the naming scheme of the MatMul operators before.
+ # Note: Currently this is ALWAYS floats + "AccASoftmax": "FLOAT32", + # Activation function type following the softmax + # normalization of the attention weights + "ActASoftmax": act_op_type_str(act_a_softmax), + # Output bias to be applied to the thresholding activation + # following the softmax normalization of the attention + # weights + "BiasActASoftmax": out_bias(act_a_softmax), + } + + # Converts QONNX datatypes to their name (as a string) + def maybe_name(value): + # All QONNX datatypes are instances of the BaseDataType + if isinstance(value, BaseDataType): + # Convert to the name by referring to the datatypes name + # attribute + return value.name + # Everything else is just assumed to be in the right format + return value + + # Convert all node attributes DataTypes to string + # representations of their names + node_attrs = { + key: maybe_name(value) for key, value in node_attrs.items() + } + + # Create a new custom node replacing the scaled dot-product + # attention pattern + attention = oh.make_node(**kwargs, **node_attrs) + # Insert the new node into the graph + graph.node.insert(index, attention) + # Collect all nodes comprising the original pattern + nodes = [node, transpose, *lhs, act_av_matmul] + # Remove all nodes of the original pattern + for n in nodes: + # Do not try to remove non-existing nodes + if n is not None: + graph.node.remove(n) + # The graph has been modified + graph_modified = True + # After rewiring need to re-do the shape annotations + model = model.transform(InferShapes()) # noqa: Shadows model + # As attention mask datatype might have been changed, it might be + # necessary to re-do the datatype annotations + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified + + +# Absorbs a MultiThreshold into ScaledDotProductAttention if there is not +# already an activation included +class AbsorbMultiThresholdIntoScaledDotProductAttention(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Any MultiThreshold is a candidate node + if node.op_type == "MultiThreshold": + # Cannot be a join-node + if model.is_join_node(node): + # Softly skip transforming this node + continue + # Now we know there is only one producer operation preceding the + # multi-threshold node + attention = model.find_direct_predecessors(node) + # The first node in the graph might have no predecessor + if attention is None: + # Skip this node + continue + # Unpack the single predecessor from the list + attention = attention[0] + # Predecessor must actually be a ScaledDotProductAttention for + # this transform to apply + if not attention.op_type == "ScaledDotProductAttention": + # Skip transforming this instance, probably no need to warn + continue + # The attention operation may not fork for this transformation + # to be applicable + if model.is_fork_node(attention): + # Softly skip transforming this, will result in standalone + # thresholds + continue + + # Check whether the attention operation already has an output + # activation + if getCustomOp(attention).get_nodeattr("ActAVMatMul") != "none": + # Issue a warning to make the user aware of this mismatch + 
# pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f" {attention.name} already has an activation:" + f" {get_by_name(attention.attribute, 'ActAVMatMul').s}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Datatype of the thresholding output, which will be the new + # output datatype of the attention operator + dtype = getCustomOp(node).get_nodeattr("out_dtype") + # Output bias after the thresholding, needs to be absorbed into + # the attention operator as well + out_bias = getCustomOp(node).get_nodeattr("out_bias") + + # Collect new attributes + attrs = { + # Datatype of output elements of the second matmul + # Note: Always the same as the OType + "OutAVMatMul": dtype, + # Attention operator output type must be the same as the + # output type of the last matmul + "OType": dtype, + # Activation function type following the second matmul + "ActAVMatMul": "thresholds", + # Output bias to be applied to the thresholding activation + # following the Attention x Value multiplication + "BiasActAVMatMul": out_bias, + } + + # Run over all attributes to be changed + for key, value in attrs.items(): + # Remove the existing attribute + remove_by_name(attention.attribute, key) + # Insert a new attribute with the same name + attention.attribute.append(oh.make_attribute(key, value)) + + # Append the new threshold tensor as the last input + attention.input.append(node.input[1]) + # Annotate the new thresholds tensor datatype + model.set_tensor_datatype( + node.input[1], model.get_tensor_datatype(node.input[0]) + ) + # Rewire the output of the attention operator to skip the + # thresholds node + attention.output[0] = node.output[0] + # Remove the thresholding node + graph.node.remove(node) + # The graph has been modified + graph_modified = True + # Break the loop after adding and removing nodes to start over + # with a clean index + break + # After rewiring need to re-do the shape annotations + model = model.transform(InferShapes()) # noqa: Shadows model + # As attention mask datatype might have been changed, it might be + # necessary to re-do the datatype annotations + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified diff --git a/src/finn/transformation/fpgadataflow/attention_heads.py b/src/finn/transformation/fpgadataflow/attention_heads.py new file mode 100644 index 0000000000..6d97a4bfe6 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/attention_heads.py @@ -0,0 +1,782 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. 
+ +# Make copies and deep copies of python objects +import copy + +# Need numpy for modifying the onnx graph tensors, which are numpy style arrays +import numpy as np + +# Output warning messages +import warnings + +# Utility for handling ONNX nodes and tensors +from onnx import NodeProto +from onnx import helper as oh + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class +from qonnx.transformation.base import Transformation + +# QONNX graph transformations for renaming and cleaning up +from qonnx.transformation.general import GiveUniqueParameterTensors + +# Transformation running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Transformation running onnx shape inference +from qonnx.transformation.infer_shapes import InferShapes + +# Gets items from protobuf by name +from qonnx.util.basic import get_by_name, remove_by_name + +# Utility function for transforming ONNX graphs +from finn.transformation.util import ( + is_reshape_transpose, + is_transpose_reshape, + op_types, +) + + +# Infers reshaping of attention heads, i.e., converts the Reshape and transpose +# patterns to the SplitMultiHeads and MergeMultiHeads hardware custom operators. +class InferMultiHeads(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Head-slicing reshaping is triggered by detecting a reshape + # operation followed by a transpose + if is_reshape_transpose(node, model): + # Get the single successor node + transpose = model.find_direct_successors(node)[0] + + # Get the input and output tensor names to the pattern + inp = node.input[0] + mid = node.output[0] + end = transpose.output[0] + + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + shape = model.get_tensor_shape(inp) + # Determine the rank of the input tensor to support batched and + # non-batched inputs + rank = len(shape) + + # The input shape determines the sequence length + seq, _, dim = shape if (rank == 3) else (shape[0], 1, shape[1]) + + # The intermediate shape must be the same as specified as the + # second input to the reshape operation + assert (model.get_tensor_shape(mid) # noqa + == model.get_initializer(node.input[1])).all() # noqa + # Expected layout after reshape is "head last" + _, heads, _ = model.get_tensor_shape(mid) + + # Get the (optional) permutation indices of the transpose in + # case it is a multi-axis transpose + perm = get_by_name(transpose.attribute, "perm") + # Convert permutation indices to list of integers if it is + # given + perm = perm.ints if perm is not None else None + + # Transpose must either keep or flip the sequence and embedding + # dimensions + if perm not in [[1, 0, 2], [1, 2, 0]]: + # Issue a warning of near match of the supported head + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported permutation near {transpose.name}: {perm}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Check whether the transpose only permutes to head first or + # additionally transposes sequence and embedding dimension as + # well + 
keep_transpose = (perm == [1, 2, 0]) + + # Start assuming there is no middle node, as the transpose is + # removed + maybe_mid = end + + # Insert a new transpose node if the sequence and embedding + # dimensions are flipped + if keep_transpose: + # Construct a new intermediate tensor using the current one + # as template + maybe_mid = mid + # Construct a new Transpose with attributes inferred from + # the detected graph patter + new_transpose = oh.make_node(**{ + "op_type": "Transpose", + # Named inputs extracted from the graph pattern + "inputs": [maybe_mid], + # Named outputs extracted from the graph pattern + "outputs": [end], + # Give node name derived from the operator type and the + # name of the triggering node to be removed + "name": f"MultiHeads_Transpose_{node.name}", + # Permute the last two dimensions + "perm": [0, 2, 1] + }) + # Insert the new node into the graph + graph.node.insert(index + 1, new_transpose) + # Change the shape of the intermediate tensor to reflect + # partial reshaping + model.set_tensor_shape( + maybe_mid, (heads, seq, dim // heads) + ) + + # Fixed node attributes and extracted input/output/initializer + # tensor names + kwargs = { + # Refer to this operator type by its name + "op_type": "SplitMultiHeads", + # Execution will try to look up the implementation in the + # package referred to by the domain + "domain": "finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from + # HLSCustomOp + "backend": "fpgadataflow", + # Named inputs extracted from the graph pattern + "inputs": [inp], + # Named outputs extracted from the graph pattern + "outputs": [maybe_mid], + # Give node name derived from the operator type and the name + # of the triggering node to be removed + "name": f"SplitMultiHeads_{node.name}", + # Number of attention heads inferred + "heads": heads, + # Inferred multi-heads produce packed tensors + "packed": True, + # Datatype of inputs and outputs + "dtype": model.get_tensor_datatype(node.input[0]).name, + # Number of input elements, i.e., embedding dimension + "num_elems": dim, + # Number of embeddings in the whole input sequence/feature + # map + "num_inputs": [seq, 1] if (rank == 3) else [seq] + } + + # Create a new custom node replacing the multi head reshape + heads = oh.make_node(**kwargs) + # Insert the new node into the graph + graph.node.insert(index, heads) + # Collect all nodes comprising the original pattern + nodes = [node, transpose] + # Remove all nodes of the original pattern + for n in nodes: + # Do not try to remove non-existing nodes + if n is not None: + graph.node.remove(n) + # The graph has been modified + graph_modified = True + + # Head-merging reshaping is triggered by detecting a transpose + # operation followed by a reshape + if is_transpose_reshape(node, model): + # Get the single successor node + reshape = model.find_direct_successors(node)[0] + + # Get the input and output tensor names to the pattern + inp = node.input[0] + end = reshape.output[0] + + # The input shape determines the heads, sequence length and + # embedding dimension + heads, seq, dim = model.get_tensor_shape(inp) + + # Get the (optional) permutation indices of the transpose in + # case it is a multi-axis transpose + perm = get_by_name(node.attribute, "perm") + # Convert permutation indices to list of integers if it is given + perm = perm.ints if perm is not None else None + + # Transpose must flip the heads and sequence dimensions + if perm not in [[1, 0, 2]]: + # Issue a warning of near match of the supported head + # 
pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported permutation near {node.name}: {perm}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Shape of the final output of the operator pattern + out_shape = model.get_tensor_shape(end) + + # The output of the reshape must be the same as specified as the + # second input to the reshape operation + assert (out_shape # noqa + == model.get_initializer(reshape.input[1])).all() + + # The final output shape must match the expectation of + # reintegrating the heads back into the embeddings + if out_shape not in [[seq, heads * dim], [seq, 1, heads * dim]]: + # Issue a warning to make the user aware of this mismatch + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Output shape mismatch near: {reshape.name}" + ) + # @formatter:on + # Skip transforming this instance + continue + + # Fixed node attributes and extracted input/output/initializer + # tensor names + kwargs = { + # Refer to this operator type by its name + "op_type": "MergeMultiHeads", + # Execution will try to look up the implementation in the + # package referred to by the domain + "domain": "finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from + # HLSCustomOp + "backend": "fpgadataflow", + # Named inputs extracted from the graph pattern + "inputs": [inp], + # Named outputs extracted from the graph pattern + "outputs": [end], + # Give node name derived from the operator type and the name + # of the triggering node to be removed + "name": f"MergeMultiHeads_{node.name}", + # Number of attention heads inferred + "heads": heads, + # Remember, whether the output needs to be squeezed + "squeezed": out_shape == [seq, heads * dim], + # Inferred multi-heads produce packed tensors + "packed": True, + # Datatype of inputs and outputs + "dtype": model.get_tensor_datatype(node.input[0]).name, + # Number of input elements, i.e., embedding dimension + "num_elems": dim, + # Number of embeddings in the whole input sequence/feature + # map + "num_inputs": [heads, seq], + } + + # Create a new custom node replacing the multi head reshape + heads = oh.make_node(**kwargs) + # Insert the new node into the graph + graph.node.insert(index, heads) + # Collect all nodes comprising the original pattern + nodes = [node, reshape] + # Remove all nodes of the original pattern + for n in nodes: + # Do not try to remove non-existing nodes + if n is not None: + graph.node.remove(n) + # The graph has been modified + graph_modified = True + # After rewiring need to re-do the shape annotations + model = model.transform(InferShapes()) # noqa: Shadows from outer scope + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified + + +# Move SplitMultiHeads operation past MultiThreshold operation. This is required +# as a precondition for later unrolling the attention heads, as there may not be +# any other operations between splitting and merging the attention heads, +# besides the actual attention operator. 
+class MoveSplitMultiHeadsPastMultiThreshold(Transformation):
+ # Applies the transform to a whole model graph
+ def apply(self, model: ModelWrapper): # noqa
+ # Get the model graph out of the model wrapper object
+ graph = model.graph
+ # Keep track of whether the graph has been modified
+ graph_modified = False
+ # Iterate all nodes in the graph keeping track of the index
+ for index, node in enumerate(graph.node):
+ # Transformation applies to SplitMultiHeads operation (not Merge)
+ if node.op_type == "SplitMultiHeads":
+ # Slicing should not fork or join
+ if model.is_fork_node(node) or model.is_join_node(node):
+ # Issue a warning to make the user aware of this mismatch
+ # pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Slicing may not join or fork: {node.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+ # Now we know there is only one consumer operation following the
+ # slice node
+ thresholds_node = model.find_direct_successors(node)[0] # noqa
+ # Successor must actually be a MultiThreshold for this
+ # transform to apply
+ if not thresholds_node.op_type == "MultiThreshold":
+ # Skip transforming this instance, probably no need to warn
+ continue
+
+ # Thresholds should not fork or join either
+ if (model.is_fork_node(thresholds_node)
+ or model.is_join_node(thresholds_node)):
+ # Issue a warning to make the user aware of this mismatch
+ # pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"MultiThreshold may not join or fork:"
+ f" {thresholds_node.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+
+ # Get the thresholds tensor, which must be an initializer at
+ # the second input
+ thresholds = model.get_initializer(thresholds_node.input[1])
+ # This is indeed an error, no way to recover from this, so
+ # assertion is fine
+ assert thresholds is not None, \
+ f"Missing threshold tensor for {thresholds_node.name}"
+
+ # The slice node should have an attribute specifying the number
+ # of heads
+ heads = get_by_name(node.attribute, "heads")
+ # Heads must be present, otherwise this is an error
+ assert heads is not None, \
+ f"Missing number of heads for {node.name}"
+ # Convert heads attribute proto to integer
+ heads = heads.i
+
+ # Repeat the thresholds for each head along the channel
+ # dimension
+ thresholds = np.concatenate(heads * [thresholds])
+ # Update the thresholds tensor to simply repurpose the existing
+ # node
+ model.set_initializer(thresholds_node.input[1], thresholds)
+
+ # Get names of all tensors involved in connecting the nodes
+ inp = node.input[0]
+ mid = node.output[0]
+ out = thresholds_node.output[0]
+
+ # The middle tensor is now produced by the multi-threshold,
+ # which does not change the shape. Propagate the shape of the
+ # input tensor
+ model.set_tensor_shape(mid, model.get_tensor_shape(inp))
+ # As the middle tensor is now produced by the multi-threshold,
+ # the datatype needs to be taken from the output tensor
+ model.set_tensor_datatype(mid, model.get_tensor_datatype(out))
+ # Remove the datatype attribute before setting the new
+ # datatype
+ remove_by_name(node.attribute, "dtype")
+ # Insert new datatype attribute
+ node.attribute.append(
+ oh.make_attribute(
+ "dtype", model.get_tensor_datatype(out).name
+ )
+ )
+
+ # Rewire the nodes locally switching order. Reuses all the
+ # existing tensors.
+ thresholds_node.input[0] = inp
+ thresholds_node.output[0] = mid
+ node.input[0] = mid
+ node.output[0] = out
+
+ # Graph has been modified, required additional transformations
+ # to be run
+ graph_modified = True
+ # After rewiring need to re-do the shape annotations
+ model = model.transform(InferShapes()) # noqa: Shadows from outer scope
+ # Return the transformed model and indicate whether the graph actually
+ # has been transformed
+ return model, graph_modified
+
+
+# Move MergeMultiHeads operation past MultiThreshold operation to avoid merging
+# excessively large streams and maybe even allow absorbing the thresholds into
+# the attention operator.
+class MoveMergeMultiHeadsPastMultiThreshold(Transformation):
+ # Applies the transform to a whole model graph
+ def apply(self, model: ModelWrapper): # noqa
+ # Get the model graph out of the model wrapper object
+ graph = model.graph
+ # Keep track of whether the graph has been modified
+ graph_modified = False
+ # Iterate all nodes in the graph keeping track of the index
+ for index, node in enumerate(graph.node):
+ # Transformation applies to MergeMultiHeads operation
+ if node.op_type == "MergeMultiHeads":
+ # Merging should not fork, but it may join
+ if model.is_fork_node(node):
+ # Issue a warning to make the user aware of this mismatch
+ # pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"Merging may not fork: {node.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+ # Now we know there is only one consumer operation following the
+ # merge node
+ thresholds_node = model.find_direct_successors(node)[0] # noqa
+ # Successor must actually be a MultiThreshold for this
+ # transform to apply
+ if not thresholds_node.op_type == "MultiThreshold":
+ # Skip transforming this instance, probably no need to warn
+ continue
+
+ # Thresholds must not fork or join either
+ if (model.is_fork_node(thresholds_node)
+ or model.is_join_node(thresholds_node)):
+ # Issue a warning to make the user aware of this mismatch
+ # pattern
+ # @formatter:off
+ warnings.warn(
+ f"{self.__class__.__name__}: Skipping near match: "
+ f"MultiThreshold may not join or fork:"
+ f" {thresholds_node.name}"
+ )
+ # @formatter:on
+ # Skip transforming this instance
+ continue
+
+ # Get the thresholds tensor, which must be an initializer at
+ # the second input
+ thresholds = model.get_initializer(thresholds_node.input[1])
+ # This is indeed an error, no way to recover from this, so
+ # assertion is fine
+ assert thresholds is not None, \
+ f"Missing threshold tensor for {thresholds_node.name}"
+
+ # The merge node should have an attribute specifying the number
+ # of heads
+ heads = get_by_name(node.attribute, "heads")
+ # Heads must be present, otherwise this is an error
+ assert heads is not None, \
+ f"Missing number of heads for {node.name}"
+ # Convert heads attribute proto to integer
+ heads = heads.i
+
+ # Split the thresholds for each head along the channel dimension
+ # Note: This is a list of thresholds per head now
+ thresholds = np.split(thresholds, heads)
+
+ # Need to insert a new thresholding operation at each input of
+ # the multi-head merging
+ for i, inp in enumerate(node.input):
+ # Start by making a full copy of the original thresholds
+ # node
+ new_thresholds = copy.deepcopy(thresholds_node)
+ # The input to the original merging node becomes the first
+ # input to the new thresholds node
+ new_thresholds.input[0] = inp
+ # Create a new input tensor name for the
thresholds + new_thresholds.input[1] = model.make_new_valueinfo_name() + # Annotate the new thresholds input with the new shape of + # the split thresholds + model.set_tensor_shape( + new_thresholds.input[1], thresholds[i].shape + ) + # Set the initializer input to the split thresholds + model.set_initializer( + new_thresholds.input[1], thresholds[i] + ) + # Create a new output tensor name + new_thresholds.output[0] = model.make_new_valueinfo_name() + # Annotate the new output with the shape of the input + model.set_tensor_shape( + new_thresholds.output[0], model.get_tensor_shape(inp) + ) + # Connect the new output tensor to the corresponding input + # of the merge node + node.input[i] = new_thresholds.output[0] + # Connect the output of the merging node to successor of the + # original thresholding node + node.output[0] = thresholds_node.output[0] + # Insert the thresholding node into the graph + graph.node.insert(index + i - 1, new_thresholds) + # Remove the original thresholds node + graph.node.remove(thresholds_node) + # Graph has been modified, required additional transformations + # to be run + graph_modified = True + # Break the loop after adding and removing nodes to start over + # with a clean index + break + # After rewiring need to re-do the shape annotations + model = model.transform(InferShapes()) # noqa: Shadows from outer scope + # Re-do the datatype annotations after inserting new tensors without and + # moving tensors with existing annotations + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified + + +# Detects multi-head attention pattern, i.e., scaled dot-product attention +# between head splitting and merging +def is_multi_head_attention(node: NodeProto, model: ModelWrapper): # noqa + # The anchor node must be scaled dot product attention + if node.op_type == "ScaledDotProductAttention": + # Get the nodes feeding the attention operation + predecessors = model.find_direct_predecessors(node) + # There must be exactly three predecessors of type head-splitting + # Note: there must be nothing in between splitting and the attention + # itself + if op_types(predecessors) == 3 * ["SplitMultiHeads"]: + # Get the node fed by the attention operation + successors = model.find_direct_successors(node) + # There must be exactly onde successor of type head-merging + # Note: there must be nothing in between attention and the merging + if op_types(successors) == 1 * ["MergeMultiHeads"]: + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + shape = model.get_tensor_shape(node.input[0]) + # Determine the rank of the input tensor to support batched and + # non-batched inputs + rank = len(shape) + # The input shape determines the sequence length + heads, _, _ = shape if (rank == 3) else (1, shape[0], shape[1]) + # Pattern detected, if there are actually multiple heads + return heads > 1 + # Pattern not detected + return False + + +# Unrolls multiple attention heads in the onnx graph to be implemented in +# parallel +class UnrollMultiHeadAttention(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # 
Apply transformation to nodes which match the multi-head attention
+ # pattern
+ if is_multi_head_attention(node, model):
+ # Get the splitting nodes feeding the attention operation
+ split0, split1, split2 = model.find_direct_predecessors(node)
+ # Get the single merging node
+ merge0, = model.find_direct_successors(node)
+ # Get the number of heads produced by an arbitrary splitter
+ heads = get_by_name(split0.attribute, "heads").i
+ # Get the number of input elements to the heads splitting
+ # Note: Embedding dims might actually differ per input stream,
+ # e.g., for cross-attention
+ dim0 = get_by_name(split0.attribute, "num_elems").i
+ dim1 = get_by_name(split1.attribute, "num_elems").i
+ dim2 = get_by_name(split2.attribute, "num_elems").i
+ # Get the number of input features per splitting
+ # Note: Feature map sizes might actually differ per input
+ # stream, e.g., for cross-attention
+ ins0 = get_by_name(split0.attribute, "num_inputs").ints
+ ins1 = get_by_name(split1.attribute, "num_inputs").ints
+ ins2 = get_by_name(split2.attribute, "num_inputs").ints
+ # Validate the number of heads matches between all slice and
+ # merge nodes
+ for n in [split0, split1, split2, merge0]:
+ # All heads must match, otherwise this is a failure from
+ # which we cannot recover
+ assert get_by_name(n.attribute, "heads").i == heads, \
+ f"Differing number of heads at {node.name} and {n.name}"
+ # Remove the original node from the graph
+ graph.node.remove(n)
+
+ # TODO: Clean up the following code
+
+ # Create replicas of the splitting nodes with expanded output
+ # list
+ split0 = oh.make_node(
+ # Refer to this operator type by its name
+ op_type="SplitMultiHeads",
+ # Execution will try to look up the implementation in the
+ # package referred to by the domain
+ domain="finn.custom_op.fpgadataflow",
+ # Execution backend: Required attribute inherited from
+ # HLSCustomOp
+ backend="fpgadataflow",
+ # Connect to the same input as the original
+ inputs=split0.input,
+ # Generate new output tensor names for each head
+ outputs=[
+ model.make_new_valueinfo_name() for _ in range(heads)
+ ],
+ # Attribute specifying the number of heads
+ heads=heads,
+ # Unrolled heads do not produce packed tensors
+ packed=False,
+ # Datatype of inputs and outputs
+ dtype=get_by_name(split1.attribute, "dtype").s,
+ # Number of input elements, i.e., embedding dimension
+ num_elems=dim0,
+ # Number of embeddings in the whole input sequence/feature
+ # map
+ num_inputs=[*ins0]
+ )
+ split1 = oh.make_node(
+ # Refer to this operator type by its name
+ op_type="SplitMultiHeads",
+ # Execution will try to look up the implementation in the
+ # package referred to by the domain
+ domain="finn.custom_op.fpgadataflow",
+ # Execution backend: Required attribute inherited from
+ # HLSCustomOp
+ backend="fpgadataflow",
+ # Connect to the same input as the original
+ inputs=split1.input,
+ # Generate new output tensor names for each head
+ outputs=[
+ model.make_new_valueinfo_name() for _ in range(heads)
+ ],
+ # Attribute specifying the number of heads
+ heads=heads,
+ # Unrolled heads do not produce packed tensors
+ packed=False,
+ # Datatype of inputs and outputs
+ dtype=get_by_name(split1.attribute, "dtype").s,
+ # Number of input elements, i.e., embedding dimension
+ num_elems=dim1,
+ # Number of embeddings in the whole input sequence/feature
+ # map
+ num_inputs=[*ins1]
+ )
+ split2 = oh.make_node(
+ # Refer to this operator type by its name
+ op_type="SplitMultiHeads",
+ # Execution will try to look up the implementation in
the + # package referred to by the domain + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from + # HLSCustomOp + backend="fpgadataflow", + # Connect to the same input as the original + inputs=split2.input, + # Generate new output tensor names for each head + outputs=[ + model.make_new_valueinfo_name() for _ in range(heads) + ], + # Attribute specifying the number of heads + heads=heads, + # Unrolled heads do not produce packed tensors + packed=False, + # Datatype of inputs and outputs + dtype=get_by_name(split2.attribute, "dtype").s, + # Number of input elements, i.e., embedding dimension + num_elems=dim2, + # Number of embeddings in the whole input sequence/feature + # map + num_inputs=[*ins2] + ) + # Create replica of the merging node with expanded input list + merge0 = oh.make_node( + # Refer to this operator type by its name + op_type="MergeMultiHeads", + # Execution will try to look up the implementation in the + # package referred to by the domain + domain="finn.custom_op.fpgadataflow", + # Execution backend: Required attribute inherited from + # HLSCustomOp + backend="fpgadataflow", + # Generate new input tensor names for each head + inputs=[ + model.make_new_valueinfo_name() for _ in range(heads) + ], + # Connect to the same input as the original + outputs=merge0.output, + # Attribute specifying the number of heads + heads=heads, + # Attribute specifying whether the output needs to be + # squeezed + squeezed=get_by_name(merge0.attribute, "squeezed").i, + # Unrolled heads do not produce packed tensors + packed=False, + # Datatype of inputs and outputs + dtype=get_by_name(merge0.attribute, "dtype").s, + # Number of input elements, i.e., embedding dimension + num_elems=get_by_name(merge0.attribute, "num_elems").i, + # Number of embeddings in the whole input sequence/feature + # map + # Note: Drop head-first head dimension of previously packed + # input + num_inputs=get_by_name( + merge0.attribute, "num_inputs").ints[1:] + ) + + # Replicate the attention operator for each head + for i in range(heads): + # Start by making a full copy of the original node + attention = copy.deepcopy(node) + # Get the original shape of each input to remove the head + # number + _, seq, dim = model.get_tensor_shape(attention.input[0]) + model.set_tensor_shape(split0.output[i], (1, seq, dim)) + _, seq, dim = model.get_tensor_shape(attention.input[1]) + model.set_tensor_shape(split1.output[i], (1, seq, dim)) + _, seq, dim = model.get_tensor_shape(attention.input[2]) + model.set_tensor_shape(split2.output[i], (1, seq, dim)) + + # Propagate the original datatype to each of the head inputs + dtype = model.get_tensor_datatype(attention.input[0]) + model.set_tensor_datatype(split0.output[i], dtype) + dtype = model.get_tensor_datatype(attention.input[1]) + model.set_tensor_datatype(split1.output[i], dtype) + dtype = model.get_tensor_datatype(attention.input[2]) + model.set_tensor_datatype(split2.output[i], dtype) + + # Connect the inputs of the replica to the output of each + # of the new slice operators + attention.input[0] = split0.output[i] + attention.input[1] = split1.output[i] + attention.input[2] = split2.output[i] + + # Get the original shape the output to remove the head + # number + _, seq, dim = model.get_tensor_shape(attention.output[0]) + model.set_tensor_shape(merge0.input[i], (1, seq, dim)) + + # Propagate the original datatype to each of the head + # outputs + dtype = model.get_tensor_datatype(attention.output[0]) + 
model.set_tensor_datatype(merge0.input[i], dtype) + + # Connect the output of the attention replica to the input + # of the new merge operator + attention.output[0] = merge0.input[i] + # Insert the new node into the graph + graph.node.insert(index + i + 1, attention) + # Insert the new slice and merge nodes into the graph + for i, n in enumerate([split0, split1, split2, merge0]): + # Insert the new node into the graph at index offset by + # number of heads + graph.node.insert(index + heads + i + 1, n) + # Remove the original attention operator from the graph + graph.node.remove(node) + # The graph has been modified, needs to be reported back to the + # caller + graph_modified = True + # After rewiring need to re-do the shape annotations + model = model.transform(InferShapes()) # noqa: Shadows model + # By replicating the attention operator, multiple instances refer to the + # same initializer, replace these by a unique one for each head + model = model.transform(GiveUniqueParameterTensors()) + # Return the transformed model and indicate whether the graph actually + # has been transformed + return model, graph_modified diff --git a/src/finn/transformation/fpgadataflow/replicate_stream.py b/src/finn/transformation/fpgadataflow/replicate_stream.py new file mode 100644 index 0000000000..fa7fd6a275 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/replicate_stream.py @@ -0,0 +1,110 @@ +# fmt: off +# Disable formatter. This is deliberately formatted to stay within 80 characters +# per line. Black, however, formats some lines going beyond this. + +# Utility for handling ONNX nodes and tensors +from onnx import TensorProto +from onnx import helper as oh + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# QONNX graph transformation base class +from qonnx.transformation.base import Transformation + +# Transformations running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Transformation running onnx shape inference +from qonnx.transformation.infer_shapes import InferShapes + + +# Inserts the ReplicateStream hardware operator on tensors with multiple +# consumers +class InferReplicateStream(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Check each output of the node, as there might be multiple distinct + # outputs, each feeding multiple consumers + for out in node.output: + # Get the list of all consumers of this output tensor + consumers = model.find_consumers(out) + # No need to replicate if there is just one or no consumer + if consumers is None or len(consumers) <= 1: + # Check next output tensor + continue + # Ok, now we have multiple consumers of a single output tensor + # which requires streams to be replicated for HLS synthesis + # Get the shape of the original output tensor + out_shape = model.get_tensor_shape(out) + # Generate a list of unique replicas of the output tensor, one + # for each consumer + replicas = [model.make_new_valueinfo_name() for _ in consumers] + # Create an instance of the ReplicateStream operator for this + # output + replicate_stream = oh.make_node( + # Name of the operator class as it can be found within FINN + "ReplicateStream", + # Execution 
will try to look up the implementation in the
+ # package referred to by the domain
+ domain="finn.custom_op.fpgadataflow",
+ # Execution backend: Required attribute inherited from
+ # HLSCustomOp
+ backend="fpgadataflow",
+ # Connect to the original output tensor
+ inputs=[out],
+ # Connect to a unique output tensor for each consumer
+ outputs=replicas,
+ # The operator needs to know the number of replicas as an
+ # attribute
+ num=len(replicas),
+ # Number of input elements in the last dimension
+ num_elems=out_shape[-1],
+ # Number of elements to process in parallel: default fully
+ # sequential
+ PE=1,
+ # Number of inputs to be processed sequentially
+ num_inputs=out_shape[:-1],
+ # Infer the datatype from the original output
+ dtype=model.get_tensor_datatype(out).name,
+ # Derive a node name based on the original node name
+ name=f"ReplicateStream_{node.name}"
+ )
+ # Insert the replicate operator into the graph right behind the
+ # current node
+ graph.node.insert(index + 1, replicate_stream)
+ # Need to modify each consumer to have the replica as input
+ for replica, consumer in zip(replicas, consumers):
+ # Properly construct a value info object for the new tensor
+ # replica
+ model.graph.value_info.append(oh.make_tensor_value_info(
+ replica, TensorProto.FLOAT, out_shape
+ ))
+ # Find the first input of the consumer corresponding to the
+ # original output tensor
+ for i, inp in enumerate(consumer.input):
+ # Check whether this input is the original output
+ if inp == out:
+ # Connect this input to the replica of the output
+ consumer.input[i] = replica
+ # Break here as multiple inputs to the node might
+ # connect to the original output, but each gets its
+ # own replica.
+ break
+ # The graph has been modified, needs to be reported back to the
+ # caller
+ graph_modified = True
+ # After rewiring need to re-do the shape annotations
+ model = model.transform(InferShapes()) # noqa: Shadows model
+ # As new tensor value infos have been inserted, it is necessary to re-do
+ # the datatype annotations
+ model = model.transform(InferDataTypes())
+ # Return the transformed model and indicate whether the graph actually
+ # has been transformed
+ return model, graph_modified
diff --git a/src/finn/transformation/util.py b/src/finn/transformation/util.py
new file mode 100644
index 0000000000..b749c83288
--- /dev/null
+++ b/src/finn/transformation/util.py
@@ -0,0 +1,124 @@
+# fmt: off
+# Disable formatter. This is deliberately formatted to stay within 80 characters
+# per line. Black, however, formats some lines going beyond this.
+ +# Protobuf onnx graph node type +from onnx import NodeProto +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + + +# Tests whether a node is a join-node MatMul operation, i.e., a MatMul with two +# runtime inputs but no weights initializers +def is_join_matmul(node: NodeProto, model: ModelWrapper): # noqa + # Only handle existing MatMul type nodes + if node is not None and node.op_type in {"MatMul"}: + # No input must have an initializer + return all(model.get_initializer(i) is None for i in node.input) + # Did not match the operator type + return False + + +# Tests whether a node is a MatMul operator +def is_matmul(node: NodeProto): + # Node must exist and be of type MatMul + return node is not None and node.op_type in {"MatMul"} + + +# Tests whether a node is a Softmax operator +def is_softmax(node: NodeProto): + # Node must exist and be of type Softmax + return node is not None and node.op_type in {"Softmax"} + + +# Tests whether a node is an element-wise Mul +def is_mul(node: NodeProto): + # Node must exist and be of type Mul + return node is not None and node.op_type in {"Mul"} + + +# Tests whether a node is an element-wise Add +def is_add(node: NodeProto): + # Node must exist and be of type Add + return node is not None and node.op_type in {"Add"} + + +def is_end(node: NodeProto, model: ModelWrapper): # noqa + return node is not None and not model.find_direct_predecessors(node) + + +# Follow all input branches of a node until reaching a matmul +def all_upstream_to_matmul(node: NodeProto, model: ModelWrapper): # noqa + # Check whether the node is either a matmul node or the end of the graph + def is_matmul_or_end(n: NodeProto): + return is_matmul(n) or is_end(n, model) + + # Enumerate all inputs and collect everything upstream until finding the + # next matmul operation + return (model.find_upstream(i, is_matmul_or_end, True) for i in node.input) + + +# Projects a list of ONNX graph nodes to the string representation of the +# operator types +def op_types(nodes: list[NodeProto]) -> list[str]: + return [node.op_type if node is not None else "None" for node in nodes] + + +# Tests whether a node is a Reshape operator +def is_reshape(node: NodeProto): + return node is not None and node.op_type in {"Reshape"} + + +# Tests whether a node is a Transpose operator +def is_transpose(node: NodeProto): + return node is not None and node.op_type in {"Transpose"} + + +# Tests whether a node is a Reshape-Transpose operator chain +def is_reshape_transpose(node: NodeProto, model: ModelWrapper): # noqa + # Reshape-transpose pattern detection is triggered by detecting a reshape + # operation + if is_reshape(node): + # The reshape may not be a join or fork node + if model.is_join_node(node) or model.is_fork_node(node): + # Reject detection of the pattern + return False + # Get the single successor node + transpose = model.find_direct_successors(node)[0] + # The consumer must be Transpose finalizing the reshaping + if not is_transpose(transpose): + # Reject detection of the pattern + return False + # The transpose may not fork or join either + if model.is_join_node(transpose) or model.is_fork_node(transpose): + # Reject detection of the pattern + return False + # Accept detecting the pattern + return True + # Reject detection of the pattern + return False + + +# Tests whether a node is a Transpose-Reshape operator chain +def is_transpose_reshape(node: NodeProto, model: ModelWrapper): # noqa + # Transpose-Reshape pattern detection is triggered by detecting a 
transpose + # operation + if is_transpose(node): + # The transpose may not be a join or fork node + if model.is_join_node(node) or model.is_fork_node(node): + # Reject detection of the pattern + return False + # Get the single successor node + reshape = model.find_direct_successors(node)[0] + # The consumer must be a reshape finalizing the transpose-reshape + if not is_reshape(reshape): + # Reject detection of the pattern + return False + # The reshape may not fork or join either + if model.is_join_node(reshape) or model.is_fork_node(reshape): + # Reject detection of the pattern + return False + # Accept detecting the pattern + return True + # Reject detection of the pattern + return False From 6ae63f938ce900070d4f0c78988bfe72dec0e585 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 25 Apr 2024 15:24:31 +0200 Subject: [PATCH 68/88] Add missing ModelWrapper import to reorder.py --- src/finn/transformation/streamline/reorder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 293f633df5..6ba4bf026e 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -32,6 +32,7 @@ from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper from qonnx.core.onnx_exec import execute_node from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation From 15246c8b2b668dcf146e94e5fae1222d98ef14e0 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 26 Apr 2024 15:54:02 +0200 Subject: [PATCH 69/88] [Streamline] Fix eager access to potentially empty successors list --- .../transformation/streamline/collapse_repeated.py | 12 +++++++++--- src/finn/transformation/streamline/reorder.py | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/streamline/collapse_repeated.py b/src/finn/transformation/streamline/collapse_repeated.py index 796087ffc0..db18aeed39 100644 --- a/src/finn/transformation/streamline/collapse_repeated.py +++ b/src/finn/transformation/streamline/collapse_repeated.py @@ -139,13 +139,19 @@ def apply(self, model: ModelWrapper): # noqa # Softly skip this node continue # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node)[0] + successor = model.find_direct_successors(node) # If Transpose is the final operation in the graph, there might # be no successor - if successor is None or successor.op_type != "Transpose": + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Successor must be a Transpose to be collapsed + if successor.op_type != "Transpose": # Softly skip this node continue - # Get the (optional) permutation indices of the first transpose # in case it is a multi-axis transpose perm1 = get_by_name(node.attribute, "perm") diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 6ba4bf026e..276cc40c1c 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -1264,12 +1264,15 @@ def apply(self, model: ModelWrapper): # noqa # Softly skip this node continue # As this is not a fork-node, there can be at most one successor - successor = 
model.find_direct_successors(node)[0] + successor = model.find_direct_successors(node) # If Transpose is the final operation in the graph, there might # be no successor if successor is None: # Softly skip this node continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] # Applies to elementwise add and mul operations if successor.op_type in {"Add", "Mul"}: # Get names of all tensors involved in connecting the nodes From 5eda0f6a4b6e059ad1fd21405126ccda457c15aa Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 3 May 2024 18:54:57 +0200 Subject: [PATCH 70/88] [Attention] Implement get_exp_cycles for attention-related HWCustomOps --- src/finn/custom_op/fpgadataflow/attention.py | 33 +++++++++++++++++++ .../custom_op/fpgadataflow/attention_heads.py | 14 ++++++++ .../fpgadataflow/replicate_stream.py | 7 ++++ 3 files changed, 54 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index c42c6e60d9..1027bdab74 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -709,3 +709,36 @@ def get_input_name_by_name(self, name): # Find the position of the requested input name and look up the # corresponding input name of the ONNX node return self.onnx_node.input[inputs.index(name)] + + # Derives the expected cycles for the attention operation given the folding + # configuration + def get_exp_cycles(self): + # Verify the folding configuration + assert self.is_valid_folding, \ + f"Invalid folding configuration for {self.onnx_node.name}" + # Get the input/output dimensions + qk_dim, q_len, v_dim, kv_len = self.shapes + # Get folding configuration describing how to parallelize along the + # dimensions + emb_fold, seq_fold = self.folds + # Assume perfect overlap of the constituents of the operator, i.e., of + # the buffering, both matmul and the softmax, then the expected cycles + # is the maximum over these operators + # Overall worst case cycles without any parallelization: ~ T x T x d + return max( + # Transposed keys buffer cycles + # Worst case: kv_len * qk_dim, ~ T x d + kv_len * emb_fold, + # Queries - keys matmul cycles + # Worst case: q_len * qk_dim * kv_len, ~ T x T x d + q_len * emb_fold * seq_fold, + # Softmax normalization cycles + # Worst case: q_len * kv_len, ~ T x T + q_len * seq_fold, + # Values buffer cycles + # Worst case: kv_len * v_dim, ~ T x d + kv_len * emb_fold, + # Attention weights - values matmul + # Worst case: q_len * v_dim * kv_len, ~ T x T x d + q_len * emb_fold * seq_fold + ) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 381776202b..13e1419cbb 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -325,6 +325,13 @@ def get_number_output_values(self): # N outputs per cycle... 
return np.prod(self.get_folded_output_shape()[:-1]) * self.heads + # Derives the expected cycles for the attention head splitting operation + # given the folding configuration + def get_exp_cycles(self): + # Currently, this implicitly assumes fully parallelized processing + # along the embedding dimension, i.e., always max PE + return np.prod(self.num_inputs) + # Merging of attention heads (before output projections) custom operator class MergeMultiHeads(HWCustomOp): @@ -627,3 +634,10 @@ def get_number_output_values(self): # Elements over all but the last dimension of the output folded along # the embedding dimension return np.prod(self.get_folded_output_shape()[:-1]) + + # Derives the expected cycles for the attention head merging operation given + # the folding configuration + def get_exp_cycles(self): + # Currently, this implicitly assumes fully parallelized processing + # along the embedding dimension, i.e., always max PE + return np.prod(self.num_inputs) diff --git a/src/finn/custom_op/fpgadataflow/replicate_stream.py b/src/finn/custom_op/fpgadataflow/replicate_stream.py index a2ca666e2a..b593da1c7b 100644 --- a/src/finn/custom_op/fpgadataflow/replicate_stream.py +++ b/src/finn/custom_op/fpgadataflow/replicate_stream.py @@ -286,3 +286,10 @@ def get_number_output_values(self): # outputs, i.e., producing N replica outputs per cycle in parallel, # count N outputs per cycle... return np.prod(self.get_folded_output_shape()[:-1]) * self.num + + # Derives the expected cycles for the stream replication operation given the + # folding configuration + def get_exp_cycles(self): + # Number of iterations required to process the whole folded input stream + # Note: This is all but the PE (last, parallelized) dimension + return np.prod(self.get_folded_output_shape()[:-1]) From 0b00f697136deac26cdf87fdfe3aab8b882cfcca Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 3 May 2024 19:14:42 +0200 Subject: [PATCH 71/88] Add support for ReplicateStream_hls as a PE-operation to SetFolding --- src/finn/transformation/fpgadataflow/set_folding.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index eaee499e6a..200b0a92e8 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -106,6 +106,7 @@ def apply(self, model): "GlobalAccPool_hls", "Thresholding_hls", "Thresholding_rtl", + "ReplicateStream_hls", ] # these ops use SIMD parallelism, up to a max value of NumChannels # ConvolutionInputGenerator* has a special case when depthwise=1 @@ -151,7 +152,16 @@ def apply(self, model): # increase PE until target met or reached max_pe self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in pe_ops: - max_pe = node_inst.get_nodeattr("NumChannels") + # Note: Keep original behavior for all custom-ops defining the + # NumChannels attribute as it is + try: + max_pe = node_inst.get_nodeattr("NumChannels") + # Note: Some of the recent additions do not define the + # NumChannels attribute + except AttributeError: + # We can extract the channels from the normal, i.e., not + # folded, shape of the input in these cases + max_pe = node_inst.get_normal_input_shape()[-1] self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type == "LabelSelect_hls": max_pe = node_inst.get_nodeattr("Labels") From 7fba682e3973802d3d7ba6cf3195c84950a497cb Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 10 
May 2024 17:20:32 +0200 Subject: [PATCH 72/88] [Attention] Add method to get the number of folded inputs --- src/finn/custom_op/fpgadataflow/attention.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 1027bdab74..90b0006287 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -534,7 +534,8 @@ def get_folded_input_shape(self, ind=0): # differently and would require to actually keep track of mapping # indices to optional inputs to correctly associate the folding # dimensions. - raise Exception(f"Requested shape of invalid input index {ind}") + # TODO: This is just a dummy shape + return 0, 0, 0 # Gets the shape of the output at index ind (there is just one) with folding def get_folded_output_shape(self, ind=0): # noqa, there is just one output @@ -661,6 +662,13 @@ def minimize_accumulator_width(self, model): # noqa: model is unused # each custom op instead of once after traversing the whole graph. # self.set_nodeattr("OType", AccQKMatMul.name) + # Gets the number of expected input values, i.e. how many times read() + # could/should be called on the input stream of this operator + def get_number_input_values(self, ind=0): + # Elements over all but the last dimension of the input folded along + # the embedding dimension + return np.prod(self.get_folded_input_shape(ind=ind)[:-1]) + # Gets the number of expected output values, i.e. how many times read() # could/should be called on the output stream of this operator def get_number_output_values(self): From 620705777398fd96b17fe4a04b02849e650a4f1d Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 16 May 2024 09:49:08 +0200 Subject: [PATCH 73/88] [Attention] Make use of resource type attributes for buffers and MACs Note: This is currently not controlling the memory used by the internal threshold operations and also not controlling the resource type used for implementing the floating-point operations within the softmax. These are all still handled by the tools' automatic strategy. --- src/finn/custom_op/fpgadataflow/attention.py | 14 +++++++++ .../fpgadataflow/hls/attention_hls.py | 31 +++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 90b0006287..3ca08fe03f 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -132,6 +132,20 @@ def get_nodeattr_types(self): "s", False, "python", {"", "rtlsim", "cppsim", "python"} ), + # FPGA resource type for memories/internal buffers of the operator + # Note: Currently only used for StreamTile buffers + "ram_style": ( + "s", False, "auto", {"auto", "block", "distributed", "ultra"} + ), + # FPGA resource type for memories of the thresholds parameters + # Note: Not yet used...
+ "ram_style_thresholds": ( + "s", False, "auto", {"auto", "block", "distributed", "ultra"} + ), + # FPGA resource type to implement the MAC operations of the two + # internal matmul operations + "mac_resource": ("s", False, "auto", {"auto", "lut", "dsp"}), + # Input and output FIFO depths for multi-I/O nodes # Note: Need to override here as there are three inputs "inFIFODepths": ("ints", False, [2, 2, 2]), diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index 43e4c4d7b1..9048762e18 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -353,7 +353,7 @@ def prepare_thresholds(ts, length, fold, dtype): # Generate C++ code initializing the constant mask array attention_mask = f"static const {mask_type} attention_mask = {mask}" - # Of a mask is provided as input, no object parameters need to be + # If a mask is provided as input, no object parameters need to be # generated here if self.get_nodeattr("mask_mode") == "input": # Attention mask type of input stream @@ -417,6 +417,28 @@ def hls_type(name): # given by argument list names return (f"using {name} = {hls_type(name)};" for name in names) + # Attribute specifying the memory to use for internal buffers + ram_style = self.get_nodeattr("ram_style") + # Attribute specifying the resources to use for implementing MAC + # operations + mac_resource = self.get_nodeattr("mac_resource") + + # Mapping of memory resource attributes to the corresponding C++ tag + # types + mem_resources = { + "auto": "Resource::AUTO", + "block": "Resource::BRAM", + "distributed": "Resource::LUTRAM", + "ultra": "Resources::URAM" + } + # Mapping of compute resource attributes to the corresponding C++ tag + # types + compute_resources = { + "auto": "ap_resource_dflt", + "lut": "ap_resource_lut", + "dsp": "ap_resource_dsp" + } + # Insert constants and type aliases into the dictionary self.code_gen_dict["$DEFINES$"] = [ # Shape constant definitions of attention inputs (query, key and @@ -448,6 +470,9 @@ def hls_type(name): "OutAVMatMul", "AccASoftmax" ), + # Type alias definitions for the resource type selection tags + f"using MacResource = {compute_resources[mac_resource]};", + f"using MemResource = {mem_resources[ram_style]};", # Include the activation function type definitions and parameters # Note: The typedefs in this header require the typedefs above, # thus adding this to the global includes is not possible. 
@@ -472,7 +497,9 @@ def hls_type(name): " AccAVMatMul,", " OType,", # Note: OType and last MatMul out must match " ActAVMatMul,", - " ActASoftmax", + " ActASoftmax,", + " MacResource,", + " MemResource" ">;", # Short type aliases of attention input and output streams "using QStream = Attention::QStream;", From 174c098e7c0a680d16db878b138ff2ba2c8d2d3e Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 16 May 2024 12:13:34 +0200 Subject: [PATCH 74/88] [Attention] Make use of resource type attributes for embedded thresholds --- .../fpgadataflow/hls/attention_hls.py | 90 ++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index 9048762e18..45bd253d7b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -583,17 +583,103 @@ def strm_decl(self): # Generates C++ code for calling the computation part of the operator def docompute(self): + # Mapping of memory resource attributes to the corresponding C++ HLS + # pragma directives + ram_style_thresholds = { + "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", + }[self.get_nodeattr("ram_style_thresholds")] + + # Generates the "BIND_STORAGE" pragma for the threshold activations + # threshold memory of "name" + def bind_threshold_storage(name: str): + return (f"#pragma HLS BIND_STORAGE variable={name}" + f" type=ROM_2P impl={ram_style_thresholds}") + + # Generates the ARRAY_PARTITION pragma for the threshold activations + # threshold memory of "name" and along dimension "dim" + def partition_thresholds_array(name: str, dim: int): + return (f"#pragma HLS ARRAY_PARTITION variable={name}" + f" complete dim={dim}") + + # Collect pragmas which need to be inserted into the DOCOMPUTE code + pragmas = [] + + # If there are thresholds activations following the query-key matmul, + # these need storage and array partition pragmas + if self.get_nodeattr("ActQKMatMul") == "thresholds": + # Add pragma compiler directives to the list of pragmas inserted + # into the DOCOMPUTE + pragmas.extend([ + # Partition the thresholds array along the PE (dim=1) and number + # of thresholds (dim=3) axis for parallel access + partition_thresholds_array( + "attention.qk_matmul.activation.m_thresholds", dim=1 + ), + partition_thresholds_array( + "attention.qk_matmul.activation.m_thresholds", dim=3 + ), + # Implement the thresholds array as a dual-port ROM with the + # RAM-Style selected via attribute + bind_threshold_storage( + "attention.qk_matmul.activation.m_thresholds" + ) + ]) + + # If there are thresholds activations following the attention-value + # matmul, these need storage and array partition pragmas + if self.get_nodeattr("ActAVMatMul") == "thresholds": + # Add pragma compiler directives to the list of pragmas inserted + # into the DOCOMPUTE + pragmas.extend([ + # Partition the thresholds array along the PE (dim=1) and number + # of thresholds (dim=3) axis for parallel access + partition_thresholds_array( + "attention.av_matmul.activation.m_thresholds", dim=1 + ), + partition_thresholds_array( + "attention.av_matmul.activation.m_thresholds", dim=3 + ), + # Implement the thresholds array as a dual-port ROM with the + # RAM-Style selected via attribute + bind_threshold_storage( + "attention.av_matmul.activation.m_thresholds" + ) + ]) + + # If there are thresholds activations following the softmax + # normalization, these need storage and array 
partition pragmas + if self.get_nodeattr("ActASoftmax") == "thresholds": + # Add pragma compiler directives to the list of pragmas inserted + # into the DOCOMPUTE + pragmas.extend([ + # Partition the thresholds array along the PE (dim=1) and number + # of thresholds (dim=3) axis for parallel access + partition_thresholds_array( + "attention.softmax.activation.m_thresholds", dim=1 + ), + partition_thresholds_array( + "attention.softmax.activation.m_thresholds", dim=3 + ), + # Implement the thresholds array as a dual-port ROM with the + # RAM-Style selected via attribute + bind_threshold_storage( + "attention.softmax.activation.m_thresholds" + ) + ]) + # Write the body of the attention top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ # Instantiate the attention operator and connect to the generated # threshold parameters - # Note: Assumes "Attention" to be aliased appropriate configuration - # in defines with. + # Note: Assumes "Attention" to be aliased and configured in defines # Note: Assumes parameters to be generated in 'generate_params' and # made available via include/defines before. "Attention attention {", " act_qk_matmul, act_av_matmul, act_a_softmax, dequant_softmax", "};", + # Insert some more pragmas here to be able to configure + # implementation details of components internal to "attention" + *pragmas, # Connect the attention operator to the input and output streams "attention(" f"q_{self.hls_sname()}, " From 4f7072bc2caf190ba4f5cd5d774f6cf9edfdc654 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Thu, 16 May 2024 14:24:40 +0200 Subject: [PATCH 75/88] [Attention] Add resource attribute for the attention mask in const mode --- src/finn/custom_op/fpgadataflow/attention.py | 9 ++++++-- .../fpgadataflow/hls/attention_hls.py | 23 +++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 3ca08fe03f..a934f397f8 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -107,7 +107,7 @@ def get_nodeattr_types(self): # input "DequantSoftmax": ("f", False, 1.0), # Datatype of softmax normalization before applying activation or - # type cast. THis is called Acc to stick to the naming scheme of the + # type cast. This is called Acc to stick to the naming scheme of the # MatMul operators before. # Note: Currently this is ALWAYS floats "AccASoftmax": ("s", False, "FLOAT32"), @@ -140,7 +140,12 @@ def get_nodeattr_types(self): # FPGA resource type for memories of the thresholds parameters # Note: Not yet used... 
"ram_style_thresholds": ( - "s", False, "auto", {"auto", "block", "distributed", "ultra"} + "s", False, "auto", {"auto", "block", "distributed"} + ), + # FPGA resource type for memories of the attention mask if the + # mask_mode is "const" + "ram_style_mask": ( + "s", False, "auto", {"auto", "block", "distributed"} ), # FPGA resource type to implement the MAC operations of the two # internal matmul operations diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index 45bd253d7b..621e512cdb 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -585,9 +585,15 @@ def strm_decl(self): def docompute(self): # Mapping of memory resource attributes to the corresponding C++ HLS # pragma directives - ram_style_thresholds = { + ram_styles = { "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", - }[self.get_nodeattr("ram_style_thresholds")] + } + # Convert the thresholds RAM style attribute to HLS directive + ram_style_thresholds = ram_styles[ + self.get_nodeattr("ram_style_thresholds") + ] + # Convert the attention mask RAM style attribute to HLS directive + ram_style_mask = ram_styles[self.get_nodeattr("ram_style_mask")] # Generates the "BIND_STORAGE" pragma for the threshold activations # threshold memory of "name" @@ -667,6 +673,19 @@ def partition_thresholds_array(name: str, dim: int): ) ]) + # If a constant mask is specified, there needs to be storage and array + # partition pragmas to be inserted + if self.get_nodeattr("mask_mode") == "const": + # Note: Probably no need for partitioning this array, as the PE + # dimension is packed into the datatype (which is a bitvector with + # one bit per element, i.e., per PE) + # Implement the attention mask array as a dual-port ROM with the + # RAM-Style selected via attribute + pragmas.extend([ + f"#pragma HLS BIND_STORAGE variable=attention_mask" + f" type=ROM_2P impl={ram_style_mask}" + ]) + # Write the body of the attention top-level function self.code_gen_dict["$DOCOMPUTE$"] = [ # Instantiate the attention operator and connect to the generated From 2b9d94b444f08a693c5b79b48ac60146b0cf8e15 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 17 May 2024 12:23:59 +0200 Subject: [PATCH 76/88] [Attention] Refactor RAM_STYLES dictionary --- .../custom_op/fpgadataflow/hls/attention_hls.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index 621e512cdb..6a7857db40 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -21,6 +21,12 @@ # Convert and pack (numpy) data for C++ code generation from finn.util.data_packing import numpy_to_hls_code +# Mapping of memory resource attributes to the corresponding C++ HLS +# pragma directives +RAM_STYLES = { + "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", "ultra": "URAM" +} + # HLS Backend specialization of the Scale Dot-product Attention Operator class ScaledDotProductAttention_hls( # noqa: Class name does not follow @@ -583,17 +589,12 @@ def strm_decl(self): # Generates C++ code for calling the computation part of the operator def docompute(self): - # Mapping of memory resource attributes to the corresponding C++ HLS - # pragma directives - ram_styles = { - "auto": "AUTO", "block": "BRAM", "distributed": "LUTRAM", - } # Convert the 
thresholds RAM style attribute to HLS directive - ram_style_thresholds = ram_styles[ + ram_style_thresholds = RAM_STYLES[ self.get_nodeattr("ram_style_thresholds") ] # Convert the attention mask RAM style attribute to HLS directive - ram_style_mask = ram_styles[self.get_nodeattr("ram_style_mask")] + ram_style_mask = RAM_STYLES[self.get_nodeattr("ram_style_mask")] # Generates the "BIND_STORAGE" pragma for the threshold activations # threshold memory of "name" From b5bd0fff8cc2b276c90eada4cbcdfcefe98c937f Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 21 May 2024 17:30:10 +0200 Subject: [PATCH 77/88] [Attention] Redirect RTL simulation of attention to Python execution This is a temporary solution to get at least node-by-node RTL simulation of models working by simply skipping the attention operator. --- src/finn/custom_op/fpgadataflow/attention.py | 2 +- src/finn/custom_op/fpgadataflow/hls/attention_hls.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index a934f397f8..4ff16cf200 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -411,7 +411,7 @@ def execute_node(self, context, graph): exec_fns = { "python": self._execute_node_python, "cppsim": self._execute_node_cppsim, - "rtlsim": self._execute_node_rtlsim, + "rtlsim": self._execute_node_python, # TODO: Revert to rtlsim } # Select and execute the function by mode string exec_fns[mode](context, graph) diff --git a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index 6a7857db40..ea10303862 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -800,3 +800,10 @@ def get_verilog_top_module_intf_names(self): intf_names["ap_none"] = [] # Return the interface name dictionary return intf_names + + # Prepare for RTL simulation: There is no RTL simulation of the attention + # operator for now + def prepare_rtlsim(self): + # This attribute must be present anyway, but it is ok if it points + # nowhere as long as execute_node does not try to execute the rtlsim + self.set_nodeattr("rtlsim_so", "none") From aa742c7b107359bd07698cad2deb72381ec6cb53 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Sat, 8 Jun 2024 15:42:14 +0200 Subject: [PATCH 78/88] [Attention] Add missing constant mask mode to input shape query --- src/finn/custom_op/fpgadataflow/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/attention.py b/src/finn/custom_op/fpgadataflow/attention.py index 4ff16cf200..7b1c3e7dd4 100644 --- a/src/finn/custom_op/fpgadataflow/attention.py +++ b/src/finn/custom_op/fpgadataflow/attention.py @@ -479,7 +479,7 @@ def get_normal_input_shape(self, ind=0): ] # If the attention mask is provided as input, it has a shape as well - if self.get_nodeattr("mask_mode") == "input": + if self.get_nodeattr("mask_mode") in {"input", "const"}: # Mask shape is inferred from query and key sequence lengths inputs_shapes += [ (self.get_nodeattr("QLen"), self.get_nodeattr("KVLen")) From 2bf164a3bf76b304312b87426c6a0d9cf25c2fc7 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Fri, 14 Jun 2024 13:05:05 +0200 Subject: [PATCH 79/88] [Attention] Fix Resource::URAM typo --- src/finn/custom_op/fpgadataflow/hls/attention_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py index ea10303862..332313d8d9 100644 --- a/src/finn/custom_op/fpgadataflow/hls/attention_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/attention_hls.py @@ -435,7 +435,7 @@ def hls_type(name): "auto": "Resource::AUTO", "block": "Resource::BRAM", "distributed": "Resource::LUTRAM", - "ultra": "Resources::URAM" + "ultra": "Resource::URAM" } # Mapping of compute resource attributes to the corresponding C++ tag # types From 95f29b094e248c9919802dea3dceb30489e37227 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Mon, 5 Aug 2024 09:59:00 +0200 Subject: [PATCH 80/88] [Attention] Add data layout checks to InferMultiHeads transformation --- .../fpgadataflow/attention_heads.py | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/attention_heads.py b/src/finn/transformation/fpgadataflow/attention_heads.py index 6d97a4bfe6..0b5bc0e7d7 100644 --- a/src/finn/transformation/fpgadataflow/attention_heads.py +++ b/src/finn/transformation/fpgadataflow/attention_heads.py @@ -70,9 +70,35 @@ def apply(self, model: ModelWrapper): # noqa # non-batched inputs rank = len(shape) + # Can only handle 3-dimensional (2-dimensional) layouts for now + if rank not in {2, 3}: + # Issue a warning of near match of the supported head + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported shape near {transpose.name}: {inp}" + ) + # @formatter:on + # Skip transforming this instance + continue + # The input shape determines the sequence length seq, _, dim = shape if (rank == 3) else (shape[0], 1, shape[1]) + # Can only handle 3-dimensional (2-dimensional) layouts for now + if len(model.get_tensor_shape(mid)) != 3: + # Issue a warning of near match of the supported head + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported shape near {transpose.name}: {mid}" + ) + # @formatter:on + # Skip transforming this instance + continue + # The intermediate shape must be the same as specified as the # second input to the reshape operation assert (model.get_tensor_shape(mid) # noqa @@ -193,9 +219,29 @@ def apply(self, model: ModelWrapper): # noqa inp = node.input[0] end = reshape.output[0] + # Get the shape of the input tensor for inferring the number of + # heads and correctly propagating shapes + shape = model.get_tensor_shape(inp) + # Determine the rank of the input tensor to support batched and + # non-batched inputs + rank = len(shape) + + # Can only handle 3-dimensional (2-dimensional) layouts for now + if rank not in {3}: + # Issue a warning of near match of the supported head + # pattern + # @formatter:off + warnings.warn( + f"{self.__class__.__name__}: Skipping near match: " + f"Unsupported shape near {reshape.name}: {inp}" + ) + # @formatter:on + # Skip transforming this instance + continue + # The input shape determines the heads, sequence length and # embedding dimension - heads, seq, dim = model.get_tensor_shape(inp) + heads, seq, dim = shape # Get the (optional) permutation indices of the transpose in # case it is a multi-axis transpose From ca6cc33a3348765e186c91db5e908089924491eb Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 7 Aug 2024 11:23:43 +0200 Subject: [PATCH 81/88] Fix SplitMultiHeads shape inference is shape is None The inferred shape is not taken from the model graph but from the node 
attributes specifying the shape. --- src/finn/custom_op/fpgadataflow/attention_heads.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/attention_heads.py b/src/finn/custom_op/fpgadataflow/attention_heads.py index 13e1419cbb..6d7c532507 100644 --- a/src/finn/custom_op/fpgadataflow/attention_heads.py +++ b/src/finn/custom_op/fpgadataflow/attention_heads.py @@ -99,14 +99,11 @@ def num_inputs(self): def make_shape_compatible_op(self, model: ModelWrapper): # noqa # Get the node wrapped by this custom op node = self.onnx_node - # Get the shape of the input tensor for inferring the number of - # heads and correctly propagating shapes - shape = model.get_tensor_shape(node.input[0]) # Determine the rank of the input tensor to support batched and # non-batched inputs - rank = len(shape) + rank = len(self.num_inputs) + 1 # The input shape determines the sequence length - seq, _, dim = shape if (rank == 3) else (shape[0], 1, shape[1]) + (seq, *_), dim = self.num_inputs, self.num_elems # Packed outputs a represented by a reshape operation producing one # tensor if self.packed: From 9f90cce93f1b644267d3b1b98b7c4b0fbe81de18 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 7 Aug 2024 11:26:05 +0200 Subject: [PATCH 82/88] [Streamline] Allow RemoveIdentityReshape for fork-nodes --- src/finn/transformation/streamline/remove.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/streamline/remove.py b/src/finn/transformation/streamline/remove.py index 005425bacd..a392f9a4ef 100644 --- a/src/finn/transformation/streamline/remove.py +++ b/src/finn/transformation/streamline/remove.py @@ -27,8 +27,8 @@ def apply(self, model: ModelWrapper): # noqa for index, node in enumerate(graph.node): # Applies to Reshape operation types if node.op_type == "Reshape": - # Currently does not handle fork- or join-nodes - if model.is_fork_node(node) or model.is_join_node(node): + # Currently does not handle join-nodes + if model.is_join_node(node): # Softly skip this node continue # Second input to the reshape operation is the target shape @@ -66,7 +66,7 @@ def apply(self, model: ModelWrapper): # noqa for index, node in enumerate(graph.node): # Applies to Transpose operation types if node.op_type == "Transpose": - # Currently does not handle fork- or join-nodes + # Currently does not handle join-nodes if model.is_join_node(node): # Softly skip this node continue From a07815cc213a9ce1df9098b0414394993eb04f35 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 7 Aug 2024 11:27:14 +0200 Subject: [PATCH 83/88] [Attention] Rework Squeeze to explicitly insert Squeeze operations Instead of manually squeezing all shapes, explicit Squeeze and Unsqueeze operations are inserted into the graph before deleting and redoing all shape annotations from scratch. This should be more robust and keeps the interface (data layout) the model exposes to the outside. Wraps Im2Col operations in Unsqueeze-Squeeze operators to shield it from squeezing as Im2Col always operates on 4-dimensional layouts. 
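To make the wrapping scheme concrete before the diff: each Im2Col keeps its 4-dimensional interface, while an Unsqueeze in front restores the size-1 axes and a Squeeze behind it drops them again, with both adapters sharing one axes initializer. A minimal standalone sketch of this idea follows (the tensor names and the fresh-name helper are assumptions for the example; the actual transformation below operates on a QONNX ModelWrapper):

# Minimal sketch of the Unsqueeze/Squeeze wrapping idea, assuming a plain
# ONNX NodeProto and a callable that produces fresh tensor names.
import numpy as np
from onnx import helper as oh


def wrap_in_unsqueeze_squeeze(node, axes, new_name):
    # Axes are passed as a shared initializer tensor, following the ONNX
    # opset >= 13 convention of axes being an input rather than an attribute
    axes_name = new_name("axes")
    unsqueezed, squeezed = new_name("unsqueezed"), new_name("squeezed")
    # Unsqueeze restores the 4-dimensional layout expected by the wrapped node
    unsqueeze = oh.make_node(
        "Unsqueeze", inputs=[node.input[0], axes_name], outputs=[unsqueezed]
    )
    # Squeeze removes the size-1 axes again behind the wrapped node
    squeeze = oh.make_node(
        "Squeeze", inputs=[squeezed, axes_name], outputs=[node.output[0]]
    )
    # Rewire the wrapped node to sit between the two adapter nodes
    node.input[0], node.output[0] = unsqueezed, squeezed
    # Return the adapters and the axes initializer to be added to the graph
    return unsqueeze, squeeze, (axes_name, np.asarray(axes, dtype=np.int64))

Sharing a single axes tensor keeps the two adapters trivially inverse to each other, so the pair can later be removed again without leaving the layout inconsistent.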
--- src/finn/transformation/squeeze.py | 206 +++++++++++++++++++++++++---- 1 file changed, 180 insertions(+), 26 deletions(-) diff --git a/src/finn/transformation/squeeze.py b/src/finn/transformation/squeeze.py index a34d48c06c..c0fea6a89f 100644 --- a/src/finn/transformation/squeeze.py +++ b/src/finn/transformation/squeeze.py @@ -2,6 +2,9 @@ # For array handling import numpy as np +# Python warning subsystem +import warnings + # Helper for creating ONNX nodes from onnx import helper as oh @@ -11,6 +14,12 @@ # QONNX graph transformation base class from qonnx.transformation.base import Transformation +# Transformations running qonnx datatype inference +from qonnx.transformation.infer_datatypes import InferDataTypes + +# Transformation running onnx shape inference +from qonnx.transformation.infer_shapes import InferShapes + # Gets items from protobuf by name from qonnx.util.basic import get_by_name, remove_by_name @@ -33,10 +42,11 @@ def apply(self, model: ModelWrapper): # noqa for index, node in enumerate(graph.node): # There should not be any squeeze or unsqueeze operations in the # graph as these would interfere with this transformation - assert node.op_type not in { - "Squeeze", - "Unsqueeze", - }, f"Squeezing graph containing {node.op_type}" + if node.op_type in {"Squeeze", "Unsqueeze"}: + # Issue a warning to make the user aware of this potential issue + warnings.warn( + f"Squeezing graph containing {node.op_type}: {node.name}" + ) # Validate slice not slicing along squeezed dimension if node.op_type == "Slice": @@ -158,28 +168,172 @@ def apply(self, model: ModelWrapper): # noqa # Is never reset back to False during iteration graph_modified = True + # Need to patch the Im2Col operator when squeezing as this cannot + # operate on other data layouts than 4-dimensional layouts + if node.op_type == "Im2Col": + # Do not squeeze the same operation twice + if get_by_name(node.attribute, "squeezed"): + continue + # Add a new marker attribute to not squeeze this node again + node.attribute.append(oh.make_attribute("squeezed", True)) + # Get the shape of the input tensor to seek for input + # dimensions of size 1 + shape = model.get_tensor_shape( + # fmt: off + node.input[0], fix_missing_init_shape=True + # fmt: on + ) + # Skip if there is no shape + if shape is None: + continue + # Get the axes to be squeezed, i.e., dimensions of size 1 + axes = [dim for dim, size in enumerate(shape) if size == 1] + # To be compatible with ONNX opset >= 13, the axes to + # unsqueeze/squeeze need to be provided as an input + axes_input = model.make_new_valueinfo_name() + # Set the axes as an initializer list + model.set_initializer(axes_input, np.asarray(axes)) + # Instantiate an unsqueeze operation adapting from the squeezed + # layout back to the 4-dimensional layout + unsqueeze = oh.make_node( + # Unsqueeze ONNX operators + "Unsqueeze", + # Inherit the inputs from the Im2Col operation + inputs=[node.input[0], axes_input], + # Create a new output tensor + outputs=[model.make_new_valueinfo_name()], + # Specify the axes to unsqueeze + axes=axes + ) + # Instantiate a squeeze operator adapting from unsqueezed + # 4-dimensional layout back to the squeezed layout + squeeze = oh.make_node( + # Squeeze ONNX operators + "Squeeze", + # Create a new input tensor + inputs=[model.make_new_valueinfo_name(), axes_input], + # Inherit the output tensor from the Im2Col operation + outputs=node.output, + # Specify the axes to squeeze + axes=axes + ) + # Rewire the input/output to/from the Im2Col operator to connect + # the 
Unsqueeze/Squeeze wrapper + node.input[0] = unsqueeze.output[0] + node.output[0] = squeeze.input[0] + # Insert the new nodes + graph.node.insert(index, unsqueeze) + graph.node.insert(index, squeeze) + # The graph has now been modified. This is never reset back to + # False during iteration + graph_modified = True + + # Get the names of all global input tensors to insert a Squeeze + # operation in front + global_inputs = [inp.name for inp in model.graph.input] + # Insert Squeeze operators at each global input + for inp in global_inputs: + # Get the shape of the tensor to seek for dimensions of size 1 + shape = model.get_tensor_shape( # noqa: Duplicate + inp, fix_missing_init_shape=True + ) + # Skip if there is no shape and skip squeezing 0d or 1d tensors + if shape is None or len(shape) <= 1: + continue + # Get the axes to be squeezed, i.e., dimensions of size 1 + axes = [dim for dim, size in enumerate(shape) if size == 1] + # Te be compatible with ONNX opset >= 13, the axes to + # unsqueeze/squeeze need to be provided as an input + axes_input = model.make_new_valueinfo_name() + # Set the axes as an initializer list + model.set_initializer(axes_input, np.asarray(axes)) + # Instantiate the squeeze operator + squeeze = oh.make_node( + # Squeeze ONNX operators + "Squeeze", + # Inherit the input from the global input and add axes to be + # squeezed to the input list + inputs=[inp, axes_input], + # Create a new output connecting to the graph + outputs=[model.make_new_valueinfo_name()], + # Specify the axes to squeeze + axes=axes + ) + # Connect the new squeeze operator to all consumers of this + # global input + for consumer in model.find_consumers(inp): + # Find the inputs of the consumer which are the global input + for i, c_inp in enumerate(consumer.input): + # Note: This might happen multiple times? + if c_inp == inp: + # Rewire consumer's input directly to the output of + # the squeeze operation + consumer.input[i] = squeeze.output[0] + # Insert the squeeze operator into the model graph + model.graph.node.insert(0, squeeze) + + # Get the names of all global output tensors to insert an Unsqueeze + # operation afterwards + global_outputs = [out.name for out in model.graph.output] + # Insert Unsqueeze operators at each global output + for out in global_outputs: + # Get the shape of the tensor to seek for dimensions of size 1 + shape = model.get_tensor_shape( # noqa: Duplicate + out, fix_missing_init_shape=True + ) + # Skip if there is no shape and skip squeezing 0d or 1d tensors + if shape is None or len(shape) <= 1: + continue + # Get the axes to be squeezed, i.e., dimensions of size 1 + axes = [dim for dim, size in enumerate(shape) if size == 1] + # Te be compatible with ONNX opset >= 13, the axes to + # unsqueeze/squeeze need to be provided as an input + axes_input = model.make_new_valueinfo_name() + # Set the axes as an initializer list + model.set_initializer(axes_input, np.asarray(axes)) + # Instantiate the unsqueeze operator + unsqueeze = oh.make_node( + # Unsqueeze ONNX operators + "Unsqueeze", + # Connect to a new intermediate tensor + inputs=[model.make_new_valueinfo_name(), axes_input], + # Connect tho the global output + outputs=[out], + # Specify the axes to unsqueeze + axes=axes + ) + # Connect the new unsqueeze operator to the producer of this global + # output + producer = model.find_producer(out) + # Find the output of the producer which is the global output + for i, p_out in enumerate(producer.output): + # Note: This might happen multiple times? 
+ if p_out == out: + # Rewire producer's output directly to the input of + # the unsqueeze operation + producer.output[i] = unsqueeze.input[0] + # Insert the unsqueeze operator into the model graph + model.graph.node.insert(0, unsqueeze) + # Iterate all tensors in the graph keeping track of the index for index, name in enumerate(model.get_all_tensor_names()): - # Query the shape of the tensor adding annotations for initializers - # if missing - shape = model.get_tensor_shape(name, fix_missing_init_shape=True) - # Skip squeezing 0d or 1d tensors - if len(shape) <= 1: + # Skip the global inputs and outputs + if name in [*global_inputs, *global_outputs]: + # Skip without warning, these are handled by explicit + # Squeeze/Unsqueeze operations continue - # Squeeze the shape by removing all dimensions with size 1 - new_shape = [size for size in shape if size != 1] - # Try to get the initializer of the tensor - initializer = model.get_initializer(name) - # If an initializer is present replace by the squeezed tensor - if initializer is not None: - # Reassign the squeezed tensor - model.set_initializer(name, initializer.squeeze()) - # Set new shape annotation - model.set_tensor_shape(name, new_shape) - # Track whether the shape actually changed - if len(new_shape) != len(shape): - # Is never reset back to False during iteration - graph_modified = True - # Return the transformed model and indicate whether the graph actually - # has been transformed - return model, graph_modified + # Skip initializer tensors: Shape inference should actually restore + # these shapes, but for some reason it does not work... + if model.get_initializer(name) is not None: + continue + # Just delete all existing shape annotations to redo them later + model.set_tensor_shape(name, None) + # Re-do shape and data type annotations after potential changes to the + # model graph + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether this transformation + # needs to be repeated + # Note: Never repeat this transformation as it might break when + # inserting multiple Squeeze operators + return model, False From 5548b49acfb5912d94cbab3a108658c0313ffce3 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Wed, 28 Aug 2024 14:47:37 +0200 Subject: [PATCH 84/88] [Streamline] Prevent MoveTransposePastEltwise from transposing scalars --- src/finn/transformation/streamline/reorder.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 276cc40c1c..bfba46dfd9 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -1301,8 +1301,14 @@ def apply(self, model: ModelWrapper): # noqa # This transformation does only apply to Add nodes where the # second input is a constant initializer if (value := model.get_initializer(a)) is not None: - # Transpose the initializer and re-insert into the model - model.set_initializer(a, value.transpose(perm)) + # Do not transpose scalar or effectively scalar + # initializers + if not (value.shape is None or all( + x == 1 for x in value.shape) + ): + # Transpose the initializer and re-insert into the + # model + model.set_initializer(a, value.transpose(perm)) # Rewire the graph to feed original input and the # transposed initializer into the Add node first successor.input[:] = [inp, a] From 15963e055dbe3aabf7701920b771510469fe8cbd Mon Sep 17 00:00:00 
2001 From: Christoph Berganski Date: Mon, 20 Jan 2025 17:49:14 +0100 Subject: [PATCH 85/88] [Deps] Add attention-hlslib dependency to fetch-repos.sh --- fetch-repos.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fetch-repos.sh b/fetch-repos.sh index a4fc124fa4..29c64f15e4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -39,6 +39,7 @@ XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696" KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79" EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a" +ATTENTION_HLSLIB_COMMIT="24cf498e79390cac79aa886f55027589a6422766" QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" @@ -51,6 +52,7 @@ AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" RFSOC4x2_BDF_URL="https://github.com/RealDigitalOrg/RFSoC4x2-BSP.git" KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" +ATTENTION_HLSLIB_URL="https://github.com/iksnagreb/attention-hlslib.git" QONNX_DIR="qonnx" FINN_EXP_DIR="finn-experimental" @@ -63,6 +65,7 @@ AVNET_BDF_DIR="avnet-bdf" XIL_BDF_DIR="xil-bdf" RFSOC4x2_BDF_DIR="rfsoc4x2-bdf" KV260_SOM_BDF_DIR="kv260-som-bdf" +ATTENTION_HLSLIB_DIR="attention-hlslib" # absolute path to this script, e.g. /home/user/bin/foo.sh SCRIPT=$(readlink -f "$0") @@ -126,6 +129,7 @@ fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR fetch_repo $RFSOC4x2_BDF_URL $RFSOC4x2_BDF_COMMIT $RFSOC4x2_BDF_DIR fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR +fetch_repo $ATTENTION_HLSLIB_URL $ATTENTION_HLSLIB_COMMIT $ATTENTION_HLSLIB_DIR # Can skip downloading of board files entirely if desired if [ "$FINN_SKIP_BOARD_FILES" = "1" ]; then From 6d56c61eea89cdcbda5b68c88e1811d16d44ddac Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 21 Jan 2025 14:33:51 +0100 Subject: [PATCH 86/88] Make Squeeze interact properly with Im2Col, Split and initializers --- src/finn/transformation/squeeze.py | 101 ++++++++++++++++++++++++----- src/finn/transformation/util.py | 5 ++ 2 files changed, 88 insertions(+), 18 deletions(-) diff --git a/src/finn/transformation/squeeze.py b/src/finn/transformation/squeeze.py index c0fea6a89f..d9c657b054 100644 --- a/src/finn/transformation/squeeze.py +++ b/src/finn/transformation/squeeze.py @@ -20,9 +20,15 @@ # Transformation running onnx shape inference from qonnx.transformation.infer_shapes import InferShapes +# Reuse node removal and rewiring from qonnx +from qonnx.transformation.remove import remove_node_and_rewire + # Gets items from protobuf by name from qonnx.util.basic import get_by_name, remove_by_name +# Small utility functions for graph transformations +from .util import is_threshold + # Squeezes, i.e., removes, dimensions of size 1 # Note: Use this transformation with great care, it currently serves only the @@ -36,17 +42,19 @@ class Squeeze(Transformation): def apply(self, model: ModelWrapper): # noqa # Get the model graph out of the model wrapper object graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False + # # Keep track of whether the graph has been modified + # graph_modified = False # Iterate all nodes in the graph keeping track of the index for index, node in enumerate(graph.node): # There should not be any squeeze or unsqueeze operations in the # graph as these would interfere with this 
transformation if node.op_type in {"Squeeze", "Unsqueeze"}: # Issue a warning to make the user aware of this potential issue + # fmt: off warnings.warn( f"Squeezing graph containing {node.op_type}: {node.name}" ) + # fmt: on # Validate slice not slicing along squeezed dimension if node.op_type == "Slice": @@ -83,7 +91,8 @@ def apply(self, model: ModelWrapper): # noqa # Track whether the shape actually changed if len(new_shape) != len(shape): # Is never reset back to False during iteration - graph_modified = True + # graph_modified = True + pass # Need to drop dimensions of size 1 from transpose permutation list if node.op_type == "Transpose": @@ -120,8 +129,9 @@ def apply(self, model: ModelWrapper): # noqa ] # Track whether the permutations actually changed if len(new_perm) != len(perm) or new_perm != perm: - # Is never reset back to False during iteration - graph_modified = True + # # Is never reset back to False during iteration + # graph_modified = True + pass # Remove the permutation attribute before setting the new # permutation remove_by_name(node.attribute, "perm") @@ -143,8 +153,25 @@ def apply(self, model: ModelWrapper): # noqa ) # Track whether the number of inputs actually changed if len(new_num_inputs) != len(num_inputs.ints): - # Is never reset back to False during iteration - graph_modified = True + # # Is never reset back to False during iteration + # graph_modified = True + pass + + # Need to adjust the index of the split axis by the amount of + # squeezed axes before + if node.op_type == "Split": + # Get the axis attribute from the Split operator + axis = get_by_name(node.attribute, "axis") + # Convert to integer or substitute default 0 according to ONNX + # reference + axis = axis.i if axis is not None else 0 + # Get the shape of the input tensor to the split operation + shape = model.get_tensor_shape(node.input[0]) + # Subtract the number of squeezed, i.e, size=1, axes before axis + axis = axis - sum(size == 1 for size in shape[:axis]) + # Update the attribute by removing and reinserting + remove_by_name(node.attribute, "axis") + node.attribute.append(oh.make_attribute("axis", axis)) # Need to set the squeezed output mode of multi-head merging if node.op_type == "MergeMultiHeads": @@ -165,8 +192,9 @@ def apply(self, model: ModelWrapper): # noqa ) # Track whether the number of inputs actually changed if len(new_num_inputs) != len(num_inputs.ints): - # Is never reset back to False during iteration - graph_modified = True + # # Is never reset back to False during iteration + # graph_modified = True + pass # Need to patch the Im2Col operator when squeezing as this cannot # operate on other data layouts than 4-dimensional layouts @@ -203,7 +231,7 @@ def apply(self, model: ModelWrapper): # noqa # Create a new output tensor outputs=[model.make_new_valueinfo_name()], # Specify the axes to unsqueeze - axes=axes + axes=axes, ) # Instantiate a squeeze operator adapting from unsqueezed # 4-dimensional layout back to the squeezed layout @@ -215,7 +243,7 @@ def apply(self, model: ModelWrapper): # noqa # Inherit the output tensor from the Im2Col operation outputs=node.output, # Specify the axes to squeeze - axes=axes + axes=axes, ) # Rewire the input/output to/from the Im2Col operator to connect # the Unsqueeze/Squeeze wrapper @@ -224,9 +252,28 @@ def apply(self, model: ModelWrapper): # noqa # Insert the new nodes graph.node.insert(index, unsqueeze) graph.node.insert(index, squeeze) - # The graph has now been modified. 
This is never reset back to - # False during iteration - graph_modified = True + # # The graph has now been modified. This is never reset back to + # # False during iteration + # graph_modified = True + + # Iterate the graph once again to get rid of existing Squeeze/Unsqueeze + # Note: This needs to be done after all other operations to not mess + # with the shape annotations + for index, node in enumerate(graph.node): + # Squeeze and Unsqueeze can be handled the same + if node.op_type in {"Squeeze", "Unsqueeze"}: + # Do not touch the Unsqueeze/Squeeze surrounding the Im2Col + # operation + if "Im2Col" not in [ + n.op_type + for n in [ + *model.find_direct_predecessors(node), + *model.find_direct_successors(node), + ] + ]: + # Remove existing Squeeze/Unsqueeze from the graph as these + # will not have any effect anymore + remove_node_and_rewire(model, node) # Get the names of all global input tensors to insert a Squeeze # operation in front @@ -257,7 +304,7 @@ def apply(self, model: ModelWrapper): # noqa # Create a new output connecting to the graph outputs=[model.make_new_valueinfo_name()], # Specify the axes to squeeze - axes=axes + axes=axes, ) # Connect the new squeeze operator to all consumers of this # global input @@ -273,7 +320,7 @@ def apply(self, model: ModelWrapper): # noqa model.graph.node.insert(0, squeeze) # Get the names of all global output tensors to insert an Unsqueeze - # operation afterwards + # operation afterward global_outputs = [out.name for out in model.graph.output] # Insert Unsqueeze operators at each global output for out in global_outputs: @@ -300,7 +347,7 @@ def apply(self, model: ModelWrapper): # noqa # Connect tho the global output outputs=[out], # Specify the axes to unsqueeze - axes=axes + axes=axes, ) # Connect the new unsqueeze operator to the producer of this global # output @@ -324,7 +371,25 @@ def apply(self, model: ModelWrapper): # noqa continue # Skip initializer tensors: Shape inference should actually restore # these shapes, but for some reason it does not work... - if model.get_initializer(name) is not None: + if (init := model.get_initializer(name)) is not None: + # If any of the consumers of this initializer is a + # multi-threshold function, it should not be squeezed as the + # thresholding is quite sensitive to data layouts and does not + # handle broadcasting. + # Note: Not sue whether there can actually be cases wih multiple + # consumers of a threshold tensor, but this should be perfectly + # legal according to standard ONNX. + if any(is_threshold(op) for op in model.find_consumers(name)): + # Skip without warning + continue + # First squeeze the actual data of the initializer tensors + model.set_initializer(name, np.squeeze(init)) + # Now also annotate the squeezed shape, otherwise the following + # shape inference might fail or break the graph + # Note: Deleting the annotation is not sufficient here, it is + # not recovered properly from the tensor data for some reason... 
+ model.set_tensor_shape(name, np.squeeze(init).shape) + # Continue with the next tensor, skipping the default case below continue # Just delete all existing shape annotations to redo them later model.set_tensor_shape(name, None) diff --git a/src/finn/transformation/util.py b/src/finn/transformation/util.py index b749c83288..b35e41c635 100644 --- a/src/finn/transformation/util.py +++ b/src/finn/transformation/util.py @@ -8,6 +8,11 @@ from qonnx.core.modelwrapper import ModelWrapper +# Tests whether a node is a multi-threshold operation +def is_threshold(node: NodeProto): + return node.op_type == "MultiThreshold" + + # Tests whether a node is a join-node MatMul operation, i.e., a MatMul with two # runtime inputs but no weights initializers def is_join_matmul(node: NodeProto, model: ModelWrapper): # noqa From 50544ef1378f325d9ac09d49d34244cc604325df Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 21 Jan 2025 14:53:29 +0100 Subject: [PATCH 87/88] [Streamline] Fix MoveTransposePastEltwise permutation --- src/finn/transformation/streamline/reorder.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 3606590c73..eddbd91bb0 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -1310,7 +1310,14 @@ def apply(self, model: ModelWrapper): # noqa # case it is a multi-axis transpose perm = get_by_name(node.attribute, "perm") # Convert permutation indices to list of integers - perm = perm.ints if perm is not None else None + perm = list(perm.ints) if perm is not None else None + + # Inverse permutation needs to be applied to the initializer + # fmt: off + inverse_perm = None if not perm else [ + perm.index(i) for i in range(len(perm)) + ] + # fmt: on # This transformation does only apply to Add nodes where the # second input is a constant initializer @@ -1318,11 +1325,14 @@ def apply(self, model: ModelWrapper): # noqa # Do not transpose scalar or effectively scalar # initializers if not (value.shape is None or all( - x == 1 for x in value.shape) - ): + x == 1 for x in value.shape)): # Transpose the initializer and re-insert into the # model - model.set_initializer(a, value.transpose(perm)) + # fmt: off + model.set_initializer( + a, value.transpose(inverse_perm) + ) + # fmt: on # Rewire the graph to feed original input and the # transposed initializer into the Add node first successor.input[:] = [inp, a] From 6cee1ec26770fd803f8f265612ac6858e85eccf6 Mon Sep 17 00:00:00 2001 From: Christoph Berganski Date: Tue, 28 Jan 2025 15:01:38 +0100 Subject: [PATCH 88/88] [Deps] Update attention-hlslib dependency --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 29c64f15e4..53a21ebda9 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -39,7 +39,7 @@ XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696" KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79" EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a" -ATTENTION_HLSLIB_COMMIT="24cf498e79390cac79aa886f55027589a6422766" +ATTENTION_HLSLIB_COMMIT="afc9720f10e551e1f734e137b21bb6d0a8342177" QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git"