README > Limitations
Some XeTLA APIs have limitations due to hardware restrictions or software design. XeTLA checks for these restrictions, and end users get an error message when they hit one of them. The checks are implemented at the kernel, group, and subgroup levels.
Level |
Feature |
Category |
Restriction |
API |
kernel | gemm | general 1d | refer to table 1-1 | template
class general_1d::check_alignment(T *base, uint32_t pitch) |
block 2d | refer to table 1-2 | template
class block_2d::check_tensor( uint64_t base, uint32_t width, uint32_t height, uint32_t pitch) |
||
group | FPU | data type | dtype_mma_a, dtype_mma_b, dtype_mma_acc must be float type | template
typename dtype_mma_b,
typename dtype_mma_acc> struct check_dtype_default |
memory | Loading matrixA and matrixB from local memory is not supported | template
mem_space
mem_space_a, mem_space mem_space_b> struct check_memory_default |
||
tile size | (block_size_x_b % (64 / sizeof(dtype_mma))) == 0 | template
int tile_size_y_a,
int block_size_x_a, int block_size_y_a, int tile_size_x_b, int tile_size_y_b, int block_size_x_b, int block_size_y_b> struct check_tile_size_default |
||
(tile_size_x_a % block_size_x_a) == 0 | ||||
(tile_size_y_b % block_size_y_b) == 0 | ||||
block_size_x_a == block_size_y_b | ||||
XMX | data type | dtype_mma_a should be the same as dtype_mma_b in xe arch | template
typename
dtype_mma_b> struct check_dtype_default |
|
fp32<->fp8 is not supported, since it would run into many HW limitations | ||||
memory | if matA is loaded from local memory, then matA should be row-major | template
mem_space
mem_space_a, mem_space mem_space_b> struct check_memory_default |
||
if matB is loaded from local memory, then matB should be row-major | ||||
tile size | tile_size_x_a should be a multiple of mma_k | template
int tile_size_y_a,
int block_size_x_a, int block_size_y_a, int tile_size_x_b, int tile_size_y_b, int block_size_x_b, int block_size_y_b> struct check_tile_size_default |
||
block_size_x_a should be equal to mma_k | ||||
tile_size_y_a should be a multiple of mma_m | ||||
block_size_y_a should be a multiple of mma_m | ||||
tile_size_x_b should be a multiple of mma_n | ||||
block_size_x_b should be equal to mma_n | ||||
tile_size_y_b should be a multiple of mma_k | ||||
block_size_y_b should be a multiple of mma_k | ||||
subgroup | load | global 2d | For VNNI transform, the maximum block width is 16 | template
struct check_load { template struct global_2d |
max_block_width should be a multiple of block size x | ||||
global 1d | sizeof(mem_dtype) == 4 || sizeof(mem_dtype) == 8 | template
struct check_load { struct global_1d |
||
local scatter | only row major is supported in local load; you can use local store to do the transpose | template
struct check_load { template struct local_scatter |
||
load size should be at least DW aligned | ||||
bytes per row should be a multiple of sizeof(load_dtype) | ||||
(tile_bytes % min_bytes) == 0 && (block_bytes % min_bytes) == 0 | ||||
The number of simd channel x should be greater than 0 and less than num_channel | ||||
local 1d | tile 1d only supports D32/D64 | template
struct check_load { struct local_1d |
||
store | global 2d | max_block_width should be a multiple of block size x | template
struct check_store { template struct global_2d |
|
global 1d | tile 1d only supports D32/D64 | template
struct check_store { struct global_1d |
||
global atomic | for global atomic add, only fp32, fp64, uint32_t, uint64_t, and int are supported | template
struct check_store { template struct global_atomic |
||
(tile_bytes % min_store_bytes) == 0 && (block_bytes % min_store_bytes) == 0 | ||||
The number of simd channel x should be greater than 0 and less than num_channel | ||||
Only DW and QW atomic add are supported | ||||
local scatter | (tile_bytes % min_bytes) == 0 && (block_bytes % min_bytes) == 0 | template
struct check_store { template struct local_scatter |
||
The number of simd channel x should be greater than 0 and less than num_channel | ||||
local scatter vnni col | (tile_bytes % min_store_bytes) == 0 && (block_bytes % min_store_bytes) == 0 | template
struct check_store { template struct local_scatter_vnni_col |
||
The number of simd channel x should be greater than 0 and less than num_channel |
Addr Type |
Data Size |
Address Size |
Addr Alignment |
Vector Size |
Transpose |
SIMT Mask |
global | D8U32, D16U32, D32, D64 | A32, A64 | byte | 1 | off | 1, 2, 4, 8, 16, 32 |
global | D32, D64 | A32, A64 | data size | 2, 3, 4, 8 | off | 1, 2, 4, 8, 16, 32 |
global | D32, D64 | A32, A64 | data size | 1, 2, 3, 4, 8, 16, 32, 64 | on | 1 |
slm | D8U32, D16U32, D32, D64 | A16, A32 | byte | 1 | off | 1, 2, 4, 8, 16, 32 |
slm | D32, D64 | A16, A32 | data size | 2, 3, 4, 8 | off | 1, 2, 4, 8, 16, 32 |
slm | D32, D64 | A16, A32 | data size | 1, 2, 3, 4, 8, 16, 32, 64 | on | 1 |
Category |
Data Size |
Restrictions |
base address | U64 | base address must be dword aligned. |
surface width | U32 | 1. only 24 bits are supported for surface width field, bits [31:24] are ignored
by the hardware. 2. surface width (encoded_value + 1) must be equal to or greater than 64B. |
surface height | U32 | only 24 bits are supported for surface height field, bits [31:24] are ignored by the hardware. |
surface pitch | U32 | 1. pitch must be greater than or equal to width. 2. only 24 bits are supported for surface pitch field, bits [31:24] are ignored by the hardware. 3. surface pitch (encoded_value + 1) must be equal to or greater than 64B. 4. surface pitch (encoded_value + 1) must be a multiple of OW (16 bytes). |
block start x | S31 | for data-size d8, block start x must be a multiple of 4. for data-size of d16, block start x must be a multiple of 2. |
block start y | S31 | N/A |
block width | U8 | 1. this field value must be between 0-63. 2. block width (encoded_value + 1) multiplied by the element size (bytes) must be a multiple of 4 bytes, which means that for an element size of 1 byte, the block width (encoded_value + 1) should be a multiple of 4, and for an element size of 2 bytes, the block width should be a multiple of 2. |
block height | U8 | this field value must be between 0-31. |
array length | U4 | 1. the range of this field must be in 0-3. 2. this field must be zero for 2d block store messages. |
Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.