Skip to content
This repository was archived by the owner on Dec 18, 2024. It is now read-only.

Latest commit

 

History

History
496 lines (484 loc) · 23.3 KB

limitations.md

File metadata and controls

496 lines (484 loc) · 23.3 KB

README > Limitations

Limitations

Some XeTLA APIs have limitations due to hardware restrictions or software design. XeTLA adds checks for these restrictions, so end users get error messages when they hit a limitation. The checks are implemented at the kernel, group, and subgroup levels.

Limitations And Checkers

Level

Feature

Category

Restriction

API

kernel gemm general 1d refer to table 1-1 template
class general_1d::check_alignment(T *base, uint32_t pitch)
block 2d refer to table 1-2 template
class block_2d::check_tensor( uint64_t base, uint32_t width, uint32_t height, uint32_t pitch)
group FPU data type dtype_mma_a, dtype_mma_b, dtype_mma_acc must be float type template typename dtype_mma_b, typename dtype_mma_acc>
struct check_dtype_default
memory Loading matrixA and matrixB from local memory is not supported template mem_space mem_space_a, mem_space mem_space_b>
struct check_memory_default
tile size (block_size_x_b % (64 / sizeof(dtype_mma))) == 0 template int tile_size_y_a, int block_size_x_a, int block_size_y_a,
int tile_size_x_b, int tile_size_y_b, int block_size_x_b,
int block_size_y_b>
struct check_tile_size_default
(tile_size_x_a % block_size_x_a) == 0
(tile_size_y_b % block_size_y_b) == 0
block_size_x_a == block_size_y_b
XMX data type dtype_mma_a should be the same as dtype_mma_b in xe arch template typename dtype_mma_b>
struct check_dtype_default
fp32<->fp8 is not supported, since it runs into many HW limitations
memory if matA is loaded from local memory, then matA should be row-major template mem_space mem_space_a, mem_space mem_space_b>
struct check_memory_default
if matB is loaded from local memory, then matB should be row-major
tile size tile_size_x_a should be a multiple of mma_k template int tile_size_y_a, int block_size_x_a, int block_size_y_a,
int tile_size_x_b, int tile_size_y_b, int block_size_x_b,
int block_size_y_b>
struct check_tile_size_default
block_size_x_a should be equal to mma_k
tile_size_y_a should be a multiple of mma_m
block_size_y_a should be a multiple of mma_m
tile_size_x_b should be a multiple of mma_n
block_size_x_b should be equal to mma_n
tile_size_y_b should be a multiple of mma_k
block_size_y_b should be a multiple of mma_k
subgroup load global 2d For VNNI transform, the maximum block width is 16 template
struct check_load {
template
struct global_2d
max_block_width should be a multiple of block size x
global 1d sizeof(mem_dtype) == 4 || sizeof(mem_dtype) == 8 template
struct check_load {
struct global_1d
local scatter only row-major is supported in local load; you can use local store to do the transpose template
struct check_load {
template
struct local_scatter
load size should be at least DW aligned
bytes per row should be a multiple of sizeof load_dtype
(tile_bytes % min_bytes) == 0 && (block_bytes % min_bytes) == 0
The number of simd channel x should be greater than 0 and less than num_channel
local 1d tile 1d only support D32/D64 template
struct check_load {
struct local_1d
store global 2d max_block_width should be a multiple of block size x template
struct check_store {
template
struct global_2d
global 1d tile 1d only support D32/D64 template
struct check_store {
struct global_1d
global atomic for global atomic add, only fp32, fp64, uint32_t, uint64_t, and int are supported template
struct check_store {
template
struct global_atomic
(tile_bytes % min_store_bytes) == 0 && (block_bytes % min_store_bytes) == 0
The number of simd channel x should be greater than 0 and less than num_channel
Only support DW and QW atomic add
local scatter (tile_bytes % min_bytes) == 0 && (block_bytes % min_bytes) == 0 template
struct check_store {
template
struct local_scatter
The number of simd channel x should be greater than 0 and less than num_channel
local scatter vnni col (tile_bytes % min_store_bytes) == 0 && (block_bytes % min_store_bytes) == 0 template
struct check_store {
template
struct local_scatter_vnni_col
The number of simd channel x should be greater than 0 and less than num_channel

Table 1-1

Addr Type

Data Size

Address Size

Addr Alignment

Vector Size

Transpose

SIMT Mask

global D8U32, D16U32, D32, D64 A32, A64 byte 1 off 1, 2, 4, 8, 16, 32
global D32, D64 A32, A64 data size 2, 3, 4, 8 off 1, 2, 4, 8, 16, 32
global D32, D64 A32, A64 data size 1, 2, 3, 4, 8, 16, 32, 64 on 1
slm D8U32, D16U32, D32, D64 A16, A32 byte 1 off 1, 2, 4, 8, 16, 32
slm D32, D64 A16, A32 data size 2, 3, 4, 8 off 1, 2, 4, 8, 16, 32
slm D32, D64 A16, A32 data size 1, 2, 3, 4, 8, 16, 32, 64 on 1

Table 1-2

Category

Data Size

Restrictions

base address U64 base address must be dword aligned.
surface width U32 1. only 24 bits are supported for surface width field, bits [31:24] are ignored by the hardware.
2. surface width (encoded_value + 1) must be equal to or greater than 64B.
surface height U32 only 24 bits are supported for surface height field, bits [31:24] are ignored by the hardware.
surface pitch U32 1. pitch must be greater than or equal to width.
2. only 24 bits are supported for surface pitch field, bits [31:24] are ignored by the hardware.
3. surface pitch (encoded_value + 1) must be equal to or greater than 64B.
4. surface pitch (encoded_value + 1) must be a multiple of OW (16 bytes).
block start x S31 for data-size d8, block start x must be a multiple of 4. for data-size of d16, block start x must be a multiple of 2.
block start y S31 N/A
block width U8 1. this field value must be between 0-63.
2. block width (encoded_value + 1) multiplied by the element size (bytes) must be a multiple of 4 bytes. This means that for an element size of 1 byte, the block width (encoded_value + 1) should be a multiple of 4, and for an element size of 2 bytes, the block width should be a multiple of 2.
block height U8 this field value must be between 0-31.
array length U4 1. the range of this field must be in 0-3.
2. this field must be zero for 2d block store messages.

Copyright

Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.