
Undo enable-halo-split-reader flag changes.
Signed-off-by: Nilaykumar Patel <nkpatel@tenstorrent.com>
nkpatel-tt committed on Mar 6, 2025 · 1 parent bf9dde9 · commit c24f447
Showing 16 changed files with 56 additions and 198 deletions.
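In user-facing terms, this revert removes the enable_halo_split_reader keyword and attribute from ttnn.Conv2dConfig, and the halo kernels go back to keying their split-reader behavior off is_reader alone. A minimal sketch of the Python-visible effect (a hypothetical session against a build of this commit; exact error text may differ):

import ttnn

# Still accepted after this commit: the surviving reader/padding flags.
config = ttnn.Conv2dConfig(
    dtype=ttnn.bfloat16,
    enable_split_reader=True,
    enable_subblock_padding=False,
)

# Gone after this commit: the kw-only argument no longer exists, so
# pybind11 rejects it when the constructor is called.
try:
    ttnn.Conv2dConfig(enable_halo_split_reader=True)
except TypeError as exc:
    print(exc)  # constructor argument mismatch reported by pybind11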
81 changes: 0 additions & 81 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -72,7 +72,6 @@ def run_conv(
     weight_mesh_mapper=None,
     output_mesh_composer=None,
     enable_split_reader=False,
-    enable_halo_split_reader=False,
     activation="",
 ):
     if isinstance(device, ttnn.MeshDevice):
@@ -139,7 +138,6 @@ def run_conv(
         enable_subblock_padding=False,
         output_layout=output_layout,
         activation=activation,
-        enable_halo_split_reader=enable_halo_split_reader,
     )
     compute_config = ttnn.init_device_compute_kernel_config(
         device.arch(),
@@ -2854,82 +2852,3 @@ def test_block_sharding_relu_act_block_h(
         shard_layout=shard_layout,
         activation=activation,
     )
-
-@pytest.mark.parametrize("batch", [1])
-@pytest.mark.parametrize(
-    "output_channels, input_channels, input_height, input_width",
-    (
-        (4, 32, 288, 288),
-        (32, 48, 284, 284),
-        (48, 56, 280, 280),
-        (56, 64, 272, 272),
-    ),
-)
-@pytest.mark.parametrize(
-    "weights_dtype",
-    [ttnn.bfloat16],
-)
-@pytest.mark.parametrize(
-    "activations_dtype",
-    [ttnn.bfloat16],
-)
-@pytest.mark.parametrize("math_fidelity", [ttnn.MathFidelity.LoFi])
-@pytest.mark.parametrize(
-    "kernel, dilation, padding",
-    [
-        [5, 2, 2],
-        [3, 8, 1],
-    ],
-)
-@pytest.mark.parametrize("stride", [1, 4])
-@pytest.mark.parametrize("enable_halo_split_reader", [True])
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384*2}], indirect=True)
-def test_halo_split_reader(
-    device,
-    torch_tensor_map,
-    batch,
-    output_channels,
-    input_channels,
-    input_height,
-    input_width,
-    weights_dtype,
-    activations_dtype,
-    math_fidelity,
-    kernel,
-    dilation,
-    padding,
-    stride,
-    enable_halo_split_reader
-):
-    config_override = {}
-
-    run_conv(
-        device=device,
-        torch_tensor_map=torch_tensor_map,
-        activations_dtype=activations_dtype,
-        weights_dtype=weights_dtype,
-        batch_size=batch,
-        output_channels=output_channels,
-        input_channels=input_channels,
-        input_height=input_height,
-        input_width=input_width,
-        filter_height=kernel,
-        filter_width=kernel,
-        stride_h=stride,
-        stride_w=stride,
-        pad_h=padding,
-        pad_w=padding,
-        config_override=config_override,
-        dilation=dilation,
-        math_fidelity=math_fidelity,
-        output_layout=ttnn.TILE_LAYOUT,
-        debug=False,
-        groups=1,
-        has_bias=True,
-        shard_layout=None,
-        memory_config=None,
-        input_mesh_mapper=None,
-        weight_mesh_mapper=None,
-        output_mesh_composer=None,
-        enable_halo_split_reader=enable_halo_split_reader,
-    )
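With the dedicated test deleted, the remaining conv2d tests reach the halo path without the flag. A sketch of how one of the deleted cases could be rewritten post-revert (test_conv_no_halo_flag is a hypothetical name; shape values come from the deleted parametrization, and omitted run_conv parameters are assumed to keep the defaults visible in its signature above):

@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384 * 2}], indirect=True)
def test_conv_no_halo_flag(device, torch_tensor_map):
    # Mirrors the first deleted case (kernel=5, dilation=2, padding=2, stride=1),
    # minus the removed enable_halo_split_reader flag.
    run_conv(
        device=device,
        torch_tensor_map=torch_tensor_map,
        math_fidelity=ttnn.MathFidelity.LoFi,
        activations_dtype=ttnn.bfloat16,
        weights_dtype=ttnn.bfloat16,
        batch_size=1,
        output_channels=4,
        input_channels=32,
        input_height=288,
        input_width=288,
        filter_height=5,
        filter_width=5,
        stride_h=1,
        stride_w=1,
        pad_h=2,
        pad_w=2,
        config_override={},
        dilation=2,
        output_layout=ttnn.TILE_LAYOUT,
        has_bias=True,
    )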
3 changes: 1 addition & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -181,8 +181,7 @@ Result conv2d(
         parallel_config.shard_orientation == ShardOrientation::COL_MAJOR,
         0,
         input_tensor_post_tm.memory_config(),
-        true,
-        conv_config.enable_halo_split_reader);
+        true);
 
     if (conv_config.deallocate_activation) {
         input_tensor_post_tm.deallocate(/*force*/ true);
5 changes: 1 addition & 4 deletions ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -335,7 +335,6 @@ void py_bind_conv2d(py::module& module) {
            bool,
            bool,
            bool,
-            bool,
            bool>(),
        py::kw_only(),
        py::arg("dtype") = DataType::BFLOAT16,
@@ -355,8 +354,7 @@ void py_bind_conv2d(py::module& module) {
        py::arg("enable_act_double_buffer") = false,
        py::arg("enable_weights_double_buffer") = false,
        py::arg("enable_split_reader") = false,
-        py::arg("enable_subblock_padding") = false,
-        py::arg("enable_halo_split_reader") = false);
+        py::arg("enable_subblock_padding") = false);
    py_conv_config.def_readwrite("dtype", &Conv2dConfig::dtype);
    py_conv_config.def_readwrite("weights_dtype", &Conv2dConfig::weights_dtype);
    py_conv_config.def_readwrite("activation", &Conv2dConfig::activation);
@@ -375,7 +373,6 @@ void py_bind_conv2d(py::module& module) {
    py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer);
    py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader);
    py_conv_config.def_readwrite("enable_subblock_padding", &Conv2dConfig::enable_subblock_padding);
-    py_conv_config.def_readwrite("enable_halo_split_reader", &Conv2dConfig::enable_halo_split_reader);
 
    py_conv_config.def("__repr__", [](const Conv2dConfig& config) { return fmt::format("{}", config); });
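Because the def_readwrite binding is dropped along with the constructor argument, the attribute disappears from existing config objects too, not just from the constructor. A short sketch of the surviving attribute surface (hypothetical session):

import ttnn

conv_config = ttnn.Conv2dConfig()
conv_config.enable_split_reader = True       # still bound via def_readwrite
conv_config.enable_subblock_padding = False  # still bound via def_readwrite
print(conv_config)  # __repr__ is provided by the fmt::format binding above

# conv_config.enable_halo_split_reader = True  # AttributeError after this commit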
8 changes: 2 additions & 6 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -73,8 +73,6 @@ struct Conv2dConfig {
     bool enable_split_reader = false;
 
     bool enable_subblock_padding = false;
-
-    bool enable_halo_split_reader = false;
     static constexpr auto attribute_names = std::make_tuple(
         "dtype",
         "weights_dtype",
@@ -93,8 +91,7 @@ struct Conv2dConfig {
         "enable_act_double_buffer",
         "enable_weights_double_buffer",
         "enable_split_reader",
-        "enable_subblock_padding",
-        "enable_halo_split_reader");
+        "enable_subblock_padding");
     const auto attribute_values() const {
         return std::make_tuple(
             std::cref(this->dtype),
@@ -114,8 +111,7 @@ struct Conv2dConfig {
             std::cref(this->enable_act_double_buffer),
             std::cref(this->enable_weights_double_buffer),
             std::cref(this->enable_split_reader),
-            std::cref(this->enable_subblock_padding),
-            std::cref(this->enable_halo_split_reader));
+            std::cref(this->enable_subblock_padding));
     }
 };
 
ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp

@@ -32,7 +32,6 @@ template <
     bool is_read,
     bool is_col_major,
     bool is_remote_config,
-    bool enable_split_reader,
     bool is_reader>
 void copy_sticks_async(
     const tt_l1_ptr uint16_t* config_data,
@@ -104,8 +103,7 @@ void kernel_main() {
     constexpr bool is_col_major = get_compile_time_arg_val(12) == 1;
     constexpr uint32_t is_width_sharded = get_compile_time_arg_val(13);
     constexpr uint32_t input_aligned_page_size = get_compile_time_arg_val(14);
-    constexpr bool enable_split_reader = get_compile_time_arg_val(15);
-    constexpr bool is_reader = get_compile_time_arg_val(16);
+    constexpr bool is_reader = get_compile_time_arg_val(15);
 
     constexpr uint32_t elem_nbytes = sizeof(uint16_t);
     constexpr uint16_t pad_core_id = 0xFFFF;
@@ -115,46 +113,34 @@ void kernel_main() {
     const uint32_t in_base_l1_addr = get_read_ptr(in_cb_id);
     const uint32_t out_base_l1_addr = get_write_ptr(out_cb_id);
 
-    // pad sticks
-    if constexpr (padding_config_cb_id) {
-        // construct the pad stick in its buffer
-        if constexpr (enable_split_reader) {
-            if constexpr (is_reader) {
-                cb_reserve_back(pad_cb_id, 1);
-                const uint16_t pad_val = pad_val_u32;
-                fill_with_val(get_write_ptr(pad_cb_id), stick_nbytes / elem_nbytes, pad_val);
-                cb_push_back(pad_cb_id, 1);
-            } else {
-                cb_wait_front(pad_cb_id, 1);
-            }
-        } else {
-            cb_reserve_back(pad_cb_id, 1);
-            const uint16_t pad_val = pad_val_u32;
-            fill_with_val(get_write_ptr(pad_cb_id), stick_nbytes / elem_nbytes, pad_val);
-            cb_push_back(pad_cb_id, 1);
-        }
+    if constexpr (is_reader) {
+        cb_reserve_back(pad_cb_id, 1);
+        const uint16_t pad_val = pad_val_u32;
+        fill_with_val(get_write_ptr(pad_cb_id), stick_nbytes / elem_nbytes, pad_val);
+        cb_push_back(pad_cb_id, 1);
+    } else {
+        cb_wait_front(pad_cb_id, 1);
+    }
 
-        uint32_t padding_config_l1_addr = get_read_ptr(padding_config_cb_id);
-        volatile tt_l1_ptr uint16_t* config_data =
-            reinterpret_cast<volatile tt_l1_ptr uint16_t*>(padding_config_l1_addr);
-
-        const uint64_t padding_l1_addr = get_noc_addr(my_noc_x, my_noc_y, get_read_ptr(pad_cb_id));
-        const uint32_t dst_base_addr = out_base_l1_addr;
-        uint16_t nsticks = 1;
-        for (uint16_t j = 0; nsticks; j += 2) {
-            uint16_t dst_local_idx = config_data[j + 0];
-            nsticks = config_data[j + 1];
-
-            uint64_t dst_addr = dst_base_addr + dst_local_idx * stick_nbytes;
-            for (uint16_t k = 0; k < nsticks; ++k) {
-                noc_async_read(padding_l1_addr, dst_addr, stick_nbytes);
-                dst_addr += stick_nbytes;
-            }
-        }
-    }
+    uint32_t padding_config_l1_addr = get_read_ptr(padding_config_cb_id);
+    volatile tt_l1_ptr uint16_t* config_data = reinterpret_cast<volatile tt_l1_ptr uint16_t*>(padding_config_l1_addr);
+
+    const uint64_t padding_l1_addr = get_noc_addr(my_noc_x, my_noc_y, get_read_ptr(pad_cb_id));
+    const uint32_t dst_base_addr = out_base_l1_addr;
+    uint16_t nsticks = 1;
+    for (uint16_t j = 0; nsticks; j += 2) {
+        uint16_t dst_local_idx = config_data[j + 0];
+        nsticks = config_data[j + 1];
+
+        uint64_t dst_addr = dst_base_addr + dst_local_idx * stick_nbytes;
+        for (uint16_t k = 0; k < nsticks; ++k) {
+            noc_async_read(padding_l1_addr, dst_addr, stick_nbytes);
+            dst_addr += stick_nbytes;
+        }
+    }
 
     // input shards
-    if constexpr ((enable_split_reader && is_reader) || (!enable_split_reader && local_config_cb_id)) {
+    if constexpr (is_reader) {
         cb_reserve_back(src_cb_id, in_nsticks);
         cb_push_back(src_cb_id, in_nsticks);
     }
@@ -172,7 +158,6 @@ void kernel_main() {
             remote_read,
             is_col_major,
             true,
-            enable_split_reader,
             is_reader>(config_data, my_noc_x, my_noc_y, in_base_l1_addr, out_base_l1_addr);
     }
     // copy data as per local config
@@ -187,7 +172,6 @@ void kernel_main() {
             false,
             is_col_major,
             false,
-            enable_split_reader,
             is_reader>(config_data, my_noc_x, my_noc_y, in_base_l1_addr, out_base_l1_addr);
     }
 
@@ -19,7 +19,6 @@ struct UntilizeWithHaloV2 {
     const tt::tt_metal::MemoryConfig out_mem_config_;
     const bool remote_read_;
     const bool transpose_mcast_;
-    const bool enable_split_reader_;
 
     void validate(const std::vector<Tensor>& input_tensors) const;
     std::vector<ttnn::TensorSpec> compute_output_specs(const std::vector<Tensor>& input_tensors) const;
@@ -34,8 +34,7 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
     const bool remote_read,
     const bool transpose_mcast,
     Tensor& output_tensor,
-    const bool capture_buffers,
-    const bool enable_split_reader) {
+    const bool capture_buffers) {
     IDevice* device = input_tensor.device();
     Buffer* src_buffer = input_tensor.buffer();
     Buffer* dst_buffer = output_tensor.buffer();
@@ -224,35 +223,22 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
         (uint32_t)(transpose_mcast ? 1 : 0),
         is_width_sharded,
         aligned_input_nstick_nbytes,
-        true,
         true};
 
-    if (true) {
-        reader_ct_args[0] = padding_config_cb_id1;
-        reader_ct_args[1] = local_config_cb_id1;
-        reader_ct_args[2] = remote_config_cb_id2;
-    } else {
-        reader_ct_args[0] = 0;
-        reader_ct_args[1] = local_config_cb_id1;
-        reader_ct_args[2] = 0;
-    }
+    reader_ct_args[0] = padding_config_cb_id1;
+    reader_ct_args[1] = local_config_cb_id2;
+    reader_ct_args[2] = remote_config_cb_id1;
     KernelHandle reader_kernel_id0 = CreateKernel(
         program,
         "ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp",
         all_cores,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = reader_ct_args});
 
-    if (true) {
-        reader_ct_args[0] = padding_config_cb_id2;
-        reader_ct_args[1] = local_config_cb_id2;
-        reader_ct_args[2] = remote_config_cb_id1;
-        reader_ct_args[16] = false;
-    } else {
-        reader_ct_args[0] = padding_config_cb_id1;
-        reader_ct_args[1] = 0;
-        reader_ct_args[2] = remote_config_cb_id1;
-    }
+    reader_ct_args[0] = padding_config_cb_id2;
+    reader_ct_args[1] = local_config_cb_id1;
+    reader_ct_args[2] = remote_config_cb_id2;
+    reader_ct_args[15] = false;
 
     KernelHandle reader_kernel_id1 = CreateKernel(
         program,
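The coupled edit here is the compile-time-argument contract: compile args are positional, so the host-side reader_ct_args vector and the kernel's get_compile_time_arg_val(N) calls must agree on indices. Dropping the enable_split_reader slot moves is_reader from index 16 to 15, which is why halo_gather.cpp now reads get_compile_time_arg_val(15) and the second reader kernel patches reader_ct_args[15] = false. A toy model of that index shift (plain Python, purely illustrative; the real mechanism is TT-Metal's compile_args):

def reader_ct_args(with_split_reader_slot):
    # Slots 0..14 stand in for the cb ids and geometry args in the diff above.
    args = [f"arg{i}" for i in range(15)]
    if with_split_reader_slot:
        args += [True, True]  # old layout: [15] = enable_split_reader, [16] = is_reader
    else:
        args += [True]        # new layout: [15] = is_reader
    return args

before, after = reader_ct_args(True), reader_ct_args(False)
assert before[16] == after[15]  # is_reader shifted down one slot
# Hence the old `reader_ct_args[16] = false` for the second kernel
# becomes `reader_ct_args[15] = false` after this commit.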
@@ -23,7 +23,6 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
     const bool remote_read,
     const bool transpose_mcast,
     Tensor& output_tensor,
-    const bool capture_buffers, // Used by halo op to cache internally created config buffers with the program Untilize
-    // with Halo V2 op takes them as inputs from the user, so doesn't capture
-    const bool enable_split_reader = false);
+    const bool capture_buffers); // Used by halo op to cache internally created config buffers with the program
+    // Untilize with Halo V2 op takes them as inputs from the user, so doesn't capture
 } // namespace ttnn::operations::data_movement::detail
@@ -26,8 +26,7 @@ ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke(
     const uint32_t max_out_nsticks_per_core,
     const std::optional<MemoryConfig>& memory_config,
     const bool remote_read,
-    const bool transpose_mcast,
-    const bool enable_split_reader) {
+    const bool transpose_mcast) {
     TT_ASSERT(input_tensor.memory_config().is_sharded());
     TT_ASSERT(
         input_tensor.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED ||
@@ -40,8 +39,7 @@ ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke(
             max_out_nsticks_per_core,
             memory_config.value_or(input_tensor.memory_config()),
             remote_read,
-            transpose_mcast,
-            enable_split_reader},
+            transpose_mcast},
         {input_tensor,
          padding_config1,
          padding_config2,
@@ -68,8 +66,7 @@ ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke(
     const uint32_t max_out_nsticks_per_core,
     const std::optional<MemoryConfig>& memory_config,
     const bool remote_read,
-    const bool transpose_mcast,
-    const bool enable_split_reader) {
+    const bool transpose_mcast) {
     return invoke(
         DefaultQueueId,
         input_tensor,
@@ -84,8 +81,7 @@ ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke(
         max_out_nsticks_per_core,
         memory_config,
         remote_read,
-        transpose_mcast,
-        enable_split_reader);
+        transpose_mcast);
 }
 
 } // namespace ttnn::operations::data_movement
@@ -24,8 +24,7 @@ struct ExecuteUntilizeWithHaloV2 {
         const uint32_t max_out_nsticks_per_core,
         const std::optional<MemoryConfig>& memory_config,
         const bool remote_read,
-        const bool transpose_mcast,
-        const bool enable_split_reader);
+        const bool transpose_mcast);
 
     static ttnn::Tensor invoke(
         const ttnn::Tensor& input_tensor,
@@ -40,8 +39,7 @@ struct ExecuteUntilizeWithHaloV2 {
         const uint32_t max_out_nsticks_per_core,
         const std::optional<MemoryConfig>& memory_config,
         const bool remote_read,
-        const bool transpose_mcast,
-        const bool enable_split_reader);
+        const bool transpose_mcast);
 };
 
 } // namespace operations::data_movement
