From 10366fdfb058c1c013cc0bc6fb3e9ca3351fcf7d Mon Sep 17 00:00:00 2001 From: Ammar Vora Date: Thu, 6 Mar 2025 12:41:26 -0500 Subject: [PATCH] Remove dynamic noc for in1 kernel for ring matmul (#18594) ### Ticket - PR: #18423 ### Problem description Currently, the ring matmul has enabled dynamic noc for the in1 writer kernel, however it is not being used there. ### What's changed - Removed dynamic noc for in1 kernel - Disable dynamic noc for the `hop_cores` configuration. This is relevant to Llama TG, as we use a NOC1 only matmul there, and therefore a dynamic noc is not necessary. ### Checklist - [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/runs/13702486437/job/38321838490) CI passes - [x] [TG Prefetcher Tests](https://github.com/tenstorrent/tt-metal/actions/runs/13653807930) Passes --- ...ti_core_reuse_mcast_1d_program_factory.cpp | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index 6263a44359d..bb1837f10ab 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -27,7 +27,7 @@ uint32_t get_preferred_noc( const ttnn::CoreCoord src, const ttnn::CoreCoord dst, const tt_metal::IDevice* device, - const bool use_hop_cores = false) { + const bool use_dedicated_noc = false) { /* NOC0: Preferred +x -> +y NOC1: Preferred -y -> -x @@ -55,7 +55,7 @@ uint32_t get_preferred_noc( // std::cout << "src: (" << src_x << ", " << src_y << "), dst: (" << dst_x << ", " << dst_y << "), noc: " << noc << // std::endl; - return use_hop_cores ? 1 : noc; + return use_dedicated_noc ? 1 : noc; } tt::tt_metal::operation::ProgramWithCallbacks create_program_mcast_in0( @@ -1890,15 +1890,29 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( tt_metal::NOC in0_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMWrite(device->arch()); tt_metal::NOC in1_noc = tt::tt_metal::detail::GetPreferredNOCForDRAMRead(device->arch()); + bool use_dedicated_noc = use_hop_cores || in1_is_dram_interleaved; + tt_metal::NOC_MODE noc_mode = + use_dedicated_noc ? tt_metal::NOC_MODE::DM_DEDICATED_NOC : tt_metal::NOC_MODE::DM_DYNAMIC_NOC; + /* Create the kernels */ auto mm_kernel_in0_id = tt_metal::CreateKernel( program, "ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp", - ring_cores, + all_cores, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, + .noc = in0_noc, + .noc_mode = noc_mode, + .compile_args = in0_sender_compile_time_args}); + + auto mm_kernel_in0_hop_cores_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp", + hop_cores, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = in0_noc, - .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .noc_mode = noc_mode, .compile_args = in0_sender_compile_time_args}); auto mm_kernel_in1_sender_writer_id = tt_metal::CreateKernel( @@ -1908,7 +1922,7 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = in1_noc, - .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .noc_mode = noc_mode, .compile_args = in1_sender_writer_compile_time_args, .defines = mm_in1_kernel_defines}); @@ -2033,7 +2047,7 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( next_core = cores[next_i % num_cores]; } const auto& next_core_noc = device->worker_core_from_logical_core(next_core); - uint32_t noc = get_preferred_noc(core_noc, next_core_noc, device, use_hop_cores); + uint32_t noc = get_preferred_noc(core_noc, next_core_noc, device, use_dedicated_noc); std::vector mm_in0_args = { i, // ring_index @@ -2078,7 +2092,7 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( /* in0 */ CoreCoord next_core = end_of_hop ? cores[num_cores - 1] : hcores[i + 1]; const auto& next_core_noc = device->worker_core_from_logical_core(next_core); - uint32_t noc = get_preferred_noc(core_noc, next_core_noc, device, use_hop_cores); + uint32_t noc = get_preferred_noc(core_noc, next_core_noc, device, use_dedicated_noc); std::vector mm_in0_args = { 0, // ring_index @@ -2088,11 +2102,18 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( (std::uint32_t)true, // is_hop_core (std::uint32_t)end_of_hop, // end_of_hop }; - tt_metal::SetRuntimeArgs(program, mm_kernel_in0_id, core, mm_in0_args); + tt_metal::SetRuntimeArgs(program, mm_kernel_in0_hop_cores_id, core, mm_in0_args); } auto override_runtime_arguments_callback = - [mm_kernel_in0_id, mm_kernel_in1_sender_writer_id, cb_src0, cb_src1, cb_output, num_cores, cores, global_cb]( + [mm_kernel_in0_id, + mm_kernel_in0_hop_cores_id, + mm_kernel_in1_sender_writer_id, + cb_src0, + cb_src1, + cb_output, + num_cores, + cores]( const void* operation, tt::tt_metal::Program& program, const std::vector& input_tensors, @@ -2101,6 +2122,8 @@ tt::tt_metal::operation::ProgramWithCallbacks create_program_gather_in0( TT_ASSERT(input_tensors.size() + optional_input_tensors.size() == 3); TT_ASSERT(output_tensors.size() == 1); + auto& global_cb = static_cast(operation)->global_cb; + auto src_buffer_a = input_tensors[0].buffer(); auto src_buffer_b = input_tensors[1].buffer(); auto dst_buffer = output_tensors[0].buffer();