Skip to content

Commit

Permalink
#18560: Add workaround for inline writes on BH by writing to L1
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Mar 6, 2025
1 parent 948c8f7 commit 1bc9d59
Show file tree
Hide file tree
Showing 9 changed files with 249 additions and 17 deletions.
105 changes: 105 additions & 0 deletions tests/tt_metal/tt_metal/api/test_noc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,4 +234,109 @@ TEST_F(DeviceFixture, TensixDirectedStreamRegWriteRead) {
}
}

// Both data movement riscs issue inline writes
TEST_F(DeviceFixture, TensixInlineWriteDedicatedNoc) {
CoreCoord writer_core{0, 0};
CoreCoord receiver_core(0, 1);
uint32_t first_receiver_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED);
uint32_t second_receiver_addr = first_receiver_addr + hal.get_alignment(HalMemType::L1);
uint32_t value_to_write = 39;

for (tt_metal::IDevice* device : this->devices_) {
std::vector<uint32_t> readback(32 / sizeof(uint32_t), 0);
tt_metal::detail::WriteToDeviceL1(device, receiver_core, first_receiver_addr, readback);

CoreCoord virtual_receiver_core = device->worker_core_from_logical_core(receiver_core);

tt_metal::Program program = tt_metal::CreateProgram();
tt_metal::KernelHandle kernel0 = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/inline_writer.cpp",
writer_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::NOC_0});

tt_metal::SetRuntimeArgs(
program,
kernel0,
writer_core,
{virtual_receiver_core.x, virtual_receiver_core.y, first_receiver_addr, value_to_write});

tt_metal::KernelHandle kernel1 = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/inline_writer.cpp",
writer_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_1});

tt_metal::SetRuntimeArgs(
program,
kernel1,
writer_core,
{virtual_receiver_core.x, virtual_receiver_core.y, second_receiver_addr, value_to_write + 1});

tt_metal::detail::LaunchProgram(device, program);

tt_metal::detail::ReadFromDeviceL1(device, receiver_core, first_receiver_addr, 32, readback);
EXPECT_EQ(readback[0], value_to_write);
EXPECT_EQ(readback[4], value_to_write + 1);
}
}

// Both data movement riscs issue inline writes using the same noc
TEST_F(DeviceFixture, TensixInlineWriteDynamicNoc) {
CoreCoord writer_core{0, 0};
CoreCoord receiver_core(0, 1);
uint32_t receiver_addr0 = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED);
uint32_t receiver_addr2 = receiver_addr0 + (2 * hal.get_alignment(HalMemType::L1));
uint32_t value_to_write = 39;

for (tt_metal::IDevice* device : this->devices_) {
std::vector<uint32_t> readback(80 / sizeof(uint32_t), 0);
tt_metal::detail::WriteToDeviceL1(device, receiver_core, receiver_addr0, readback);

CoreCoord virtual_receiver_core = device->worker_core_from_logical_core(receiver_core);

tt_metal::Program program = tt_metal::CreateProgram();
tt_metal::KernelHandle kernel0 = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/inline_writer.cpp",
writer_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_0,
.noc = tt_metal::NOC::NOC_0,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC});

tt_metal::SetRuntimeArgs(
program,
kernel0,
writer_core,
{virtual_receiver_core.x, virtual_receiver_core.y, receiver_addr0, value_to_write});

tt_metal::KernelHandle kernel1 = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/inline_writer.cpp",
writer_core,
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_1,
.noc = tt_metal::NOC::NOC_1,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC});

tt_metal::SetRuntimeArgs(
program,
kernel1,
writer_core,
{virtual_receiver_core.x, virtual_receiver_core.y, receiver_addr2, value_to_write + 2});

tt_metal::detail::LaunchProgram(device, program);

tt_metal::detail::ReadFromDeviceL1(device, receiver_core, receiver_addr0, 64, readback);
uint32_t expected_value = value_to_write;
for (int i = 0; i < 4; i++) {
EXPECT_EQ(readback[i * 4], expected_value);
expected_value++;
}
}
}

} // namespace tt::tt_metal
36 changes: 36 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/dataflow/inline_writer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "debug/dprint.h"

void kernel_main() {
uint32_t dst_noc_x = get_arg_val<uint32_t>(0);
uint32_t dst_noc_y = get_arg_val<uint32_t>(1);
uint32_t dst_addr = get_arg_val<uint32_t>(2);
uint32_t value_to_write = get_arg_val<uint32_t>(3);

uint32_t first_noc;
if constexpr (noc_mode == DM_DYNAMIC_NOC) {
first_noc = 0;
} else {
first_noc = noc_index;
}

for (uint32_t i = 0; i < 2; i++) {
uint32_t noc_to_use = (i % 2) == 0 ? first_noc : 1 - first_noc;
uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr, noc_to_use);
noc_inline_dw_write(dst_noc_addr, value_to_write, 0xF, noc_to_use);
if constexpr (noc_mode != DM_DYNAMIC_NOC) {
break;
}
dst_addr += L1_ALIGNMENT;
value_to_write++;
}

noc_async_write_barrier(noc_index);
if constexpr (noc_mode == DM_DYNAMIC_NOC) {
noc_async_write_barrier(1 - noc_index);
}
}
8 changes: 7 additions & 1 deletion tt_metal/hw/inc/blackhole/dev_mem_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,13 @@
#define MEM_BOOT_CODE_BASE 0
#define MEM_NOC_ATOMIC_RET_VAL_ADDR 4
#define MEM_L1_BARRIER 12
#define MEM_MAILBOX_BASE 16
// On Blackhole issuing inline writes and atomics requires all 4 memory ports to accept the transaction at the same
// time. If one port on the receipient has no back-pressure then the transaction will hang because there is no mechanism
// to allow one memory port to move ahead of another. To workaround this hang, we emulate inline writes on Blackhole by
// writing the value to be written to local L1 first and then issue a noc async write.
#define MEM_L1_INLINE_BASE 16
#define MEM_L1_INLINE_SIZE 64 // Each risc and noc has 16B to store value written out by inline writes
#define MEM_MAILBOX_BASE (MEM_L1_INLINE_BASE + MEM_L1_INLINE_SIZE)
// Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small
#define MEM_MAILBOX_SIZE 12640
#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE)
Expand Down
44 changes: 42 additions & 2 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1387,10 +1387,49 @@ void noc_semaphore_set(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) {
* | be | Byte-enable | uint8_t | 0x1-0xF | False |
*/
// clang-format on
FORCE_INLINE
void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t noc = noc_index) {
template <bool write_to_stream_reg = false>
FORCE_INLINE void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t noc = noc_index) {
WAYPOINT("NWIW");
DEBUG_SANITIZE_NOC_ADDR(noc, addr, 4);
#ifdef ARCH_BLACKHOLE
// On Blackhole issuing inline writes and atomics requires all 4 memory ports to accept the transaction at the same
// time. If one port on the receipient has no back-pressure then the transaction will hang because there is no
// mechanism to allow one memory port to move ahead of another. To workaround this hang, we emulate inline writes on
// Blackhole by writing the value to be written to local L1 first and then issue a noc async write.
if constexpr (write_to_stream_reg) {
noc_fast_write_dw_inline<noc_mode>(
noc,
write_at_cmd_buf,
val,
addr,
be, // byte-enable
NOC_UNICAST_WRITE_VC,
false, // mcast
false // posted
);
WAYPOINT("NWID");
return;
}

ASSERT(be == 0xF);
uint32_t src_addr = (uint32_t)MEM_L1_INLINE_BASE + ((uint32_t)(noc + proc_type) * L1_ALIGNMENT);
volatile tt_l1_ptr uint32_t* interim_addr_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(src_addr);
*interim_addr_ptr = val;
ncrisc_noc_fast_write_any_len<noc_mode>(
noc,
write_cmd_buf,
src_addr,
addr,
4,
NOC_UNICAST_WRITE_VC,
false, // mcast
false, // linked
1, // num_dests
true, // multicast_path_reserve
false // posted
);
noc_async_writes_flushed(noc);
#else
noc_fast_write_dw_inline<noc_mode>(
noc,
write_at_cmd_buf,
Expand All @@ -1401,6 +1440,7 @@ void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t
false, // mcast
false // posted
);
#endif
WAYPOINT("NWID");
}

Expand Down
29 changes: 17 additions & 12 deletions tt_metal/impl/dispatch/kernels/cq_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,39 +121,43 @@ enum CQNocSend {
CQ_NOC_SEND = 1,
};

template <enum CQNocFlags flags, enum CQNocWait wait = CQ_NOC_WAIT, enum CQNocSend send = CQ_NOC_SEND>
template <
enum CQNocFlags flags,
enum CQNocWait wait = CQ_NOC_WAIT,
enum CQNocSend send = CQ_NOC_SEND,
uint32_t cmd_buf = NCRISC_WR_CMD_BUF>
FORCE_INLINE void cq_noc_async_write_with_state(
uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) {
if constexpr (wait) {
WAYPOINT("CNSW");
while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF));
while (!noc_cmd_buf_ready(noc_index, cmd_buf));
WAYPOINT("CNSD");
}

if constexpr (flags & CQ_NOC_FLAG_SRC) {
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_LO, src_addr);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_TARG_ADDR_LO, src_addr);
}
if constexpr (flags & CQ_NOC_FLAG_DST) {
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)dst_addr);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dst_addr);
}
if constexpr (flags & CQ_NOC_FLAG_NOC) {
#ifdef ARCH_BLACKHOLE
// Handles writing to PCIe
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_RET_ADDR_MID, (uint32_t)(dst_addr >> 32) & 0x1000000F);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dst_addr >> 32) & 0x1000000F);
#endif
NOC_CMD_BUF_WRITE_REG(
noc_index,
NCRISC_WR_CMD_BUF,
cmd_buf,
NOC_RET_ADDR_COORDINATE,
(uint32_t)(dst_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK);
}
if constexpr (flags & CQ_NOC_FLAG_LEN) {
ASSERT(size <= NOC_MAX_BURST_SIZE);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE, size);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_AT_LEN_BE, size);
}
if constexpr (send) {
DEBUG_SANITIZE_NOC_WRITE_TRANSACTION_FROM_STATE(noc_index);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
}
}

Expand Down Expand Up @@ -182,11 +186,11 @@ uint32_t cq_noc_async_write_with_state_any_len(
}
}

template <enum CQNocFlags flags, bool mcast = false, bool linked = false>
template <enum CQNocFlags flags, bool mcast = false, bool linked = false, uint32_t cmd_buf = NCRISC_WR_CMD_BUF>
FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) {
WAYPOINT("CNIW");
uint32_t heartbeat = 0;
while (!noc_cmd_buf_ready(noc_index, NCRISC_WR_CMD_BUF)) {
while (!noc_cmd_buf_ready(noc_index, cmd_buf)) {
IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat);
}
WAYPOINT("CNID");
Expand All @@ -200,9 +204,9 @@ FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_
(mcast ? ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) : 0x0) |
(posted ? 0 : NOC_CMD_RESP_MARKED);

NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_WR_CMD_BUF, NOC_CTRL, noc_cmd_field);
NOC_CMD_BUF_WRITE_REG(noc_index, cmd_buf, NOC_CTRL, noc_cmd_field);

cq_noc_async_write_with_state<flags, CQ_NOC_wait, CQ_NOC_send>(src_addr, dst_addr, size);
cq_noc_async_write_with_state<flags, CQ_NOC_wait, CQ_NOC_send, cmd_buf>(src_addr, dst_addr, size);
}

template <enum CQNocInlineFlags flags, enum CQNocWait wait = CQ_NOC_WAIT, enum CQNocSend send = CQ_NOC_SEND>
Expand Down Expand Up @@ -244,6 +248,7 @@ FORCE_INLINE void cq_noc_inline_dw_write_with_state(uint64_t dst_addr, uint32_t

// TODO: noc_inline_dw_write currently hardcodes most of these parameters, which we copied here
// If needed, add templates for setting these
// TODO: uplift for BH to not do inline write
template <enum CQNocInlineFlags flags>
FORCE_INLINE void cq_noc_inline_dw_write_init_state(uint64_t dst_addr, uint32_t val = 0, uint8_t be = 0xF) {
WAYPOINT("NIIW");
Expand Down
15 changes: 15 additions & 0 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,16 @@ void relay_to_next_cb(
// counter so we would only need to inc atomics downstream
uint64_t dst = get_noc_addr_helper(downstream_noc_xy, downstream_cb_data_ptr);
cq_noc_async_write_init_state<CQ_NOC_sNdl>(0, dst, 0);
#ifdef ARCH_BLACKHOLE
// On Blackhole inline writes are disabled so use cq_noc_async_write_init_state with inline write cmd buf
// See comment in `noc_inline_dw_write` for more details
uint32_t inline_l1_src_addr = (uint32_t)MEM_L1_INLINE_BASE + ((uint32_t)(noc_index + proc_type) * L1_ALIGNMENT);
volatile tt_l1_ptr uint32_t* inline_l1_src_addr_ptr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(inline_l1_src_addr);
cq_noc_async_write_init_state<CQ_NOC_sNdl, false, false, NCRISC_WR_REG_CMD_BUF>(0, dst, 0);
#else
cq_noc_inline_dw_write_init_state<CQ_NOC_INLINE_Ndvb>(dst);
#endif

while (length > 0) {
ASSERT(downstream_cb_end > downstream_cb_data_ptr);
Expand All @@ -301,8 +310,14 @@ void relay_to_next_cb(

if constexpr (preamble_size > 0) {
uint32_t flag;
#ifdef ARCH_BLACKHOLE
*inline_l1_src_addr_ptr = xfer_size + preamble_size + not_end_of_cmd;
cq_noc_async_write_with_state<CQ_NOC_SnDL, CQ_NOC_WAIT, CQ_NOC_SEND, NCRISC_WR_REG_CMD_BUF>(
inline_l1_src_addr, downstream_cb_data_ptr, 4);
#else
cq_noc_inline_dw_write_with_state<CQ_NOC_INLINE_nDVB>(
downstream_cb_data_ptr, xfer_size + preamble_size + not_end_of_cmd);
#endif
noc_nonposted_writes_num_issued[noc_index]++;
noc_nonposted_writes_acked[noc_index]++;
downstream_cb_data_ptr += preamble_size;
Expand Down
24 changes: 24 additions & 0 deletions tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,29 @@ FORCE_INLINE
void dispatch_s_noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t noc_id, uint8_t be = 0xF) {
WAYPOINT("NWIW");
DEBUG_SANITIZE_NOC_ADDR(noc_id, addr, 4);
#ifdef ARCH_BLACKHOLE
// On Blackhole issuing inline writes and atomics requires all 4 memory ports to accept the transaction at the same
// time. If one port on the receipient has no back-pressure then the transaction will hang because there is no
// mechanism to allow one memory port to move ahead of another. To workaround this hang, we emulate inline writes on
// Blackhole by writing the value to be written to local L1 first and then issue a noc async write.
uint32_t src_addr = (uint32_t)MEM_L1_INLINE_BASE + ((uint32_t)(noc + proc_type) * L1_ALIGNMENT);
volatile tt_l1_ptr uint32_t* interim_addr_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(src_addr);
*interim_addr_ptr = val;
ncrisc_noc_fast_write_any_len<noc_mode>(
noc_id,
DISPATCH_S_WR_REG_CMD_BUF,
src_addr,
addr,
4,
NOC_UNICAST_WRITE_VC,
false, // mcast
false, // linked
1, // num_dests
true, // multicast_path_reserve
false // posted
);
noc_async_writes_flushed(noc_id);
#else
noc_fast_write_dw_inline<noc_mode>(
noc_id,
DISPATCH_S_WR_REG_CMD_BUF,
Expand All @@ -116,6 +139,7 @@ void dispatch_s_noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t noc_id,
false, // mcast
false // posted
);
#endif
WAYPOINT("NWID");
}

Expand Down
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/kernels/packet_queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ class packet_queue_state_t {
eth_write_remote_reg(reg_addr, val);
} else {
const auto dest_addr = get_noc_addr(this->remote_x, this->remote_y, reg_addr);
noc_inline_dw_write(dest_addr, val);
noc_inline_dw_write<true>(dest_addr, val);
}
}

Expand Down
3 changes: 2 additions & 1 deletion tt_metal/impl/kernels/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ std::vector<ll_api::memory const*> const& Kernel::binaries(uint32_t build_key) c
}

std::string DataMovementKernel::config_hash() const {
return fmt::format("{}", magic_enum::enum_name(this->config_.noc));
return fmt::format(
"{}_{}", magic_enum::enum_name(this->config_.noc), magic_enum::enum_name(this->config_.noc_mode));
}

// Add "eth_" to the hash to differentiate between erisc and brisc.
Expand Down

0 comments on commit 1bc9d59

Please sign in to comment.