diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index 10ac5456fd8..d0f0fea005b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -565,12 +565,6 @@ int main(int argc, char** argv) { host_completion_queue_wr_ptr, dev_completion_queue_wr_ptr, dev_completion_queue_rd_ptr, - 0, - 0, - 0, - 0, - 0, - 0, true, // is_dram_variant true, // is_host_variant }; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index e3e68b8a78e..0c6b581e7c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -66,10 +66,6 @@ constexpr CoreType DISPATCH_CORE_TYPE = CoreType::WORKER; ////////////////////////////////////////////////////////////////////////////////////////// using std::vector; using namespace tt; -using tt::packet_queue::dispatch_packet_header_t; -using tt::packet_queue::DispatchRemoteNetworkType; -using tt::packet_queue::packet_switch_4B_pack; -using tt::packet_queue::packet_switch_dest_pack; uint32_t iterations_g = DEFAULT_ITERATIONS; @@ -1843,11 +1839,11 @@ void configure_for_single_chip( const uint32_t dispatch_h_cb_sem = dispatch_h_core_sem_0_id; std::vector prefetch_compile_args = { - dispatch_buffer_base, // overridden below for prefetch_h + dispatch_buffer_base, // overridden below for prefetch_h DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE, // overridden below for prefetch_h - dispatch_buffer_pages, // overridden below for prefetch_h - prefetch_downstream_cb_sem, // overridden below for prefetch_d - dispatch_cb_sem, // overridden below for prefetch_h + dispatch_buffer_pages, // overridden below for prefetch_h + prefetch_downstream_cb_sem, // overridden below for prefetch_d + dispatch_cb_sem, // overridden below for prefetch_h dev_hugepage_base_g, hugepage_issue_buffer_size_g, prefetch_q_base, @@ -1864,17 +1860,11 @@ void configure_for_single_chip( prefetch_downstream_cb_sem, // prefetch_d only DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE, DispatchSettings::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only - 0, // unused: for prefetch_hd <--> dispatch_hd - 0, // unused: for prefetch_hd <--> dispatch_hd - 0, // unused: for prefetch_hd <--> dispatch_hd - 0, // unused: for prefetch_hd <--> dispatch_hd - 0, // unused: for prefetch_hd <--> dispatch_hd - 0, - 0, - 0, - 0, - 0, - 0, + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd + 0, // unused: for prefetch_hd <--> dispatch_hd }; constexpr NOC my_noc_index = NOC::NOC_0; @@ -2156,14 +2146,7 @@ void configure_for_single_chip( 0, host_completion_queue_wr_ptr, dev_completion_queue_wr_ptr, - dev_completion_queue_rd_ptr, - 0, - 0, - 0, - 0, - 0, - 0, - }; + dev_completion_queue_rd_ptr}; CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g; if (split_dispatcher_g) { @@ -3522,10 +3505,6 @@ int main(int argc, char** argv) { } if (packetized_path_en_g) { - using tt::packet_queue::PACKET_QUEUE_TEST_PASS; - using tt::packet_queue::packet_queue_test_status_to_string; - using tt::packet_queue::PQ_TEST_STATUS_INDEX; - vector prefetch_relay_mux_results = tt::llrt::read_hex_vec_from_core( device_r->id(), phys_prefetch_relay_mux_core, diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp index 22ea2c25120..76737d354b4 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp @@ -59,7 +59,7 @@ struct input_queue_raw_state_t { uint32_t max_packet_size_words, uint64_t total_data_words) { this->curr_packet_dest = this->num_dests_sent_last_packet + dest_endpoint_start_id; - this->curr_packet_flags = tt::packet_queue::DispatchPacketFlag::PACKET_TEST_LAST; + this->curr_packet_flags = DispatchPacketFlag::PACKET_TEST_LAST; this->curr_packet_size_words = 2; this->curr_packet_words_remaining = this->curr_packet_size_words; this->data_words_input += 2; @@ -174,10 +174,10 @@ constexpr auto select_input_queue() { } inline void fill_packet_data(tt_l1_ptr uint32_t* start_addr, uint32_t num_words, uint32_t start_val) { - tt_l1_ptr uint32_t* addr = start_addr + (tt::packet_queue::PACKET_WORD_SIZE_BYTES / 4 - 1); + tt_l1_ptr uint32_t* addr = start_addr + (PACKET_WORD_SIZE_BYTES / 4 - 1); for (uint32_t i = 0; i < num_words; i++) { *addr = start_val++; - addr += (tt::packet_queue::PACKET_WORD_SIZE_BYTES / 4); + addr += (PACKET_WORD_SIZE_BYTES / 4); } } @@ -188,7 +188,7 @@ inline bool check_packet_data( uint32_t& mismatch_addr, uint32_t& mismatch_val, uint32_t& expected_val) { - tt_l1_ptr uint32_t* addr = start_addr + (tt::packet_queue::PACKET_WORD_SIZE_BYTES / 4 - 1); + tt_l1_ptr uint32_t* addr = start_addr + (PACKET_WORD_SIZE_BYTES / 4 - 1); for (uint32_t i = 0; i < num_words; i++) { if (*addr != start_val) { mismatch_addr = reinterpret_cast(addr); @@ -197,7 +197,7 @@ inline bool check_packet_data( return false; } start_val++; - addr += (tt::packet_queue::PACKET_WORD_SIZE_BYTES / 4); + addr += (PACKET_WORD_SIZE_BYTES / 4); } return true; } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp index d7e2b532fd9..911d0c5bb76 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp @@ -7,9 +7,6 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" -using tt::packet_queue::PACKET_WORD_SIZE_BYTES; -using tt::packet_queue::dispatch_packet_header_t; -using tt::packet_queue::DispatchRemoteNetworkType; constexpr uint32_t endpoint_id = get_compile_time_arg_val(0); @@ -57,12 +54,11 @@ constexpr uint32_t disable_header_check = get_compile_time_arg_val(18); // the same random seed as the corresponding traffic_gen_tx input_queue_rnd_state_t src_rnd_state[num_src_endpoints]; -tt::packet_queue::packet_input_queue_state_t input_queues[tt::packet_queue::MAX_SWITCH_FAN_IN]; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN]; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; void kernel_main() { - using namespace tt::packet_queue; zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp index b584c8bda02..2dd8613a562 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp @@ -8,10 +8,6 @@ #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen.hpp" -using tt::packet_queue::PACKET_WORD_SIZE_BYTES; -using tt::packet_queue::dispatch_packet_header_t; -using tt::packet_queue::DispatchRemoteNetworkType; - constexpr uint32_t src_endpoint_id = get_compile_time_arg_val(0); constexpr uint32_t num_dest_endpoints = get_compile_time_arg_val(1); @@ -65,16 +61,16 @@ constexpr uint32_t data_sent_per_iter_high = get_compile_time_arg_val(21); constexpr uint32_t input_queue_id = 0; constexpr uint32_t output_queue_id = 1; -tt::packet_queue::packet_input_queue_state_t input_queue; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_input_queue_state_t input_queue; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; -tt::packet_queue::packet_output_queue_state_t output_queue; -using output_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using output_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_output_queue_state_t output_queue; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; -constexpr tt::packet_queue::packet_input_queue_state_t* input_queue_ptr = &input_queue; -constexpr tt::packet_queue::packet_output_queue_state_t* output_queue_ptr = &output_queue; +constexpr packet_input_queue_state_t* input_queue_ptr = &input_queue; +constexpr packet_output_queue_state_t* output_queue_ptr = &output_queue; // input_queue_rnd_state_t input_queue_state; auto input_queue_state = select_input_queue(); @@ -135,8 +131,6 @@ inline bool input_queue_handler() { } void kernel_main() { - using namespace tt::packet_queue; - zero_l1_buf(test_results, test_results_size_bytes); test_results[PQ_TEST_STATUS_INDEX] = PACKET_QUEUE_TEST_STARTED; test_results[PQ_TEST_MISC_INDEX] = 0xff000000; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 279fad46a31..eda89407079 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -12,11 +12,11 @@ #include "routing_test_common.hpp" #include "llrt.hpp" +using std::vector; +using namespace tt; +using json = nlohmann::json; + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; - using json = nlohmann::json; constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index e1f7f41026e..6696d1e619c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -11,10 +11,10 @@ #include "routing_test_common.hpp" #include "llrt.hpp" +using std::vector; +using namespace tt; + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; constexpr uint32_t default_prng_seed = 0x100; constexpr uint32_t default_data_kb_per_tx = 64*1024; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index c1dd77f2cd8..c9f075b3eaa 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -12,10 +12,10 @@ #include "utils.hpp" #include "llrt.hpp" +using std::vector; +using namespace tt; + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; bool pass = true; try { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index 93f9dfb7114..bfaaadb2a0c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -11,12 +11,11 @@ #include "test_common.hpp" #include "routing_test_common.hpp" -int main(int argc, char** argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; +using std::vector; +using namespace tt; +using json = nlohmann::json; - using json = nlohmann::json; +int main(int argc, char** argv) { constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index a80d2db15b2..23a4e9db4f7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -11,12 +11,11 @@ #include "test_common.hpp" #include "routing_test_common.hpp" -int main(int argc, char** argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; - using json = nlohmann::json; +using std::vector; +using namespace tt; +using json = nlohmann::json; +int main(int argc, char** argv) { constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; constexpr uint32_t default_rx_x = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 26e26d0bc23..c34eea39242 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -11,11 +11,11 @@ #include "test_common.hpp" #include "routing_test_common.hpp" +using std::vector; +using namespace tt; +using json = nlohmann::json; + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; - using json = nlohmann::json; constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 3de10e62937..28a89013e54 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -11,11 +11,11 @@ #include "test_common.hpp" #include "routing_test_common.hpp" +using std::vector; +using namespace tt; +using json = nlohmann::json; + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; - using json = nlohmann::json; constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 0f8fa1be867..b4c37a1ff14 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -11,11 +11,12 @@ #include "test_common.hpp" #include "routing_test_common.hpp" +using std::vector; +using namespace tt; +using json = nlohmann::json; + + int main(int argc, char **argv) { - using std::vector; - using namespace tt; - using namespace tt::packet_queue; - using json = nlohmann::json; constexpr uint32_t default_tx_x = 0; constexpr uint32_t default_tx_y = 0; diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 11e8f03c906..d45c2468f77 100644 --- a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -28,7 +28,6 @@ target_sources( api/tt-metalium/cq_commands.hpp api/tt-metalium/dev_msgs.h api/tt-metalium/tt_log.h - api/tt-metalium/fabric_host_interface.h core_descriptors/blackhole_140_arch.yaml core_descriptors/grayskull_120_arch.yaml core_descriptors/wormhole_b0_80_arch.yaml diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 04112a70263..c9121012948 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "cq_commands.hpp" @@ -18,7 +17,6 @@ #include "dispatch_settings.hpp" #include "helpers.hpp" #include "buffer.hpp" -#include "rtoptions.hpp" #include "umd/device/tt_core_coordinates.h" namespace tt::tt_metal { @@ -35,9 +33,7 @@ enum class CommandQueueDeviceAddrType : uint8_t { COMPLETION_Q1_LAST_EVENT = 5, DISPATCH_S_SYNC_SEM = 6, DISPATCH_MESSAGE = 7, - // Enable for FD on Fabric - FABRIC_INTERFACE = 8, - UNRESERVED = 9, + UNRESERVED = 8 }; enum class CommandQueueHostAddrType : uint8_t { @@ -175,12 +171,6 @@ class DispatchMemMap { device_cq_addr_sizes_[dev_addr_idx] = settings.dispatch_s_sync_sem_; } else if (dev_addr_type == CommandQueueDeviceAddrType::DISPATCH_MESSAGE) { device_cq_addr_sizes_[dev_addr_idx] = settings.dispatch_message_; - } else if (dev_addr_type == CommandQueueDeviceAddrType::FABRIC_INTERFACE) { - if (llrt::RunTimeOptions::get_instance().get_fd_fabric()) { - device_cq_addr_sizes_[dev_addr_idx] = tt_fabric::PACKET_HEADER_SIZE_BYTES; - } else { - device_cq_addr_sizes_[dev_addr_idx] = 0; - } } else { device_cq_addr_sizes_[dev_addr_idx] = settings.other_ptrs_size; } @@ -194,8 +184,6 @@ class DispatchMemMap { CommandQueueDeviceAddrType dev_addr_type = magic_enum::enum_value(dev_addr_idx); if (dev_addr_type == CommandQueueDeviceAddrType::UNRESERVED) { device_cq_addrs_[dev_addr_idx] = align(device_cq_addrs_[dev_addr_idx], pcie_alignment); - } else if (dev_addr_type == CommandQueueDeviceAddrType::FABRIC_INTERFACE) { - device_cq_addrs_[dev_addr_idx] = align(device_cq_addrs_[dev_addr_idx], l1_alignment); } } diff --git a/tt_metal/api/tt-metalium/control_plane.hpp b/tt_metal/api/tt-metalium/control_plane.hpp index 06dac3f4e84..0f62f81b7a0 100644 --- a/tt_metal/api/tt-metalium/control_plane.hpp +++ b/tt_metal/api/tt-metalium/control_plane.hpp @@ -43,7 +43,7 @@ class ControlPlane { chip_id_t dst_chip_id, chan_id_t src_chan_id) const; - // Return routers to get to the destination chip, avoid local eth to eth routing. CoreCoord is a virtual coord. + // Return routers to get to the destination chip, avoid local eth to eth routing std::vector> get_routers_to_chip( mesh_id_t src_mesh_id, chip_id_t src_chip_id, mesh_id_t dst_mesh_id, chip_id_t dst_chip_id) const; diff --git a/tt_metal/api/tt-metalium/device_pool.hpp b/tt_metal/api/tt-metalium/device_pool.hpp index 2b562c23da8..3becae3e6f6 100644 --- a/tt_metal/api/tt-metalium/device_pool.hpp +++ b/tt_metal/api/tt-metalium/device_pool.hpp @@ -87,10 +87,7 @@ class DevicePool { void init_firmware_on_active_devices() const; void init_profiler_devices() const; void activate_device(chip_id_t id); - // Initialize state on the host for this device - void initialize_host(tt_metal::IDevice* dev) const; - // Initialize state for activated devices - void initialize_active_devices() const; + void initialize_device(tt_metal::IDevice* dev) const; void add_devices_to_pool(const std::vector& device_ids); void wait_for_fabric_master_router_sync() const; tt_metal::IDevice* get_device(chip_id_t id) const; diff --git a/tt_metal/api/tt-metalium/dispatch_core_common.hpp b/tt_metal/api/tt-metalium/dispatch_core_common.hpp index b0d12e4d36f..322d8d57641 100644 --- a/tt_metal/api/tt-metalium/dispatch_core_common.hpp +++ b/tt_metal/api/tt-metalium/dispatch_core_common.hpp @@ -31,8 +31,7 @@ enum DispatchWorkerType : uint32_t { US_TUNNELER_REMOTE = 14, PACKET_ROUTER_MUX = 15, PACKET_ROUTER_DEMUX = 16, - FABRIC_ROUTER_VC = 17, - COUNT, + COUNT = 17 }; // NOC ID used by dispatch kernels to communicate with downstream cores. This parameter diff --git a/tt_metal/api/tt-metalium/fabric_host_interface.h b/tt_metal/api/tt-metalium/fabric_host_interface.h index 3cbcff00995..fbb7cf87068 100644 --- a/tt_metal/api/tt-metalium/fabric_host_interface.h +++ b/tt_metal/api/tt-metalium/fabric_host_interface.h @@ -4,7 +4,15 @@ #pragma once -#include +#include + +#if defined(KERNEL_BUILD) || defined(FW_BUILD) +#include "risc_attribs.h" +#else +#define tt_l1_ptr +#define tt_reg_ptr +#define FORCE_INLINE inline +#endif // TODO: move routing table here namespace tt::tt_fabric { diff --git a/tt_metal/api/tt-metalium/rtoptions.hpp b/tt_metal/api/tt-metalium/rtoptions.hpp index 292874c2fd7..18cb16a8f81 100644 --- a/tt_metal/api/tt-metalium/rtoptions.hpp +++ b/tt_metal/api/tt-metalium/rtoptions.hpp @@ -125,8 +125,6 @@ class RunTimeOptions { bool validate_kernel_binaries = false; unsigned num_hw_cqs = 1; - bool fb_fabric_en = false; - bool enable_dispatch_data_collection = false; // HW can clear Blackhole's L1 data cache psuedo-randomly once every 128 transactions @@ -306,8 +304,6 @@ class RunTimeOptions { inline unsigned get_num_hw_cqs() { return num_hw_cqs; } inline void set_num_hw_cqs(unsigned num) { num_hw_cqs = num; } - inline bool get_fd_fabric() const { return fb_fabric_en; } - inline uint32_t get_watcher_debug_delay() { return watcher_debug_delay; } inline void set_watcher_debug_delay(uint32_t delay) { watcher_debug_delay = delay; } @@ -316,7 +312,7 @@ class RunTimeOptions { inline bool get_hw_cache_invalidation_enabled() const { return this->enable_hw_cache_invalidation; } - tt_metal::DispatchCoreConfig get_dispatch_core_config() const; + tt_metal::DispatchCoreConfig get_dispatch_core_config(); inline bool get_skip_deleting_built_cache() { return skip_deleting_built_cache; } diff --git a/tt_metal/fabric/CMakeLists.txt b/tt_metal/fabric/CMakeLists.txt index 3b64bf2cac3..460e623bd4a 100644 --- a/tt_metal/fabric/CMakeLists.txt +++ b/tt_metal/fabric/CMakeLists.txt @@ -1,22 +1,10 @@ -add_library(fabric OBJECT) -add_library(TT::Metalium::Fabric ALIAS fabric) - -target_sources( - fabric - PRIVATE - control_plane.cpp - routing_table_generator.cpp - mesh_graph.cpp +set(FABRIC_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/control_plane.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/routing_table_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh_graph.cpp ) -target_sources( - fabric - PUBLIC - FILE_SET jit_api - TYPE HEADERS - BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} - FILES hw/inc/tt_fabric_api.h hw/inc/tt_fabric.h hw/inc/tt_fabric_interface.h hw/inc/eth_chan_noc_mapping.h -) +add_library(fabric OBJECT ${FABRIC_SRC}) target_include_directories(fabric PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) @@ -45,13 +33,3 @@ target_precompile_headers( ) target_compile_options(fabric PRIVATE -Wno-int-to-pointer-cast) - -install( - TARGETS - fabric - FILE_SET - jit_api - DESTINATION - ${CMAKE_INSTALL_LIBEXECDIR}/tt-metalium/tt_metal/fabric # FIXME: fix the include paths for jit_build - COMPONENT metalium-runtime -) diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index a478891cd5f..db78ed6d2cb 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -41,7 +41,6 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/kernel_config/demux.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/kernel_config/eth_router.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/kernel_config/eth_tunneler.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/kernel_config/fabric_router_vc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/util/dispatch_settings.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/dprint_server.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/noc_logging.cpp diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2bf2716373e..080710bbbe6 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -802,6 +802,8 @@ void Device::clear_l1_state() { void Device::compile_command_queue_programs() { ZoneScoped; + auto command_queue_program_ptr = std::make_unique(); + auto mmio_command_queue_program_ptr = std::make_unique(); if (this->is_mmio_capable()) { auto command_queue_program_ptr = create_and_compile_cq_program(this); this->command_queue_programs_.push_back(std::move(command_queue_program_ptr)); @@ -947,13 +949,6 @@ void Device::init_fabric() { // Note: the l1_barrier below is needed to be sure writes to cores that // don't get the GO mailbox (eg, storage cores) have all landed tt::Cluster::instance().l1_barrier(this->id()); - const routing_info_t routing_info_enabled = { - .routing_enabled = 1, - .src_sent_valid_cmd = 0, - .dst_acked_valid_cmd = 0, - }; - uint32_t routing_info_addr = tt::tt_metal::hal.get_dev_addr( - tt::tt_metal::HalProgrammableCoreType::ACTIVE_ETH, tt::tt_metal::HalL1MemAddrType::APP_ROUTING_INFO); std::vector> logical_cores_used_in_program = fabric_program_->logical_cores(); for (uint32_t programmable_core_type_index = 0; programmable_core_type_index < logical_cores_used_in_program.size(); programmable_core_type_index++) { @@ -967,12 +962,6 @@ void Device::init_fabric() { auto physical_core = this->virtual_core_from_logical_core(logical_core, core_type); tt::llrt::write_launch_msg_to_core( this->id(), physical_core, msg, go_msg, this->get_dev_addr(physical_core, HalL1MemAddrType::LAUNCH)); - if (core_type == CoreType::ETH) { - tt_cxy_pair virtual_eth_core(this->id(), physical_core); - // Enable internal ethernet routing - tt::Cluster::instance().write_core( - (void*)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); - } } } } diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 973429efed4..9b1d8302f5e 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -253,10 +253,11 @@ void DevicePool::initialize( _inst->init_firmware_on_active_devices(); tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true, target_mmio_ids); + _inst->wait_for_fabric_master_router_sync(); _inst->init_profiler_devices(); } -void DevicePool::initialize_host(IDevice* dev) const { +void DevicePool::initialize_device(IDevice* dev) const { detail::ClearProfilerControlBuffer(dev); // Create system memory writer for this device to have an associated interface to hardware command queue (i.e. @@ -280,55 +281,16 @@ void DevicePool::initialize_host(IDevice* dev) const { dev->initialize_and_launch_firmware(); watcher_attach(dev); -} - -void DevicePool::initialize_active_devices() const { - const auto& active_devices = this->get_all_active_devices(); // MMIO - // Activate fabric (must be before FD) // TODO: add handling of EDM if (tt::Cluster::instance().get_fabric_config() == FabricConfig::FABRIC_2D) { - // Initialize control plane, does not configure kernels/routing tables - // We always need a control plane for mapping of logical devices to physical devices - // TODO: add single device support - _inst->initialize_control_plane(); // not const - // write routing tables to all ethernet cores - // TODO: writing to device normally goes through cluster - this->control_plane->configure_routing_tables(); // Initialize fabric on mmio device - for (const auto& dev : active_devices) { - dev->init_fabric(); - } - _inst->wait_for_fabric_master_router_sync(); - } - - // Activate FD kernels - // Remaining steps are for setting up FD - if (!this->using_fast_dispatch) { - return; + dev->init_fabric(); } - populate_cq_static_args(active_devices); - - for (const auto& dev : active_devices) { - // For Galaxy init, we only need to loop over mmio devices - const auto& mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(dev->id()); - if (mmio_device_id != dev->id()) { - continue; - } - - auto tunnels_from_mmio = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id); + // Set up HW command queues on device for FD + if (this->using_fast_dispatch) { dev->init_command_queue_device(); - if (not this->skip_remote_devices) { - for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) { - // Need to create devices from farthest to the closest. - for (uint32_t ts = tunnels_from_mmio[t].size() - 1; ts > 0; ts--) { - uint32_t mmio_controlled_device_id = tunnels_from_mmio[t][ts]; - auto device = get_device(mmio_controlled_device_id); - device->init_command_queue_device(); - } - } - } } } @@ -432,6 +394,15 @@ void DevicePool::add_devices_to_pool(const std::vector& device_ids) { } } + // TODO: add handling of EDM + if (tt::Cluster::instance().get_fabric_config() == FabricConfig::FABRIC_2D) { + // Initialize control plane, does not configure kernels/routing tables + // We always need a control plane for mapping of logical devices to physical devices + // TODO: add single device support + _inst->initialize_control_plane(); + // write routing tables to all ethernet cores + this->control_plane->configure_routing_tables(); + } this->using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); if (this->using_fast_dispatch) { populate_fd_kernels(devices_to_activate, this->num_hw_cqs); @@ -503,8 +474,7 @@ void DevicePool::unregister_worker_thread_for_device(IDevice* device) { const std::unordered_set& DevicePool::get_worker_thread_ids() const { return this->worker_thread_ids; } void DevicePool::init_firmware_on_active_devices() const { - const auto& active_devices = this->get_all_active_devices(); - for (const auto& dev : active_devices) { + for (const auto& dev : this->get_all_active_devices()) { // For Galaxy init, we only need to loop over mmio devices const auto& mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(dev->id()); if (mmio_device_id != dev->id()) { @@ -527,7 +497,7 @@ void DevicePool::init_firmware_on_active_devices() const { tt::Cluster::instance().get_device_tunnel_depth(mmio_device_id)); auto tunnels_from_mmio = tt::Cluster::instance().get_tunnels_from_mmio_device(mmio_device_id); - this->initialize_host(dev); + this->initialize_device(dev); if (not this->skip_remote_devices) { for (uint32_t t = 0; t < tunnels_from_mmio.size(); t++) { // Need to create devices from farthest to the closest. @@ -535,13 +505,11 @@ void DevicePool::init_firmware_on_active_devices() const { uint32_t mmio_controlled_device_id = tunnels_from_mmio[t][ts]; log_debug(tt::LogMetal, "Tunnel {} Device {} Tunnel Stop: {}", t, mmio_controlled_device_id, ts); auto device = get_device(mmio_controlled_device_id); - this->initialize_host(device); + this->initialize_device(device); } } } } - - this->initialize_active_devices(); } void DevicePool::initialize_control_plane() { @@ -563,9 +531,7 @@ void DevicePool::initialize_control_plane() { this->control_plane = std::make_unique(mesh_graph_desc_path.string()); } -tt::tt_fabric::ControlPlane* DevicePool::get_control_plane() const { - return this->control_plane.get(); -} // TODO: Don't use get to expose the raw pointer +tt::tt_fabric::ControlPlane* DevicePool::get_control_plane() const { return this->control_plane.get(); } DevicePool::DevicePool() { ZoneScoped; diff --git a/tt_metal/impl/dispatch/kernel_config/demux.cpp b/tt_metal/impl/dispatch/kernel_config/demux.cpp index b03a33e30fb..acd79eeeb2e 100644 --- a/tt_metal/impl/dispatch/kernel_config/demux.cpp +++ b/tt_metal/impl/dispatch/kernel_config/demux.cpp @@ -23,7 +23,7 @@ void DemuxKernel::GenerateStaticConfigs() { static_config_.rx_queue_size_words = 0x10000 >> 4; static_config_.demux_fan_out = downstream_kernels_.size(); - static_config_.remote_rx_network_type = tt::packet_queue::DispatchRemoteNetworkType::NOC0; + static_config_.remote_rx_network_type = DispatchRemoteNetworkType::NOC0; static_config_.test_results_buf_addr_arg = 0; static_config_.test_results_buf_size_bytes = 0; @@ -32,7 +32,7 @@ void DemuxKernel::GenerateStaticConfigs() { for (int idx = 0; idx < downstream_kernels_.size(); idx++) { FDKernel* k = downstream_kernels_[idx]; static_config_.remote_tx_queue_id[idx] = 0; - static_config_.remote_tx_network_type[idx] = (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + static_config_.remote_tx_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; static_config_.output_depacketize_cb_log_page_size[idx] = DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE; static_config_.output_depacketize_local_sem_id[idx] = tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType()); @@ -61,7 +61,7 @@ void DemuxKernel::GenerateDependentConfigs() { } // Downstream, expect DISPATCH_H or DEMUX - TT_ASSERT(downstream_kernels_.size() <= tt::packet_queue::MAX_SWITCH_FAN_OUT && downstream_kernels_.size() > 0); + TT_ASSERT(downstream_kernels_.size() <= MAX_SWITCH_FAN_OUT && downstream_kernels_.size() > 0); dependent_config_.output_depacketize = 0; // Populated per downstream kernel for (int idx = 0; idx < downstream_kernels_.size(); idx++) { FDKernel* k = downstream_kernels_[idx]; @@ -71,14 +71,17 @@ void DemuxKernel::GenerateDependentConfigs() { if (auto dispatch_kernel = dynamic_cast(k)) { dependent_config_.remote_tx_queue_start_addr_words[idx] = dispatch_kernel->GetStaticConfig().dispatch_cb_base.value() >> 4; - dependent_config_.remote_tx_queue_size_words[idx] = dispatch_kernel->GetDispatchBufferSize() >> 4; + dependent_config_.remote_tx_queue_size_words[idx] = + ((1 << dispatch_kernel->GetStaticConfig().dispatch_cb_log_page_size.value()) * + dispatch_kernel->GetStaticConfig().dispatch_cb_pages.value()) >> + 4; dependent_config_.output_depacketize = dependent_config_.output_depacketize.value() | (1 << idx); // Only depacketize for dispatch downstream dependent_config_.output_depacketize_downstream_sem_id[idx] = dispatch_kernel->GetStaticConfig().my_dispatch_cb_sem_id; uint32_t dest_map_array[4] = {0, 1, 2, 3}; // TODO: how to set these generically? Currently just matching // the hard-coded previous implementation - uint64_t dest_endpoint_output_map = tt::packet_queue::packet_switch_dest_pack(dest_map_array, 4); + uint64_t dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); dependent_config_.dest_endpoint_output_map_hi = (uint32_t)(dest_endpoint_output_map >> 32); dependent_config_.dest_endpoint_output_map_lo = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); } else if (auto demux_kernel = dynamic_cast(k)) { @@ -89,10 +92,10 @@ void DemuxKernel::GenerateDependentConfigs() { if (device_->num_hw_cqs() == 1) { uint32_t dest_map_array[4] = {0, 0, 1, 1}; // TODO: how to set these generically? Currently just // matching the hard-coded previous implementation - dest_endpoint_output_map = tt::packet_queue::packet_switch_dest_pack(dest_map_array, 4); + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); } else { uint32_t dest_map_array[8] = {0, 0, 0, 0, 1, 1, 1, 1}; - dest_endpoint_output_map = tt::packet_queue::packet_switch_dest_pack(dest_map_array, 8); + dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 8); } dependent_config_.dest_endpoint_output_map_hi = (uint32_t)(dest_endpoint_output_map >> 32); dependent_config_.dest_endpoint_output_map_lo = (uint32_t)(dest_endpoint_output_map & 0xFFFFFFFF); @@ -135,7 +138,7 @@ void DemuxKernel::CreateKernel() { 0, 0 // Populate output_depacketize_config after }; - for (int idx = 0; idx < tt::packet_queue::MAX_SWITCH_FAN_OUT; idx++) { + for (int idx = 0; idx < MAX_SWITCH_FAN_OUT; idx++) { if (dependent_config_.remote_tx_x[idx]) { compile_args[4 + idx] |= (dependent_config_.remote_tx_x[idx].value() & 0xFF); compile_args[4 + idx] |= (dependent_config_.remote_tx_y[idx].value() & 0xFF) << 8; diff --git a/tt_metal/impl/dispatch/kernel_config/demux.hpp b/tt_metal/impl/dispatch/kernel_config/demux.hpp index 26c82c18fde..06f291171a8 100644 --- a/tt_metal/impl/dispatch/kernel_config/demux.hpp +++ b/tt_metal/impl/dispatch/kernel_config/demux.hpp @@ -10,28 +10,23 @@ typedef struct demux_static_config { std::optional rx_queue_size_words; std::optional demux_fan_out; - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_queue_id; // [4:7] - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_network_type; // [4:7] + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_id; // [4:7] + std::array, MAX_SWITCH_FAN_OUT> remote_tx_network_type; // [4:7] std::optional remote_rx_network_type; std::optional test_results_buf_addr_arg; std::optional test_results_buf_size_bytes; std::optional timeout_cycles; - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_cb_log_page_size; // [26:29] - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_local_sem_id; // [26:29] - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_remove_header; // [26:29] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_cb_log_page_size; // [26:29] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_local_sem_id; // [26:29] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_remove_header; // [26:29] } demux_static_config_t; typedef struct demux_dependent_config { - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_x; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_y; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - remote_tx_queue_start_addr_words; // [8:2:14], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - remote_tx_queue_size_words; // [9:2:15], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_x; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_y; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_start_addr_words; // [8:2:14], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_size_words; // [9:2:15], dependent std::optional remote_rx_x; // Dependent std::optional remote_rx_y; // Dependent std::optional remote_rx_queue_id; // Dependent @@ -39,8 +34,7 @@ typedef struct demux_dependent_config { std::optional dest_endpoint_output_map_hi; // Dependent std::optional dest_endpoint_output_map_lo; // Dependent std::optional output_depacketize; // Dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_downstream_sem_id; // [26:29], dependent + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_downstream_sem_id; // [26:29], dependent } demux_dependent_config_t; class DemuxKernel : public FDKernel { diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp index d40f8f0fb1a..6235957f70e 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 #include "dispatch.hpp" -#include "assert.hpp" -#include "hal.hpp" #include "prefetch.hpp" #include "dispatch_s.hpp" #include "demux.hpp" @@ -11,7 +9,6 @@ #include #include -#include "rtoptions.hpp" #include "tt_metal/impl/dispatch/dispatch_query_manager.hpp" #include @@ -142,7 +139,7 @@ void DispatchKernel::GenerateStaticConfigs() { my_dispatch_constants.mux_buffer_pages(device_->num_hw_cqs()), GetCoreType()); // Apparently unused - static_config_.split_dispatch_page_preamble_size = sizeof(tt::packet_queue::dispatch_packet_header_t); + static_config_.split_dispatch_page_preamble_size = sizeof(dispatch_packet_header_t); static_config_.prefetch_h_max_credits = my_dispatch_constants.mux_buffer_pages(device_->num_hw_cqs()); static_config_.packed_write_max_unicast_sub_cmds = @@ -210,22 +207,15 @@ void DispatchKernel::GenerateDependentConfigs() { dependent_config_.downstream_cb_sem_id = UNUSED_SEM_ID; // Unused } else if (static_config_.is_h_variant.value()) { // Upstream, expect DEMUX - // Or direct connection to DISPATCH_D if using fabric TT_ASSERT(upstream_kernels_.size() == 1); - if (auto demux_kernel = dynamic_cast(upstream_kernels_[0])) { - dependent_config_.upstream_logical_core = demux_kernel->GetLogicalCore(); - int demux_idx = - demux_kernel->GetDownstreamPort(this); // Need to know which port this kernel connects to upstream - dependent_config_.upstream_dispatch_cb_sem_id = - demux_kernel->GetStaticConfig().output_depacketize_local_sem_id[demux_idx].value(); - dependent_config_.upstream_sync_sem = 0; // Unused - } else if (auto dispatch_d = dynamic_cast(upstream_kernels_[0])) { - dependent_config_.upstream_logical_core = dispatch_d->GetLogicalCore(); - dependent_config_.upstream_dispatch_cb_sem_id = dispatch_d->GetStaticConfig().my_dispatch_cb_sem_id.value(); - dependent_config_.upstream_sync_sem = 0; // Unused - } else { - TT_FATAL(false, "Unimplemented path"); - } + auto demux_kernel = dynamic_cast(upstream_kernels_[0]); + TT_ASSERT(demux_kernel); + dependent_config_.upstream_logical_core = demux_kernel->GetLogicalCore(); + int demux_idx = + demux_kernel->GetDownstreamPort(this); // Need to know which port this kernel connects to upstream + dependent_config_.upstream_dispatch_cb_sem_id = + demux_kernel->GetStaticConfig().output_depacketize_local_sem_id[demux_idx].value(); + dependent_config_.upstream_sync_sem = 0; // Unused // Downstream, no official downstream core but use the field to connect is to the PREFETCH_H that we need to // write to when resuming sending of commands post exec_buf stall. @@ -264,56 +254,33 @@ void DispatchKernel::GenerateDependentConfigs() { prefetch_kernel->GetStaticConfig().my_downstream_cb_sem_id.value(); } - // Downstream, expect a MUX_D - // Or direct connection to DISPATCH_H if using fabric - // - // + A Dispatch_s if enabled + // Downstream, expect a MUX_D and a DISPATCH_S if enabled auto dispatch_s_kernel = dynamic_cast(downstream_kernels_[0]); auto mux_kernel = dynamic_cast(downstream_kernels_[0]); - - bool found_dispatch_s = false; - bool found_mux = false; - bool found_dispatch_h = false; - for (auto ds_kernel : downstream_kernels_) { - if (auto dispatch_s_kernel = dynamic_cast(ds_kernel)) { - dependent_config_.downstream_s_logical_core = dispatch_s_kernel->GetLogicalCore(); - found_dispatch_s = true; - } else if (auto mux_kernel = dynamic_cast(ds_kernel)) { - dependent_config_.downstream_logical_core = mux_kernel->GetLogicalCore(); - // Some configs depend on which port this kernel connects to on the downstream kernel - int dispatch_d_idx = - mux_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream - dependent_config_.downstream_cb_size = mux_kernel->GetStaticConfig().rx_queue_size_words.value() << 4; - // MUX queue id is "dependent_config_.downstream_cb_size.value()" - // The address for that queue starts at "rx_queue_start_addr_words + i*rx_queue_size_words" (based on - // kernel code) - dependent_config_.downstream_cb_base = - (mux_kernel->GetStaticConfig().rx_queue_start_addr_words.value() << 4) + - dispatch_d_idx * dependent_config_.downstream_cb_size.value(); - dependent_config_.downstream_cb_sem_id = dispatch_d_idx; - found_mux = true; - } else if (auto dispatch_h_kernel = dynamic_cast(ds_kernel)) { - dependent_config_.downstream_logical_core = dispatch_h_kernel->GetLogicalCore(); - dependent_config_.downstream_cb_size = dispatch_h_kernel->GetDispatchBufferSize(); - dependent_config_.downstream_cb_base = dispatch_h_kernel->GetStaticConfig().dispatch_cb_base.value(); - dependent_config_.downstream_cb_sem_id = - dispatch_h_kernel->GetStaticConfig().my_dispatch_cb_sem_id.value(); - found_dispatch_h = true; - } else { - TT_FATAL(false, "Unexpected downstream kernel for dispatch_d"); + if (DispatchQueryManager::instance().dispatch_s_enabled()) { + TT_ASSERT(downstream_kernels_.size() == 2); + mux_kernel = dynamic_cast(downstream_kernels_[1]); + if (!dispatch_s_kernel) { + dispatch_s_kernel = dynamic_cast(downstream_kernels_[1]); + mux_kernel = dynamic_cast(downstream_kernels_[0]); } - } - - TT_FATAL( - !DispatchQueryManager::instance().dispatch_s_enabled() || found_dispatch_s, - "dispatch_d is missing dispatch_s downstream"); - TT_FATAL( - found_mux || found_dispatch_h, - "Path not implemented for dispatch_d. Either a mux or dispatch_h in downstream is required"); - - if (!found_dispatch_s) { + TT_ASSERT(dispatch_s_kernel); + dependent_config_.downstream_s_logical_core = dispatch_s_kernel->GetLogicalCore(); + } else { + TT_ASSERT(downstream_kernels_.size() == 1); dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE; } + TT_ASSERT(mux_kernel); + dependent_config_.downstream_logical_core = mux_kernel->GetLogicalCore(); + // Some configs depend on which port this kernel connects to on the downstream kernel + int dispatch_d_idx = mux_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream + dependent_config_.downstream_cb_size = mux_kernel->GetStaticConfig().rx_queue_size_words.value() << 4; + // MUX queue id is "dependent_config_.downstream_cb_size.value()" + // The address for that queue starts at "rx_queue_start_addr_words + i*rx_queue_size_words" (based on kernel + // code) + dependent_config_.downstream_cb_base = (mux_kernel->GetStaticConfig().rx_queue_start_addr_words.value() << 4) + + dispatch_d_idx * dependent_config_.downstream_cb_size.value(); + dependent_config_.downstream_cb_sem_id = dispatch_d_idx; } else { TT_FATAL(false, "DispatchKernel must be one of (or both) H and D variants"); } @@ -356,17 +323,10 @@ void DispatchKernel::CreateKernel() { static_config_.dev_completion_q_wr_ptr.value(), static_config_.dev_completion_q_rd_ptr.value(), - dependent_config_.downstream_mesh_id.value_or(0), - dependent_config_.downstream_chip_id.value_or(0), - dependent_config_.upstream_mesh_id.value_or(0), - dependent_config_.upstream_chip_id.value_or(0), - dependent_config_.fabric_router_noc_xy.value_or(0), - static_config_.client_interface_addr.value_or(0), - static_config_.is_d_variant.value(), static_config_.is_h_variant.value(), }; - TT_ASSERT(compile_args.size() == 37); + TT_ASSERT(compile_args.size() == 31); auto my_virtual_core = device_->virtual_core_from_logical_core(logical_core_, GetCoreType()); auto upstream_virtual_core = device_->virtual_core_from_logical_core(dependent_config_.upstream_logical_core.value(), GetCoreType()); @@ -424,20 +384,3 @@ void DispatchKernel::ConfigureCore() { detail::WriteToDeviceL1(device_, logical_core_, completion_q1_last_event_ptr, zero, GetCoreType()); } } - -void DispatchKernel::UpdateArgsForFabric( - const CoreCoord& fabric_router_virtual, - tt::tt_fabric::mesh_id_t upstream_mesh_id, - chip_id_t upstream_chip_id, - tt::tt_fabric::mesh_id_t downstream_mesh_id, - chip_id_t downstream_chip_id) { - dependent_config_.fabric_router_noc_xy = - tt::tt_metal::hal.noc_xy_encoding(fabric_router_virtual.x, fabric_router_virtual.y); - dependent_config_.upstream_mesh_id = upstream_mesh_id; - dependent_config_.upstream_chip_id = upstream_chip_id; - dependent_config_.downstream_mesh_id = downstream_mesh_id; - dependent_config_.downstream_chip_id = downstream_chip_id; - auto& my_dispatch_constants = DispatchMemMap::get(GetCoreType()); - static_config_.client_interface_addr = - my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::FABRIC_INTERFACE); -} diff --git a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp index 7322fc4ab23..31884b8159e 100644 --- a/tt_metal/impl/dispatch/kernel_config/dispatch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/dispatch.hpp @@ -2,9 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "core_coord.hpp" #include "fd_kernel.hpp" -#include "mesh_graph.hpp" typedef struct dispatch_static_config { std::optional dispatch_cb_base; // 0 @@ -36,9 +34,6 @@ typedef struct dispatch_static_config { std::optional is_d_variant; std::optional is_h_variant; - - // Populated if fabric is being used to talk to downstream - std::optional client_interface_addr; } dispatch_static_config_t; typedef struct dispatch_dependent_config { @@ -57,13 +52,6 @@ typedef struct dispatch_dependent_config { std::optional split_prefetch; // If upstream is NOT a prefetch_HD std::optional prefetch_h_noc_xy; // Dependent. Used if split_prefetch is true std::optional prefetch_h_local_downstream_sem_addr; // Dependent. Used if split_prefetch is true - - // Populated if fabric is being used to talk to downstream - std::optional fabric_router_noc_xy; - std::optional upstream_mesh_id; - std::optional upstream_chip_id; - std::optional downstream_mesh_id; - std::optional downstream_chip_id; } dispatch_dependent_config_t; class DispatchKernel : public FDKernel { @@ -96,25 +84,10 @@ class DispatchKernel : public FDKernel { this->logical_core_ = core_manager.dispatcher_d_core(device_id, channel, cq_id); } } - void CreateKernel() override; - void GenerateStaticConfigs() override; - void GenerateDependentConfigs() override; - void ConfigureCore() override; - - void UpdateArgsForFabric( - const CoreCoord& fabric_router, - tt::tt_fabric::mesh_id_t src_mesh_id, - chip_id_t src_chip_id, - tt::tt_fabric::mesh_id_t dst_mesh_id, - chip_id_t dst_chip_id) override; - - uint32_t GetDispatchBufferSize() const { - return (1 << static_config_.dispatch_cb_log_page_size.value()) * static_config_.dispatch_cb_pages.value(); - } const dispatch_static_config_t& GetStaticConfig() { return static_config_; } private: diff --git a/tt_metal/impl/dispatch/kernel_config/eth_router.cpp b/tt_metal/impl/dispatch/kernel_config/eth_router.cpp index 445d57009d7..9f5b704a248 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_router.cpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_router.cpp @@ -83,7 +83,7 @@ void EthRouterKernel::GenerateStaticConfigs() { void EthRouterKernel::GenerateDependentConfigs() { if (as_mux_) { // Upstream, expect PRETETCH_Hs - TT_ASSERT(upstream_kernels_.size() <= tt::packet_queue::MAX_SWITCH_FAN_IN && upstream_kernels_.size() > 0); + TT_ASSERT(upstream_kernels_.size() <= MAX_SWITCH_FAN_IN && upstream_kernels_.size() > 0); // Downstream, expect US_TUNNELER_REMOTE TT_ASSERT(downstream_kernels_.size() == 1); @@ -96,25 +96,24 @@ void EthRouterKernel::GenerateDependentConfigs() { TT_ASSERT(prefetch_kernel); dependent_config_.remote_tx_x[idx] = tunneler_kernel->GetVirtualCore().x; dependent_config_.remote_tx_y[idx] = tunneler_kernel->GetVirtualCore().y; - dependent_config_.remote_tx_queue_id[idx] = idx + tt::packet_queue::MAX_SWITCH_FAN_IN * router_id; - dependent_config_.remote_tx_network_type[idx] = (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_tx_queue_id[idx] = idx + MAX_SWITCH_FAN_IN * router_id; + dependent_config_.remote_tx_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; dependent_config_.remote_tx_queue_start_addr_words[idx] = tunneler_kernel->GetStaticConfig().in_queue_start_addr_words.value() + - (idx + router_id * tt::packet_queue::MAX_SWITCH_FAN_IN) * - tunneler_kernel->GetStaticConfig().in_queue_size_words.value(); + (idx + router_id * MAX_SWITCH_FAN_IN) * tunneler_kernel->GetStaticConfig().in_queue_size_words.value(); dependent_config_.remote_tx_queue_size_words[idx] = tunneler_kernel->GetStaticConfig().in_queue_size_words.value(); dependent_config_.remote_rx_x[idx] = prefetch_kernel->GetVirtualCore().x; dependent_config_.remote_rx_y[idx] = prefetch_kernel->GetVirtualCore().y; - dependent_config_.remote_rx_network_type[idx] = (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_rx_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; dependent_config_.input_packetize_upstream_sem[idx] = prefetch_kernel->GetStaticConfig().my_downstream_cb_sem_id.value(); } - uint32_t src_id_start = 0xA1 + router_id * tt::packet_queue::MAX_SWITCH_FAN_IN; - uint32_t dst_id_start = 0xB1 + router_id * tt::packet_queue::MAX_SWITCH_FAN_IN; + uint32_t src_id_start = 0xA1 + router_id * MAX_SWITCH_FAN_IN; + uint32_t dst_id_start = 0xB1 + router_id * MAX_SWITCH_FAN_IN; dependent_config_.input_packetize_src_endpoint = { src_id_start, src_id_start + 1, src_id_start + 2, src_id_start + 3}; dependent_config_.input_packetize_dst_endpoint = { @@ -130,11 +129,11 @@ void EthRouterKernel::GenerateDependentConfigs() { dependent_config_.remote_rx_y[idx] = us_tunneler_kernel->GetVirtualCore().y; // Queue id starts counting after the input VCs dependent_config_.remote_rx_queue_id[idx] = us_tunneler_kernel->GetRouterQueueIdOffset(this, false) + idx; - dependent_config_.remote_rx_network_type[idx] = (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_rx_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; } // Downstream, expect PREFETCH_D/US_TUNNELER_REMOTE - TT_ASSERT(downstream_kernels_.size() <= tt::packet_queue::MAX_SWITCH_FAN_OUT && downstream_kernels_.size() > 0); + TT_ASSERT(downstream_kernels_.size() <= MAX_SWITCH_FAN_OUT && downstream_kernels_.size() > 0); std::vector prefetch_kernels; EthTunnelerKernel* ds_tunneler_kernel = nullptr; for (auto k : downstream_kernels_) { @@ -153,8 +152,7 @@ void EthRouterKernel::GenerateDependentConfigs() { dependent_config_.remote_tx_x[remote_idx] = prefetch_kernel->GetVirtualCore().x; dependent_config_.remote_tx_y[remote_idx] = prefetch_kernel->GetVirtualCore().y; dependent_config_.remote_tx_queue_id[remote_idx] = 0; // Prefetch queue id always 0 - dependent_config_.remote_tx_network_type[remote_idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_tx_network_type[remote_idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; dependent_config_.remote_tx_queue_start_addr_words[remote_idx] = prefetch_kernel->GetStaticConfig().cmddat_q_base.value() >> 4; dependent_config_.remote_tx_queue_size_words[remote_idx] = @@ -172,8 +170,7 @@ void EthRouterKernel::GenerateDependentConfigs() { dependent_config_.remote_tx_y[remote_idx] = ds_tunneler_kernel->GetVirtualCore().y; dependent_config_.remote_tx_queue_id[remote_idx] = ds_tunneler_kernel->GetRouterQueueIdOffset(this, true) + idx; - dependent_config_.remote_tx_network_type[remote_idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_tx_network_type[remote_idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; dependent_config_.remote_tx_queue_start_addr_words[remote_idx] = ds_tunneler_kernel->GetStaticConfig().in_queue_start_addr_words.value() + ds_tunneler_kernel->GetStaticConfig().in_queue_size_words.value() * @@ -233,7 +230,7 @@ void EthRouterKernel::CreateKernel() { compile_args[0] = 0xB1; // compile_args[21] = 84; } - for (int idx = 0; idx < tt::packet_queue::MAX_SWITCH_FAN_OUT; idx++) { + for (int idx = 0; idx < MAX_SWITCH_FAN_OUT; idx++) { if (dependent_config_.remote_tx_x[idx]) { compile_args[4 + idx] |= (dependent_config_.remote_tx_x[idx].value() & 0xFF); compile_args[4 + idx] |= (dependent_config_.remote_tx_y[idx].value() & 0xFF) << 8; @@ -255,7 +252,7 @@ void EthRouterKernel::CreateKernel() { } } } - for (int idx = 0; idx < tt::packet_queue::MAX_SWITCH_FAN_IN; idx++) { + for (int idx = 0; idx < MAX_SWITCH_FAN_IN; idx++) { if (dependent_config_.remote_rx_x[idx]) { compile_args[16 + idx] |= (dependent_config_.remote_rx_x[idx].value() & 0xFF); compile_args[16 + idx] |= (dependent_config_.remote_rx_y[idx].value() & 0xFF) << 8; diff --git a/tt_metal/impl/dispatch/kernel_config/eth_router.hpp b/tt_metal/impl/dispatch/kernel_config/eth_router.hpp index 532fe698ae2..8915044f221 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_router.hpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_router.hpp @@ -14,39 +14,31 @@ typedef struct eth_router_static_config { std::optional kernel_status_buf_size_bytes; std::optional timeout_cycles; - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_log_page_size; // [26:29] - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> output_depacketize_local_sem; // [26:29] - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_remove_header; // [26:29] - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize; // [30:33] - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_log_page_size; // [30:33] - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_local_sem; // [30:33] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_log_page_size; // [26:29] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_local_sem; // [26:29] + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_remove_header; // [26:29] + std::array, MAX_SWITCH_FAN_IN> input_packetize; // [30:33] + std::array, MAX_SWITCH_FAN_IN> input_packetize_log_page_size; // [30:33] + std::array, MAX_SWITCH_FAN_IN> input_packetize_local_sem; // [30:33] } eth_router_static_config_t; typedef struct eth_router_dependent_config { - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_x; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_y; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> remote_tx_queue_id; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - remote_tx_network_type; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - remote_tx_queue_start_addr_words; // [8:2:14], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - remote_tx_queue_size_words; // [9:2:15], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_x; // [16:19], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_y; // [16:19], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_queue_id; // [16:19], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> - remote_rx_network_type; // [17:19], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_x; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_y; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_id; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_network_type; // [4:7], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_start_addr_words; // [8:2:14], dependent + std::array, MAX_SWITCH_FAN_OUT> remote_tx_queue_size_words; // [9:2:15], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_x; // [16:19], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_y; // [16:19], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_queue_id; // [16:19], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_network_type; // [17:19], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> output_depacketize; // 25, dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_OUT> - output_depacketize_downstream_sem; // [26:29], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> - input_packetize_upstream_sem; // [30:33], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_src_endpoint; // Dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_dst_endpoint; // Dependent + std::array, MAX_SWITCH_FAN_OUT> output_depacketize; // 25, dependent + std::array, MAX_SWITCH_FAN_OUT> output_depacketize_downstream_sem; // [26:29], dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize_upstream_sem; // [30:33], dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize_src_endpoint; // Dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize_dst_endpoint; // Dependent } eth_router_dependent_config_t; class EthRouterKernel : public FDKernel { diff --git a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp index 0635b340570..2f5d91352e2 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.cpp @@ -80,8 +80,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { // kernels dependent_config_.remote_sender_queue_id[remote_idx] = router_vc_count + idx + router_vc_count - router_fwd_vc_count; - dependent_config_.remote_sender_network_type[remote_idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_sender_network_type[remote_idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; remote_idx++; } } @@ -91,7 +90,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { dependent_config_.remote_sender_queue_id[this->static_config_.vc_count.value() - 1] = this->static_config_.vc_count.value() * 2 - 1; dependent_config_.remote_sender_network_type[this->static_config_.vc_count.value() - 1] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::ETH; + (uint32_t)DispatchRemoteNetworkType::ETH; dependent_config_.inner_stop_mux_d_bypass = 0; // Downstream, we expect the same US_TUNNELER_LOCAL and a DEMUX (tunnel start)/MUX_D (non-tunnel start) @@ -108,8 +107,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { // Last VC is the return VC, driving a DEMUX or MUX_D dependent_config_.remote_receiver_x[idx] = other_ds_kernel->GetVirtualCore().x; dependent_config_.remote_receiver_y[idx] = other_ds_kernel->GetVirtualCore().y; - dependent_config_.remote_receiver_network_type[idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_receiver_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; if (auto demux_kernel = dynamic_cast(other_ds_kernel)) { dependent_config_.remote_receiver_queue_start[idx] = demux_kernel->GetStaticConfig().rx_queue_start_addr_words; @@ -134,8 +132,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { dependent_config_.remote_receiver_y[idx] = paired_physical_coord.y; // Tunneler upstream queue ids start counting up from 0 dependent_config_.remote_receiver_queue_id[idx] = idx; - dependent_config_.remote_receiver_network_type[idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::ETH; + dependent_config_.remote_receiver_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::ETH; dependent_config_.remote_receiver_queue_start[idx] = static_config_.in_queue_start_addr_words.value() + idx * this->static_config_.in_queue_size_words.value(); @@ -168,15 +165,13 @@ void EthTunnelerKernel::GenerateDependentConfigs() { dependent_config_.remote_sender_y[idx] = mux_kernel->GetVirtualCore().y; // MUX output queue id is counted after all of it's inputs dependent_config_.remote_sender_queue_id[idx] = mux_kernel->GetStaticConfig().mux_fan_in.value(); - dependent_config_.remote_sender_network_type[idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_sender_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; } else { dependent_config_.remote_sender_x[idx] = paired_physical_coord.x; dependent_config_.remote_sender_y[idx] = paired_physical_coord.y; // Tunneler downstream queue ids start counting after the upstream ones dependent_config_.remote_sender_queue_id[idx] = this->static_config_.vc_count.value() + idx; - dependent_config_.remote_sender_network_type[idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::ETH; + dependent_config_.remote_sender_network_type[idx] = (uint32_t)DispatchRemoteNetworkType::ETH; } } @@ -202,8 +197,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { dependent_config_.remote_receiver_y[remote_idx] = router_kernel->GetVirtualCore().y; dependent_config_.remote_receiver_queue_id[remote_idx] = idx; // Queue ids start counting from 0 at input - dependent_config_.remote_receiver_network_type[remote_idx] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + dependent_config_.remote_receiver_network_type[remote_idx] = (uint32_t)DispatchRemoteNetworkType::NOC0; dependent_config_.remote_receiver_queue_start[remote_idx] = router_kernel->GetStaticConfig().rx_queue_start_addr_words.value() + idx * router_kernel->GetStaticConfig().rx_queue_size_words.value(); @@ -217,8 +211,7 @@ void EthTunnelerKernel::GenerateDependentConfigs() { dependent_config_.remote_receiver_x[return_vc_id] = paired_physical_coord.x; dependent_config_.remote_receiver_y[return_vc_id] = paired_physical_coord.y; dependent_config_.remote_receiver_queue_id[return_vc_id] = return_vc_id; - dependent_config_.remote_receiver_network_type[return_vc_id] = - (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::ETH; + dependent_config_.remote_receiver_network_type[return_vc_id] = (uint32_t)DispatchRemoteNetworkType::ETH; dependent_config_.remote_receiver_queue_start[return_vc_id] = static_config_.in_queue_start_addr_words.value() + (return_vc_id) * this->static_config_.in_queue_size_words.value(); @@ -285,7 +278,7 @@ void EthTunnelerKernel::CreateKernel() { static_config_.kernel_status_buf_size_bytes.value(), static_config_.timeout_cycles.value(), dependent_config_.inner_stop_mux_d_bypass.value()}; - for (int idx = 0; idx < tt::packet_queue::MAX_TUNNEL_LANES; idx++) { + for (int idx = 0; idx < MAX_TUNNEL_LANES; idx++) { if (dependent_config_.remote_receiver_x[idx]) { compile_args[4 + idx] |= (dependent_config_.remote_receiver_x[idx].value() & 0xFF); compile_args[4 + idx] |= (dependent_config_.remote_receiver_y[idx].value() & 0xFF) << 8; diff --git a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.hpp b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.hpp index c6703949af0..c21853ffc53 100644 --- a/tt_metal/impl/dispatch/kernel_config/eth_tunneler.hpp +++ b/tt_metal/impl/dispatch/kernel_config/eth_tunneler.hpp @@ -16,22 +16,16 @@ typedef struct eth_tunneler_static_config { } eth_tunneler_static_config_t; typedef struct eth_tunneler_dependent_config { - std::array, tt::packet_queue::MAX_TUNNEL_LANES> remote_receiver_x; // [4:13], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> remote_receiver_y; // [4:13], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_receiver_queue_id; // [4:13], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_receiver_network_type; // [4:13], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_receiver_queue_start; // [14:2:32], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_receiver_queue_size; // [15:2:33], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> remote_sender_x; // [34:43], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> remote_sender_y; // [34:43], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_sender_queue_id; // [34:43], dependent - std::array, tt::packet_queue::MAX_TUNNEL_LANES> - remote_sender_network_type; // [34:43], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_x; // [4:13], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_y; // [4:13], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_queue_id; // [4:13], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_network_type; // [4:13], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_queue_start; // [14:2:32], dependent + std::array, MAX_TUNNEL_LANES> remote_receiver_queue_size; // [15:2:33], dependent + std::array, MAX_TUNNEL_LANES> remote_sender_x; // [34:43], dependent + std::array, MAX_TUNNEL_LANES> remote_sender_y; // [34:43], dependent + std::array, MAX_TUNNEL_LANES> remote_sender_queue_id; // [34:43], dependent + std::array, MAX_TUNNEL_LANES> remote_sender_network_type; // [34:43], dependent std::optional inner_stop_mux_d_bypass; // Dependent } eth_tunneler_dependent_config_t; diff --git a/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.cpp b/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.cpp deleted file mode 100644 index 35461c2238d..00000000000 --- a/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.cpp +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include -#include -#include -#include "assert.hpp" -#include "dispatch/kernel_config/dispatch.hpp" -#include "dispatch/kernel_config/prefetch.hpp" - -#include "fabric_router_vc.hpp" - -namespace tt::tt_metal { - -void FabricRouterVC::GenerateStaticConfigs() {} - -void FabricRouterVC::GenerateDependentConfigs() { - // Provide router details to upstream and downstream kernels - TT_ASSERT( - upstream_kernels_.size() == downstream_kernels_.size(), - "Fabric Router VC requires upstream.size() == downstream.size()"); - const auto& control_plane = tt::DevicePool::instance().get_control_plane(); - TT_FATAL(control_plane, "Control plane is nullptr. Is fabric initialized yet?"); - - // Zip upstream and downstream kernels together - for (int i = 0; i < upstream_kernels_.size(); ++i) { - auto us_kernel = upstream_kernels_.at(i); - auto ds_kernel = downstream_kernels_.at(i); - - // Upstream can be PREFETCH_H or DISPATCH_D - // Downstream can be PREFETCH_D or DISPATCH_H - // 4 Combinations - const auto& [src_mesh_id, src_chip_id] = - control_plane->get_mesh_chip_id_from_physical_chip_id(us_kernel->GetDeviceId()); - const auto& [dst_mesh_id, dst_chip_id] = - control_plane->get_mesh_chip_id_from_physical_chip_id(ds_kernel->GetDeviceId()); - const auto& routers = control_plane->get_routers_to_chip(src_mesh_id, src_chip_id, dst_mesh_id, dst_chip_id); - const auto& [routing_plane, fabric_router] = routers.front(); - - const auto& routers_reversed = - control_plane->get_routers_to_chip(dst_mesh_id, dst_chip_id, src_mesh_id, src_chip_id); - const auto& [routing_plane_rev, fabric_router_rev] = routers_reversed.front(); - bool valid_path{false}; - - if (auto prefetch_us = dynamic_cast(us_kernel); - auto prefetch_ds = dynamic_cast(ds_kernel)) { - valid_path = true; - } - - if (auto dispatch_us = dynamic_cast(us_kernel); - auto dispatch_ds = dynamic_cast(ds_kernel)) { - valid_path = true; - } - - TT_FATAL(valid_path, "FabricRouterVC is not implemented for this path\n"); - - // Downstream path. src -> dst - us_kernel->UpdateArgsForFabric(fabric_router, src_mesh_id, src_chip_id, dst_mesh_id, dst_chip_id); - // Upstream path. dst -> src - ds_kernel->UpdateArgsForFabric(fabric_router_rev, dst_mesh_id, dst_chip_id, src_mesh_id, src_chip_id); - } -} - -void FabricRouterVC::CreateKernel() {} - -void FabricRouterVC::ConfigureCore() {} - -} // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.hpp b/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.hpp deleted file mode 100644 index 531b765f785..00000000000 --- a/tt_metal/impl/dispatch/kernel_config/fabric_router_vc.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "core_coord.hpp" -#include "fd_kernel.hpp" - -namespace tt::tt_metal { - -struct fabric_router_depedent_config {}; - -class FabricRouterVC : public FDKernel { -public: - FabricRouterVC(int node_id, chip_id_t device_id, chip_id_t servicing_device_id, uint8_t cq_id) : - FDKernel( - node_id, - device_id, - servicing_device_id, - cq_id, - {RISCV_0_default, RISCV_0_default, RISCV_0_default} /*Arbitrary*/) {} - - void CreateKernel() override; - void GenerateStaticConfigs() override; - void GenerateDependentConfigs() override; - void ConfigureCore() override; -}; - -} // namespace tt::tt_metal diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.cpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.cpp index f4e9974ab5e..83abbdb2ea9 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.cpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.cpp @@ -5,8 +5,6 @@ #include "fd_kernel.hpp" #include #include -#include "dispatch/kernel_config/fabric_router_vc.hpp" -#include "dispatch_core_common.hpp" #include "dprint_server.hpp" #include "kernel_types.hpp" @@ -95,7 +93,6 @@ FDKernel* FDKernel::Generate( return new EthRouterKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection, true); case PACKET_ROUTER_DEMUX: return new EthRouterKernel(node_id, device_id, servicing_device_id, cq_id, noc_selection, false); - case FABRIC_ROUTER_VC: return new tt::tt_metal::FabricRouterVC(node_id, device_id, servicing_device_id, cq_id); default: TT_FATAL(false, "Unrecognized dispatch kernel type: {}.", type); return nullptr; } } diff --git a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp index 03606016401..7b3c2ae6abe 100644 --- a/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp +++ b/tt_metal/impl/dispatch/kernel_config/fd_kernel.hpp @@ -5,8 +5,6 @@ #include #include -#include "core_coord.hpp" -#include "mesh_graph.hpp" #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_cluster.hpp" @@ -37,7 +35,6 @@ static std::vector dispatch_kernel_file_names = { "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp", // US_TUNNELER_REMOTE "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp", // PACKET_ROUTER_MUX "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp", // PACKET_ROUTER_DEMUX - "", // FABRIC_ROUTER_VC "" // COUNT }; @@ -51,7 +48,7 @@ class FDKernel { device_id_(device_id), servicing_device_id_(servicing_device_id), cq_id_(cq_id), - noc_selection_(noc_selection) {} + noc_selection_(noc_selection) {}; virtual ~FDKernel() = default; // Populate the static configs for this kernel (ones that do not depend on configs from other kernels), including @@ -62,21 +59,12 @@ class FDKernel { // after GenerateStaticConfigs for all upstream/downstream kernels. virtual void GenerateDependentConfigs() = 0; - // Use all configs and add this kernel to its Program. Called after GenerateStaticConfigs/GenerateDependentConfigs. + // Use all configs and add this kernel to its Program. Called agter GenerateStaticConfigs/GenerateDependentConfigs. virtual void CreateKernel() = 0; // Override for specific kernels that need host-side configureation (special values written to l1, etc.). Is called // after above functions and before FD kernels are launched. - virtual void ConfigureCore() {} - - // Override for specific kernels that can be configured for fabric. Will be called by the FABRIC_ROUTER_VC, which is - // an intermediary FDKernel for indicating a fabric router path needs to be found. - virtual void UpdateArgsForFabric( - const CoreCoord& fabric_router_virtual, - tt::tt_fabric::mesh_id_t upstream_mesh_id, - chip_id_t upstream_chip_id, - tt::tt_fabric::mesh_id_t downstream_mesh_id, - chip_id_t downstream_chip_id) {} + virtual void ConfigureCore() {}; // Generator function to create a kernel of a given type. New kernels need to be added here. static FDKernel* Generate( @@ -103,8 +91,10 @@ class FDKernel { // Get the port index for which a given kernel is upstream/downstream of this one int GetUpstreamPort(FDKernel* other) { return GetPort(other, this->upstream_kernels_); } int GetDownstreamPort(FDKernel* other) { return GetPort(other, this->downstream_kernels_); } - void AddDevice(tt::tt_metal::IDevice* device) { device_ = device; } - void AddProgram(tt::tt_metal::Program* program) { program_ = program; } + void AddDeviceAndProgram(tt::tt_metal::IDevice* device, tt::tt_metal::Program* program) { + device_ = device; + program_ = program; + }; protected: void configure_kernel_variant( diff --git a/tt_metal/impl/dispatch/kernel_config/mux.cpp b/tt_metal/impl/dispatch/kernel_config/mux.cpp index 2bc6b3bf0b8..0cff6abcafa 100644 --- a/tt_metal/impl/dispatch/kernel_config/mux.cpp +++ b/tt_metal/impl/dispatch/kernel_config/mux.cpp @@ -25,10 +25,10 @@ void MuxKernel::GenerateStaticConfigs() { 4; static_config_.mux_fan_in = upstream_kernels_.size(); for (int idx = 0; idx < upstream_kernels_.size(); idx++) { - static_config_.remote_rx_network_type[idx] = tt::packet_queue::DispatchRemoteNetworkType::NOC0; + static_config_.remote_rx_network_type[idx] = DispatchRemoteNetworkType::NOC0; } - static_config_.tx_network_type = (uint32_t)tt::packet_queue::DispatchRemoteNetworkType::NOC0; + static_config_.tx_network_type = (uint32_t)DispatchRemoteNetworkType::NOC0; static_config_.test_results_buf_addr_arg = 0; static_config_.test_results_buf_size_bytes = 0; static_config_.timeout_cycles = 0; @@ -43,7 +43,7 @@ void MuxKernel::GenerateStaticConfigs() { void MuxKernel::GenerateDependentConfigs() { // Upstream, expect DISPATCH_D or TUNNELER - TT_ASSERT(upstream_kernels_.size() <= tt::packet_queue::MAX_SWITCH_FAN_IN && upstream_kernels_.size() > 0); + TT_ASSERT(upstream_kernels_.size() <= MAX_SWITCH_FAN_IN && upstream_kernels_.size() > 0); uint32_t num_upstream_dispatchers = 0; for (int idx = 0; idx < upstream_kernels_.size(); idx++) { FDKernel* k = upstream_kernels_[idx]; @@ -68,10 +68,9 @@ void MuxKernel::GenerateDependentConfigs() { } uint32_t src_id = 0xC1 + (FDKernel::GetTunnelStop(device_id_) - 1) * num_upstream_dispatchers; uint32_t dest_id = 0xD1 + (FDKernel::GetTunnelStop(device_id_) - 1) * num_upstream_dispatchers; - static_config_.input_packetize_src_endpoint = - tt::packet_queue::packet_switch_4B_pack(src_id, src_id + 1, src_id + 2, src_id + 3); + static_config_.input_packetize_src_endpoint = packet_switch_4B_pack(src_id, src_id + 1, src_id + 2, src_id + 3); static_config_.input_packetize_dest_endpoint = - tt::packet_queue::packet_switch_4B_pack(dest_id, dest_id + 1, dest_id + 2, dest_id + 3); + packet_switch_4B_pack(dest_id, dest_id + 1, dest_id + 2, dest_id + 3); // Downstream, expect TUNNELER TT_ASSERT(downstream_kernels_.size() == 1); @@ -115,7 +114,7 @@ void MuxKernel::CreateKernel() { 0, // Populate input_packetize_config after static_config_.input_packetize_src_endpoint.value(), static_config_.input_packetize_dest_endpoint.value()}; - for (int idx = 0; idx < tt::packet_queue::MAX_SWITCH_FAN_IN; idx++) { + for (int idx = 0; idx < MAX_SWITCH_FAN_IN; idx++) { if (dependent_config_.remote_rx_x[idx]) { compile_args[4 + idx] |= (dependent_config_.remote_rx_x[idx].value() & 0xFF); compile_args[4 + idx] |= (dependent_config_.remote_rx_y[idx].value() & 0xFF) << 8; diff --git a/tt_metal/impl/dispatch/kernel_config/mux.hpp b/tt_metal/impl/dispatch/kernel_config/mux.hpp index e18350814f3..2bb85a1d298 100644 --- a/tt_metal/impl/dispatch/kernel_config/mux.hpp +++ b/tt_metal/impl/dispatch/kernel_config/mux.hpp @@ -10,7 +10,7 @@ typedef struct mux_static_config { std::optional rx_queue_size_words; std::optional mux_fan_in; - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_network_type; // [4:7] + std::array, MAX_SWITCH_FAN_IN> remote_rx_network_type; // [4:7] std::optional tx_network_type; std::optional test_results_buf_addr_arg; @@ -18,24 +18,23 @@ typedef struct mux_static_config { std::optional timeout_cycles; std::optional output_depacketize; std::optional output_depacketize_info; // Packed, pack with above same is input? - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_local_sem; + std::array, MAX_SWITCH_FAN_IN> input_packetize_local_sem; std::optional input_packetize_src_endpoint; // Packed w/ max 4 assumption std::optional input_packetize_dest_endpoint; // Same as src } mux_static_config_t; typedef struct mux_dependent_config { - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_x; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_y; // [4:7], dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> remote_rx_queue_id; // [4:7], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_x; // [4:7], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_y; // [4:7], dependent + std::array, MAX_SWITCH_FAN_IN> remote_rx_queue_id; // [4:7], dependent std::optional remote_tx_queue_start_addr_words; // Dependent std::optional remote_tx_queue_size_words; // Dependent std::optional remote_tx_x; // Dependent std::optional remote_tx_y; // Dependent std::optional remote_tx_queue_id; // Dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize; // Dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> - input_packetize_log_page_size; // Dependent - std::array, tt::packet_queue::MAX_SWITCH_FAN_IN> input_packetize_upstream_sem; // Dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize; // Dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize_log_page_size; // Dependent + std::array, MAX_SWITCH_FAN_IN> input_packetize_upstream_sem; // Dependent } mux_dependent_config_t; class MuxKernel : public FDKernel { diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp index e7c221c6f78..d660876cdc7 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.cpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "prefetch.hpp" #include "dispatch.hpp" -#include "dispatch/kernel_config/fd_kernel.hpp" #include "dispatch_s.hpp" #include "eth_router.hpp" @@ -28,6 +27,9 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); + static_config_.downstream_cb_log_page_size = DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE; + static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore( *program_, logical_core_, my_dispatch_constants.dispatch_buffer_pages(), GetCoreType()); @@ -83,6 +85,13 @@ void PrefetchKernel::GenerateStaticConfigs() { uint32_t issue_queue_start_addr = command_queue_start_addr + cq_start; uint32_t issue_queue_size = device_->sysmem_manager().get_issue_queue_size(cq_id_); + static_config_.downstream_cb_log_page_size = DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + if (tt::Cluster::instance().is_galaxy_cluster()) { // TODO: whys is this hard-coded for galaxy? + static_config_.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(1); + } else { + static_config_.downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(device_->num_hw_cqs()); + } + static_config_.pcie_base = issue_queue_start_addr; static_config_.pcie_size = issue_queue_size; static_config_.prefetch_q_base = @@ -103,19 +112,8 @@ void PrefetchKernel::GenerateStaticConfigs() { static_config_.cmddat_q_pages = my_dispatch_constants.prefetch_d_buffer_pages(); static_config_.my_upstream_cb_sem_id = tt::tt_metal::CreateSemaphore(*program_, logical_core_, 0, GetCoreType()); - - // Workaround for now. Need downstream to initialize my semaphore. Can't defer creating semaphore yet - { - uint32_t downstream_cb_pages; - if (tt::Cluster::instance().is_galaxy_cluster()) { // TODO: whys is this hard-coded for galaxy? - downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(1); - } else { - downstream_cb_pages = my_dispatch_constants.mux_buffer_pages(device_->num_hw_cqs()); - } - - static_config_.my_downstream_cb_sem_id = - tt::tt_metal::CreateSemaphore(*program_, logical_core_, downstream_cb_pages, GetCoreType()); - } + static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore( + *program_, logical_core_, static_config_.downstream_cb_pages.value(), GetCoreType()); static_config_.cmddat_q_log_page_size = DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; static_config_.cmddat_q_blocks = DispatchSettings::PREFETCH_D_BUFFER_BLOCKS; @@ -125,6 +123,9 @@ void PrefetchKernel::GenerateStaticConfigs() { static_config_.dispatch_s_buffer_size = 0; static_config_.dispatch_s_cb_log_page_size = 0; } else if (static_config_.is_d_variant.value()) { + dependent_config_.downstream_cb_base = my_dispatch_constants.dispatch_buffer_base(); + static_config_.downstream_cb_log_page_size = DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; + static_config_.downstream_cb_pages = my_dispatch_constants.dispatch_buffer_pages(); static_config_.my_downstream_cb_sem_id = tt::tt_metal::CreateSemaphore( *program_, logical_core_, my_dispatch_constants.dispatch_buffer_pages(), GetCoreType()); @@ -201,12 +202,7 @@ void PrefetchKernel::GenerateDependentConfigs() { found_dispatch = true; dependent_config_.downstream_logical_core = dispatch_kernel->GetLogicalCore(); - dependent_config_.downstream_cb_sem_id = - dispatch_kernel->GetStaticConfig().my_dispatch_cb_sem_id.value(); - dependent_config_.downstream_cb_base = dispatch_kernel->GetStaticConfig().dispatch_cb_base.value(); - dependent_config_.downstream_cb_log_page_size = - dispatch_kernel->GetStaticConfig().dispatch_cb_log_page_size.value(); - dependent_config_.downstream_cb_pages = dispatch_kernel->GetStaticConfig().dispatch_cb_pages.value(); + dependent_config_.downstream_cb_sem_id = dispatch_kernel->GetStaticConfig().my_dispatch_cb_sem_id; } else if (auto dispatch_s_kernel = dynamic_cast(k)) { TT_ASSERT(!found_dispatch_s, "PREFETCH kernel has multiple downstream DISPATCH kernels."); found_dispatch_s = true; @@ -233,61 +229,29 @@ void PrefetchKernel::GenerateDependentConfigs() { dependent_config_.upstream_logical_core = UNUSED_LOGICAL_CORE; dependent_config_.upstream_cb_sem_id = 0; // Used in prefetch_d only - // Downstream - // one ROUTER or direct connection to PREFETCH_D if using fabric + // Downstream, expect just one ROUTER TT_ASSERT(downstream_kernels_.size() == 1); - if (auto router_kernel = dynamic_cast(downstream_kernels_[0])) { - dependent_config_.downstream_logical_core = router_kernel->GetLogicalCore(); - dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE; - uint32_t router_idx = - router_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream - auto downstream_buffer_size = router_kernel->GetStaticConfig().rx_queue_size_words.value() << 4; - dependent_config_.downstream_cb_base = - (router_kernel->GetStaticConfig().rx_queue_start_addr_words.value() << 4) + - downstream_buffer_size * router_idx; - dependent_config_.downstream_cb_sem_id = - router_kernel->GetStaticConfig().input_packetize_local_sem[router_idx]; - dependent_config_.downstream_dispatch_s_cb_sem_id = 0; // No downstream DISPATCH_S in this case - - dependent_config_.downstream_cb_log_page_size = DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; - dependent_config_.downstream_cb_pages = - downstream_buffer_size / (1 << DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE); - } else if (auto prefetch_d = dynamic_cast(downstream_kernels_[0])) { - TT_ASSERT( - prefetch_d->GetStaticConfig().is_d_variant.value() && - !prefetch_d->GetStaticConfig().is_h_variant.value()); - - dependent_config_.downstream_logical_core = prefetch_d->GetLogicalCore(); - dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE; - dependent_config_.downstream_cb_base = prefetch_d->GetStaticConfig().cmddat_q_base.value(); - dependent_config_.downstream_cb_sem_id = prefetch_d->GetStaticConfig().my_upstream_cb_sem_id.value(); - dependent_config_.downstream_dispatch_s_cb_sem_id = 0; - - static_assert( - DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE == DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); - dependent_config_.downstream_cb_log_page_size = DispatchSettings::PREFETCH_D_BUFFER_LOG_PAGE_SIZE; - dependent_config_.downstream_cb_pages = prefetch_d->GetStaticConfig().cmddat_q_pages.value(); - } else { - TT_FATAL(false, "Path not implemented"); - } + auto router_kernel = dynamic_cast(downstream_kernels_[0]); + TT_ASSERT(router_kernel); + dependent_config_.downstream_logical_core = router_kernel->GetLogicalCore(); + dependent_config_.downstream_s_logical_core = UNUSED_LOGICAL_CORE; + uint32_t router_idx = router_kernel->GetUpstreamPort(this); // Need the port that this connects to downstream + dependent_config_.downstream_cb_base = + (router_kernel->GetStaticConfig().rx_queue_start_addr_words.value() << 4) + + (router_kernel->GetStaticConfig().rx_queue_size_words.value() << 4) * router_idx; + dependent_config_.downstream_cb_sem_id = router_kernel->GetStaticConfig().input_packetize_local_sem[router_idx]; + dependent_config_.downstream_dispatch_s_cb_sem_id = 0; // No downstream DISPATCH_S in this case } else if (static_config_.is_d_variant.value()) { - // Upstream - // One ROUTER or direct connection to PREFETCH_H if using fabric + // Upstream, expect just one ROUTER TT_ASSERT(upstream_kernels_.size() == 1); - if (auto router_kernel = dynamic_cast(upstream_kernels_[0])) { - dependent_config_.upstream_logical_core = router_kernel->GetLogicalCore(); - int router_idx = router_kernel->GetDownstreamPort(this); - dependent_config_.upstream_cb_sem_id = - router_kernel->GetStaticConfig().output_depacketize_local_sem[router_idx]; - } else if (auto prefetch_h = dynamic_cast(upstream_kernels_[0])) { - dependent_config_.upstream_logical_core = prefetch_h->GetLogicalCore(); - dependent_config_.upstream_cb_sem_id = prefetch_h->GetStaticConfig().my_downstream_cb_sem_id.value(); - } else { - TT_FATAL(false, "Path not implemented"); - } + auto router_kernel = dynamic_cast(upstream_kernels_[0]); + TT_ASSERT(router_kernel); + dependent_config_.upstream_logical_core = router_kernel->GetLogicalCore(); + int router_idx = router_kernel->GetDownstreamPort(this); + dependent_config_.upstream_cb_sem_id = + router_kernel->GetStaticConfig().output_depacketize_local_sem[router_idx]; // Downstream, expect a DISPATCH_D and s DISPATCH_S - // Prefetch_d will always be local with dispatch_d if (DispatchQueryManager::instance().dispatch_s_enabled()) { TT_ASSERT(downstream_kernels_.size() == 2); } else { @@ -301,12 +265,7 @@ void PrefetchKernel::GenerateDependentConfigs() { found_dispatch = true; dependent_config_.downstream_logical_core = dispatch_kernel->GetLogicalCore(); - dependent_config_.downstream_cb_sem_id = - dispatch_kernel->GetStaticConfig().my_dispatch_cb_sem_id.value(); - dependent_config_.downstream_cb_base = dispatch_kernel->GetStaticConfig().dispatch_cb_base.value(); - dependent_config_.downstream_cb_log_page_size = - dispatch_kernel->GetStaticConfig().dispatch_cb_log_page_size.value(); - dependent_config_.downstream_cb_pages = dispatch_kernel->GetStaticConfig().dispatch_cb_pages.value(); + dependent_config_.downstream_cb_sem_id = dispatch_kernel->GetStaticConfig().my_dispatch_cb_sem_id; } else if (auto dispatch_s_kernel = dynamic_cast(k)) { TT_ASSERT(!found_dispatch_s, "PREFETCH kernel has multiple downstream DISPATCH kernels."); found_dispatch_s = true; @@ -338,8 +297,8 @@ void PrefetchKernel::GenerateDependentConfigs() { void PrefetchKernel::CreateKernel() { std::vector compile_args = { dependent_config_.downstream_cb_base.value(), - dependent_config_.downstream_cb_log_page_size.value(), - dependent_config_.downstream_cb_pages.value(), + static_config_.downstream_cb_log_page_size.value(), + static_config_.downstream_cb_pages.value(), static_config_.my_downstream_cb_sem_id.value(), dependent_config_.downstream_cb_sem_id.value(), static_config_.pcie_base.value(), @@ -363,16 +322,10 @@ void PrefetchKernel::CreateKernel() { dependent_config_.downstream_dispatch_s_cb_sem_id.value(), static_config_.dispatch_s_buffer_size.value(), static_config_.dispatch_s_cb_log_page_size.value(), - dependent_config_.downstream_mesh_id.value_or(0), - dependent_config_.downstream_chip_id.value_or(0), - dependent_config_.upstream_mesh_id.value_or(0), - dependent_config_.upstream_chip_id.value_or(0), - dependent_config_.fabric_router_noc_xy.value_or(0xdeadbeef), - static_config_.client_interface_addr.value_or(0), static_config_.is_d_variant.value(), static_config_.is_h_variant.value(), }; - TT_ASSERT(compile_args.size() == 34); + TT_ASSERT(compile_args.size() == 28); auto my_virtual_core = device_->virtual_core_from_logical_core(logical_core_, GetCoreType()); auto upstream_virtual_core = device_->virtual_core_from_logical_core(dependent_config_.upstream_logical_core.value(), GetCoreType()); @@ -447,21 +400,3 @@ void PrefetchKernel::ConfigureCore() { detail::WriteToDeviceL1(device_, logical_core_, prefetch_q_base, prefetch_q, GetCoreType()); } } - -void PrefetchKernel::UpdateArgsForFabric( - const CoreCoord& fabric_router_virtual, - tt::tt_fabric::mesh_id_t upstream_mesh_id, - chip_id_t upstream_chip_id, - tt::tt_fabric::mesh_id_t downstream_mesh_id, - chip_id_t downstream_chip_id) { - dependent_config_.fabric_router_noc_xy = - tt::tt_metal::hal.noc_xy_encoding(fabric_router_virtual.x, fabric_router_virtual.y); - dependent_config_.upstream_mesh_id = upstream_mesh_id; - dependent_config_.upstream_chip_id = upstream_chip_id; - dependent_config_.downstream_mesh_id = downstream_mesh_id; - dependent_config_.downstream_chip_id = downstream_chip_id; - - auto& my_dispatch_constants = DispatchMemMap::get(GetCoreType()); - static_config_.client_interface_addr = - my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::FABRIC_INTERFACE); -} diff --git a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp index 23415e73f6c..3ba0a426564 100644 --- a/tt_metal/impl/dispatch/kernel_config/prefetch.hpp +++ b/tt_metal/impl/dispatch/kernel_config/prefetch.hpp @@ -3,10 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once #include "fd_kernel.hpp" -#include "mesh_graph.hpp" -#include "umd/device/types/cluster_descriptor_types.h" typedef struct prefetch_static_config { + std::optional downstream_cb_log_page_size; + std::optional downstream_cb_pages; std::optional my_downstream_cb_sem_id; std::optional pcie_base; @@ -38,31 +38,19 @@ typedef struct prefetch_static_config { std::optional is_d_variant; std::optional is_h_variant; - - // Populated if fabric is being used to talk to downstream - std::optional client_interface_addr; } prefetch_static_config_t; typedef struct prefetch_dependent_config { - std::optional upstream_logical_core; - std::optional downstream_logical_core; - std::optional downstream_s_logical_core; - - std::optional downstream_cb_base; - std::optional downstream_cb_log_page_size; - std::optional downstream_cb_pages; - std::optional downstream_cb_sem_id; + std::optional upstream_logical_core; // Dependant + std::optional downstream_logical_core; // Dependant + std::optional downstream_s_logical_core; // Dependant - std::optional upstream_cb_sem_id; + std::optional downstream_cb_base; // Dependent + std::optional downstream_cb_sem_id; // Dependant - std::optional downstream_dispatch_s_cb_sem_id; + std::optional upstream_cb_sem_id; // Dependant - // Populated if fabric is being used to talk to downstream - std::optional fabric_router_noc_xy; - std::optional upstream_mesh_id; - std::optional upstream_chip_id; - std::optional downstream_mesh_id; - std::optional downstream_chip_id; + std::optional downstream_dispatch_s_cb_sem_id; // Dependant } prefetch_dependent_config_t; class PrefetchKernel : public FDKernel { @@ -96,12 +84,6 @@ class PrefetchKernel : public FDKernel { void GenerateStaticConfigs() override; void GenerateDependentConfigs() override; void ConfigureCore() override; - void UpdateArgsForFabric( - const CoreCoord& fabric_router, - tt::tt_fabric::mesh_id_t src_mesh_id, - chip_id_t src_chip_id, - tt::tt_fabric::mesh_id_t dst_mesh_id, - chip_id_t dst_chip_id) override; const prefetch_static_config_t& GetStaticConfig() { return static_config_; } private: diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 12d0ad4b36a..f890383c491 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -15,8 +15,6 @@ #include #include "tt_metal/impl/dispatch/kernels/cq_common.hpp" #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" -#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" -#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" // The command queue write interface controls writes to the completion region, host owns the completion region read // interface Data requests from device and event states are written to the completion region @@ -53,17 +51,8 @@ constexpr uint32_t distributed_dispatcher = get_compile_time_arg_val(25); constexpr uint32_t host_completion_q_wr_ptr = get_compile_time_arg_val(26); constexpr uint32_t dev_completion_q_wr_ptr = get_compile_time_arg_val(27); constexpr uint32_t dev_completion_q_rd_ptr = get_compile_time_arg_val(28); - -// used for fd on fabric -constexpr uint32_t downstream_mesh_id = get_compile_time_arg_val(29); -constexpr uint32_t downstream_chip_id = get_compile_time_arg_val(30); -constexpr uint32_t upstream_mesh_id = get_compile_time_arg_val(31); -constexpr uint32_t upstream_chip_id = get_compile_time_arg_val(32); -constexpr uint32_t fabric_router_noc_xy = get_compile_time_arg_val(33); -constexpr uint32_t client_interface_addr = get_compile_time_arg_val(34); - -constexpr uint32_t is_d_variant = get_compile_time_arg_val(35); -constexpr uint32_t is_h_variant = get_compile_time_arg_val(36); +constexpr uint32_t is_d_variant = get_compile_time_arg_val(29); +constexpr uint32_t is_h_variant = get_compile_time_arg_val(30); constexpr uint8_t upstream_noc_index = UPSTREAM_NOC_INDEX; constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); @@ -101,9 +90,6 @@ static uint32_t write_offset[3]; // added to write address on non-host writes static uint32_t upstream_total_acquired_page_count; -static auto client_interface = - reinterpret_cast(client_interface_addr); - constexpr uint32_t packed_write_max_multicast_sub_cmds = get_packed_write_max_multicast_sub_cmds(packed_write_max_unicast_sub_cmds); constexpr uint32_t max_write_packed_large_cmd = @@ -281,9 +267,8 @@ void process_exec_buf_end_h() { template void relay_to_next_cb( uint32_t data_ptr, uint32_t length, uint32_t& block_noc_writes_to_clear, uint32_t block_next_start_addr[]) { - // TODO: Size for fabric static_assert( - preamble_size == 0 || preamble_size == sizeof(tt::packet_queue::dispatch_packet_header_t), + preamble_size == 0 || preamble_size == sizeof(dispatch_packet_header_t), "Dispatcher preamble size must be 0 or sizeof(dispatch_packet_header_t)"); // DPRINT << "relay_to_next_cb: " << data_ptr << " " << cb_fence << " " << length << ENDL(); @@ -879,7 +864,7 @@ static void process_wait() { uint32_t count = cmd->wait.count; if (barrier) { - // DPRINT << " DISPATCH BARRIER\n"; + DPRINT << " DISPATCH BARRIER\n"; noc_async_write_barrier(); } @@ -887,7 +872,7 @@ static void process_wait() { volatile tt_l1_ptr uint32_t* sem_addr = reinterpret_cast(addr); uint32_t heartbeat = 0; if (wait) { - // DPRINT << " DISPATCH WAIT " << HEX() << addr << DEC() << " count " << count << ENDL(); + DPRINT << " DISPATCH WAIT " << HEX() << addr << DEC() << " count " << count << ENDL(); do { invalidate_l1_cache(); IDLE_ERISC_HEARTBEAT_AND_RETURN(heartbeat); @@ -1000,13 +985,13 @@ static inline bool process_cmd_d( switch (cmd->base.cmd_id) { case CQ_DISPATCH_CMD_WRITE_LINEAR: WAYPOINT("DWB"); - // DPRINT << "cmd_write_linear\n"; + DPRINT << "cmd_write_linear\n"; process_write(block_noc_writes_to_clear, block_next_start_addr); WAYPOINT("DWD"); break; case CQ_DISPATCH_CMD_WRITE_LINEAR_H: - // DPRINT << "cmd_write_linear_h\n"; + DPRINT << "cmd_write_linear_h\n"; if (is_h_variant) { process_write(block_noc_writes_to_clear, block_next_start_addr); } else { @@ -1015,7 +1000,7 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_WRITE_LINEAR_H_HOST: - // DPRINT << "cmd_write_linear_h_host\n"; + DPRINT << "cmd_write_linear_h_host\n"; if (is_h_variant) { process_write_host_h(block_noc_writes_to_clear, block_next_start_addr); } else { @@ -1024,7 +1009,7 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_WRITE_PAGED: - // DPRINT << "cmd_write_paged is_dram: " << (uint32_t)cmd->write_paged.is_dram << ENDL(); + DPRINT << "cmd_write_paged is_dram: " << (uint32_t)cmd->write_paged.is_dram << ENDL(); if (cmd->write_paged.is_dram) { process_write_paged(block_noc_writes_to_clear, block_next_start_addr); } else { @@ -1033,7 +1018,7 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_WRITE_PACKED: { - // DPRINT << "cmd_write_packed" << ENDL(); + DPRINT << "cmd_write_packed" << ENDL(); uint32_t flags = cmd->write_packed.flags; if (flags & CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST) { process_write_packed( @@ -1045,17 +1030,17 @@ static inline bool process_cmd_d( } break; case CQ_DISPATCH_NOTIFY_SLAVE_GO_SIGNAL: - // DPRINT << "cmd_notify_dispatch_s_go_signal" << ENDL(); + DPRINT << "cmd_notify_dispatch_s_go_signal" << ENDL(); process_notify_dispatch_s_go_signal_cmd(); break; case CQ_DISPATCH_CMD_WRITE_PACKED_LARGE: - // DPRINT << "cmd_write_packed_large" << ENDL(); + DPRINT << "cmd_write_packed_large" << ENDL(); process_write_packed_large(l1_cache, block_noc_writes_to_clear, block_next_start_addr); break; case CQ_DISPATCH_CMD_WAIT: - // DPRINT << "cmd_wait" << ENDL(); + DPRINT << "cmd_wait" << ENDL(); process_wait(); break; @@ -1075,7 +1060,7 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_EXEC_BUF_END: - // DPRINT << "cmd_exec_buf_end\n"; + DPRINT << "cmd_exec_buf_end\n"; if (is_h_variant) { process_exec_buf_end_h(); } else { @@ -1084,12 +1069,12 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_SEND_GO_SIGNAL: - // DPRINT << "cmd_go_send_go_signal" << ENDL(); + DPRINT << "cmd_go_send_go_signal" << ENDL(); process_go_signal_mcast_cmd(); break; case CQ_DISPATCH_SET_NUM_WORKER_SEMS: - // DPRINT << "cmd_set_num_worker_sems" << ENDL(); + DPRINT << "cmd_set_num_worker_sems" << ENDL(); // This command is only used by dispatch_s ASSERT(0); cmd_ptr += sizeof(CQDispatchCmd); @@ -1098,9 +1083,8 @@ static inline bool process_cmd_d( case CQ_DISPATCH_SET_GO_SIGNAL_NOC_DATA: set_go_signal_noc_data(); break; case CQ_DISPATCH_CMD_SET_WRITE_OFFSET: - // DPRINT << "write offset: " << cmd->set_write_offset.offset0 << " " << cmd->set_write_offset.offset1 << " - // " - // << cmd->set_write_offset.offset2 << ENDL(); + DPRINT << "write offset: " << cmd->set_write_offset.offset0 << " " << cmd->set_write_offset.offset1 << " " + << cmd->set_write_offset.offset2 << ENDL(); write_offset[0] = cmd->set_write_offset.offset0; write_offset[1] = cmd->set_write_offset.offset1; write_offset[2] = cmd->set_write_offset.offset2; @@ -1108,7 +1092,7 @@ static inline bool process_cmd_d( break; case CQ_DISPATCH_CMD_TERMINATE: - // DPRINT << "dispatch terminate\n"; + DPRINT << "dispatch terminate\n"; if (is_d_variant && !is_h_variant) { relay_to_next_cb( cmd_ptr, sizeof(CQDispatchCmd), block_noc_writes_to_clear, block_next_start_addr); @@ -1140,21 +1124,21 @@ static inline bool process_cmd_h( switch (cmd->base.cmd_id) { case CQ_DISPATCH_CMD_WRITE_LINEAR_H: - // DPRINT << "dispatch_h write_linear_h\n"; + DPRINT << "dispatch_h write_linear_h\n"; process_write(block_noc_writes_to_clear, block_next_start_addr); break; case CQ_DISPATCH_CMD_WRITE_LINEAR_H_HOST: - // DPRINT << "dispatch_h linear_h_host\n"; + DPRINT << "dispatch_h linear_h_host\n"; process_write_host_h(block_noc_writes_to_clear, block_next_start_addr); break; case CQ_DISPATCH_CMD_EXEC_BUF_END: - // DPRINT << "dispatch_h exec_buf_end\n"; + DPRINT << "dispatch_h exec_buf_end\n"; process_exec_buf_end_h(); break; case CQ_DISPATCH_CMD_TERMINATE: - // DPRINT << "dispatch_h terminate\n"; + DPRINT << "dispatch_h terminate\n"; cmd_ptr += sizeof(CQDispatchCmd); done = true; break; @@ -1175,7 +1159,7 @@ static inline bool process_cmd_h( } void kernel_main() { - // DPRINT << "dispatch_" << is_h_variant << is_d_variant << ": start" << ENDL(); + DPRINT << "dispatch_" << is_h_variant << is_d_variant << ": start" << ENDL(); // Initialize local state of any additional nocs used instead of the default static_assert(my_noc_index != upstream_noc_index); if constexpr (my_noc_index != upstream_noc_index) { @@ -1269,5 +1253,5 @@ void kernel_main() { noc_async_full_barrier(); - // DPRINT << "dispatch_" << is_h_variant << is_d_variant << ": out" << ENDL(); + DPRINT << "dispatch_" << is_h_variant << is_d_variant << ": out" << ENDL(); } diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 1863f73c1d3..ea03c9ab8b8 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -10,11 +10,7 @@ // - syncs w/ dispatcher via 2 semaphores, page_ready, page_done #include -#include "dataflow_api.h" -#include "dataflow_api_addrgen.h" #include "tt_metal/impl/dispatch/kernels/cq_common.hpp" -#include "tt_metal/fabric/hw/inc/tt_fabric_api.h" -#include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include "debug/dprint.h" #include "noc/noc_parameters.h" // PCIE_ALIGNMENT @@ -50,7 +46,7 @@ constexpr uint32_t cmddat_q_size = get_compile_time_arg_val(12); // unused for prefetch_h constexpr uint32_t scratch_db_base = get_compile_time_arg_val(13); constexpr uint32_t scratch_db_size = get_compile_time_arg_val(14); -constexpr uint32_t my_downstream_sync_sem_id = get_compile_time_arg_val(15); +constexpr uint32_t downstream_sync_sem_id = get_compile_time_arg_val(15); // prefetch_d specific constexpr uint32_t cmddat_q_pages = get_compile_time_arg_val(16); @@ -65,17 +61,8 @@ constexpr uint32_t my_dispatch_s_cb_sem_id = get_compile_time_arg_val(22); constexpr uint32_t downstream_dispatch_s_cb_sem_id = get_compile_time_arg_val(23); constexpr uint32_t dispatch_s_buffer_size = get_compile_time_arg_val(24); constexpr uint32_t dispatch_s_cb_log_page_size = get_compile_time_arg_val(25); - -// used for fd on fabric -constexpr uint32_t downstream_mesh_id = get_compile_time_arg_val(26); -constexpr uint32_t downstream_chip_id = get_compile_time_arg_val(27); -constexpr uint32_t upstream_mesh_id = get_compile_time_arg_val(28); -constexpr uint32_t upstream_chip_id = get_compile_time_arg_val(29); -constexpr uint32_t fabric_router_noc_xy = get_compile_time_arg_val(30); -constexpr uint32_t client_interface_addr = get_compile_time_arg_val(31); - -constexpr uint32_t is_d_variant = get_compile_time_arg_val(32); -constexpr uint32_t is_h_variant = get_compile_time_arg_val(33); +constexpr uint32_t is_d_variant = get_compile_time_arg_val(26); +constexpr uint32_t is_h_variant = get_compile_time_arg_val(27); constexpr uint32_t prefetch_q_end = prefetch_q_base + prefetch_q_size; constexpr uint32_t cmddat_q_end = cmddat_q_base + cmddat_q_size; @@ -169,9 +156,6 @@ static uint32_t downstream_data_ptr_s = dispatch_s_buffer_base; static uint32_t block_next_start_addr[cmddat_q_blocks]; static uint32_t rd_block_idx = 0; static uint32_t upstream_total_acquired_page_count = 0; -static auto client_interface = - reinterpret_cast(client_interface_addr); - // Feature to stall the prefetcher, mainly for ExecBuf impl which reuses CmdDataQ static enum StallState { STALL_NEXT = 2, STALLED = 1, NOT_STALLED = 0 } stall_state = NOT_STALLED; @@ -938,7 +922,7 @@ uint32_t process_stall(uint32_t cmd_ptr) { WAYPOINT("PSW"); volatile tt_l1_ptr uint32_t* sem_addr = - reinterpret_cast(get_semaphore(my_downstream_sync_sem_id)); + reinterpret_cast(get_semaphore(downstream_sync_sem_id)); uint32_t heartbeat = 0; do { invalidate_l1_cache(); @@ -1479,9 +1463,6 @@ void kernel_main_hd() { void kernel_main() { DPRINT << "prefetcher_" << is_h_variant << is_d_variant << ": start" << ENDL(); - if constexpr (client_interface_addr) { - tt::tt_fabric::fabric_endpoint_init(client_interface, 0 /*Unused*/); - } if (is_h_variant and is_d_variant) { kernel_main_hd(); diff --git a/tt_metal/impl/dispatch/kernels/packet_demux.cpp b/tt_metal/impl/dispatch/kernels/packet_demux.cpp index f743f861856..36b01a59d3c 100644 --- a/tt_metal/impl/dispatch/kernels/packet_demux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_demux.cpp @@ -10,19 +10,19 @@ constexpr uint32_t endpoint_id_start_index = get_compile_time_arg_val(0); constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); -constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words * tt::packet_queue::PACKET_WORD_SIZE_BYTES; +constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; static_assert(is_power_of_2(rx_queue_size_words), "rx_queue_size_words must be a power of 2"); constexpr uint32_t demux_fan_out = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? -static_assert(demux_fan_out > 0 && demux_fan_out <= tt::packet_queue::MAX_SWITCH_FAN_OUT, +static_assert(demux_fan_out > 0 && demux_fan_out <= MAX_SWITCH_FAN_OUT, "demux fan-out 0 or higher than MAX_SWITCH_FAN_OUT"); -static_assert(tt::packet_queue::MAX_SWITCH_FAN_OUT == 4, +static_assert(MAX_SWITCH_FAN_OUT == 4, "MAX_SWITCH_FAN_OUT must be 4 for the initialization below to work"); -constexpr uint32_t remote_tx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_x[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) & 0xFF), (get_compile_time_arg_val(5) & 0xFF), @@ -30,7 +30,7 @@ constexpr uint32_t remote_tx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) & 0xFF) }; -constexpr uint32_t remote_tx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_y[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) >> 8) & 0xFF, (get_compile_time_arg_val(5) >> 8) & 0xFF, @@ -38,7 +38,7 @@ constexpr uint32_t remote_tx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) >> 8) & 0xFF }; -constexpr uint32_t remote_tx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_queue_id[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) >> 16) & 0xFF, (get_compile_time_arg_val(5) >> 16) & 0xFF, @@ -46,15 +46,15 @@ constexpr uint32_t remote_tx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) >> 16) & 0xFF }; -constexpr tt::packet_queue::DispatchRemoteNetworkType remote_tx_network_type[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr DispatchRemoteNetworkType remote_tx_network_type[MAX_SWITCH_FAN_OUT] = { - static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) + static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) }; -constexpr uint32_t remote_tx_queue_start_addr_words[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_queue_start_addr_words[MAX_SWITCH_FAN_OUT] = { get_compile_time_arg_val(8), get_compile_time_arg_val(10), @@ -62,7 +62,7 @@ constexpr uint32_t remote_tx_queue_start_addr_words[tt::packet_queue::MAX_SWITCH get_compile_time_arg_val(14) }; -constexpr uint32_t remote_tx_queue_size_words[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_queue_size_words[MAX_SWITCH_FAN_OUT] = { get_compile_time_arg_val(9), get_compile_time_arg_val(11), @@ -78,17 +78,17 @@ static_assert((demux_fan_out < 4) || is_power_of_2(remote_tx_queue_size_words[3] constexpr uint32_t remote_rx_x = get_compile_time_arg_val(16); constexpr uint32_t remote_rx_y = get_compile_time_arg_val(17); constexpr uint32_t remote_rx_queue_id = get_compile_time_arg_val(18); -constexpr tt::packet_queue::DispatchRemoteNetworkType +constexpr DispatchRemoteNetworkType remote_rx_network_type = - static_cast(get_compile_time_arg_val(19)); + static_cast(get_compile_time_arg_val(19)); -static_assert(tt::packet_queue::MAX_DEST_ENDPOINTS <= 32 && tt::packet_queue::MAX_SWITCH_FAN_OUT <= 4, - "We assume tt::packet_queue::MAX_DEST_ENDPOINTS <= 32 and tt::packet_queue::MAX_SWITCH_FAN_OUT <= 4 for the initialization below to work"); +static_assert(MAX_DEST_ENDPOINTS <= 32 && MAX_SWITCH_FAN_OUT <= 4, + "We assume MAX_DEST_ENDPOINTS <= 32 and MAX_SWITCH_FAN_OUT <= 4 for the initialization below to work"); constexpr uint32_t dest_endpoint_output_map_hi = get_compile_time_arg_val(20); constexpr uint32_t dest_endpoint_output_map_lo = get_compile_time_arg_val(21); -constexpr uint8_t dest_output_queue_id_map[tt::packet_queue::MAX_DEST_ENDPOINTS] = +constexpr uint8_t dest_output_queue_id_map[MAX_DEST_ENDPOINTS] = { (dest_endpoint_output_map_lo >> 0) & 0x3, (dest_endpoint_output_map_lo >> 2) & 0x3, @@ -136,7 +136,7 @@ tt_l1_ptr uint32_t* const test_results = constexpr uint32_t timeout_cycles = get_compile_time_arg_val(24); -constexpr bool output_depacketize[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr bool output_depacketize[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(25) >> 0) & 0x1, (get_compile_time_arg_val(25) >> 1) & 0x1, @@ -144,7 +144,7 @@ constexpr bool output_depacketize[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(25) >> 3) & 0x1 }; -constexpr uint32_t output_depacketize_log_page_size[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t output_depacketize_log_page_size[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 0) & 0xFF, (get_compile_time_arg_val(27) >> 0) & 0xFF, @@ -152,7 +152,7 @@ constexpr uint32_t output_depacketize_log_page_size[tt::packet_queue::MAX_SWITCH (get_compile_time_arg_val(29) >> 0) & 0xFF }; -constexpr uint32_t output_depacketize_downstream_sem[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t output_depacketize_downstream_sem[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 8) & 0xFF, (get_compile_time_arg_val(27) >> 8) & 0xFF, @@ -160,7 +160,7 @@ constexpr uint32_t output_depacketize_downstream_sem[tt::packet_queue::MAX_SWITC (get_compile_time_arg_val(29) >> 8) & 0xFF }; -constexpr uint32_t output_depacketize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t output_depacketize_local_sem[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 16) & 0xFF, (get_compile_time_arg_val(27) >> 16) & 0xFF, @@ -168,7 +168,7 @@ constexpr uint32_t output_depacketize_local_sem[tt::packet_queue::MAX_SWITCH_FAN (get_compile_time_arg_val(29) >> 16) & 0xFF }; -constexpr uint32_t output_depacketize_remove_header[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t output_depacketize_remove_header[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 24) & 0x1, (get_compile_time_arg_val(27) >> 24) & 0x1, @@ -177,13 +177,13 @@ constexpr uint32_t output_depacketize_remove_header[tt::packet_queue::MAX_SWITCH }; -tt::packet_queue::packet_input_queue_state_t input_queue; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_input_queue_state_t input_queue; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; -tt::packet_queue::packet_output_queue_state_t output_queues[tt::packet_queue::MAX_SWITCH_FAN_OUT]; -using output_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using output_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_output_queue_state_t output_queues[MAX_SWITCH_FAN_OUT]; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; inline uint8_t dest_output_queue_id(uint32_t dest_endpoint_id) { @@ -192,17 +192,6 @@ inline uint8_t dest_output_queue_id(uint32_t dest_endpoint_id) { } void kernel_main() { - using tt::packet_queue::PACKET_QUEUE_TEST_STARTED; - using tt::packet_queue::PQ_TEST_STATUS_INDEX; - using tt::packet_queue::PQ_TEST_MISC_INDEX; - using tt::packet_queue::PQ_TEST_WORD_CNT_INDEX; - using tt::packet_queue::PQ_TEST_CYCLES_INDEX; - using tt::packet_queue::PQ_TEST_ITER_INDEX; - using tt::packet_queue::write_test_results; - using tt::packet_queue::get_timestamp; - using tt::packet_queue::get_timestamp_32b; - using tt::packet_queue::set_64b_result; - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000); write_test_results(test_results, PQ_TEST_MISC_INDEX+1, 0xbb000000 | demux_fan_out); @@ -221,11 +210,11 @@ void kernel_main() { input_queue.init(0, rx_queue_start_addr_words, rx_queue_size_words, remote_rx_x, remote_rx_y, remote_rx_queue_id, remote_rx_network_type); - if (!tt::packet_queue::wait_all_input_output_ready(&input_queue, output_queues, timeout_cycles)) { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT); + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } @@ -283,7 +272,7 @@ void kernel_main() { } } all_outputs_finished = true; - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { output_queues[sequence_i].template prev_words_in_flight_check_flush(); all_outputs_finished &= output_queues[sequence_i].is_remote_finished(); return true; @@ -292,7 +281,7 @@ void kernel_main() { if (!timeout) { write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000002); - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { if (!output_queues[sequence_i].template output_barrier(timeout_cycles)) { timeout = true; return false; @@ -312,9 +301,9 @@ void kernel_main() { set_64b_result(test_results, iter, PQ_TEST_ITER_INDEX); if (timeout) { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT); + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); } else { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_PASS); + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); } noc_async_full_barrier(); diff --git a/tt_metal/impl/dispatch/kernels/packet_mux.cpp b/tt_metal/impl/dispatch/kernels/packet_mux.cpp index 486d8b6419d..c1e8777ec84 100644 --- a/tt_metal/impl/dispatch/kernels/packet_mux.cpp +++ b/tt_metal/impl/dispatch/kernels/packet_mux.cpp @@ -13,19 +13,19 @@ constexpr uint32_t reserved = get_compile_time_arg_val(0); // starting at rx_queue_start_addr constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); -constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words * tt::packet_queue::PACKET_WORD_SIZE_BYTES; +constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; static_assert(is_power_of_2(rx_queue_size_words), "rx_queue_size_words must be a power of 2"); constexpr uint32_t mux_fan_in = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? -static_assert(mux_fan_in > 0 && mux_fan_in <= tt::packet_queue::MAX_SWITCH_FAN_IN, - "mux fan-in 0 or higher than tt::packet_queue::MAX_SWITCH_FAN_IN"); -static_assert(tt::packet_queue::MAX_SWITCH_FAN_IN == 4, - "tt::packet_queue::MAX_SWITCH_FAN_IN must be 4 for the initialization below to work"); +static_assert(mux_fan_in > 0 && mux_fan_in <= MAX_SWITCH_FAN_IN, + "mux fan-in 0 or higher than MAX_SWITCH_FAN_IN"); +static_assert(MAX_SWITCH_FAN_IN == 4, + "MAX_SWITCH_FAN_IN must be 4 for the initialization below to work"); -constexpr uint32_t remote_rx_x[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t remote_rx_x[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(4) & 0xFF), (get_compile_time_arg_val(5) & 0xFF), @@ -33,7 +33,7 @@ constexpr uint32_t remote_rx_x[tt::packet_queue::MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(7) & 0xFF) }; -constexpr uint32_t remote_rx_y[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t remote_rx_y[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(4) >> 8) & 0xFF, (get_compile_time_arg_val(5) >> 8) & 0xFF, @@ -41,7 +41,7 @@ constexpr uint32_t remote_rx_y[tt::packet_queue::MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(7) >> 8) & 0xFF }; -constexpr uint32_t remote_rx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t remote_rx_queue_id[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(4) >> 16) & 0xFF, (get_compile_time_arg_val(5) >> 16) & 0xFF, @@ -49,12 +49,12 @@ constexpr uint32_t remote_rx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(7) >> 16) & 0xFF }; -constexpr tt::packet_queue::DispatchRemoteNetworkType remote_rx_network_type[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr DispatchRemoteNetworkType remote_rx_network_type[MAX_SWITCH_FAN_IN] = { - static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) + static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) }; constexpr uint32_t remote_tx_queue_start_addr_words = get_compile_time_arg_val(8); @@ -65,9 +65,9 @@ static_assert(is_power_of_2(remote_tx_queue_size_words), "remote_tx_queue_size_w constexpr uint32_t remote_tx_x = get_compile_time_arg_val(10); constexpr uint32_t remote_tx_y = get_compile_time_arg_val(11); constexpr uint32_t remote_tx_queue_id = get_compile_time_arg_val(12); -constexpr tt::packet_queue::DispatchRemoteNetworkType +constexpr DispatchRemoteNetworkType tx_network_type = - static_cast(get_compile_time_arg_val(13)); + static_cast(get_compile_time_arg_val(13)); constexpr uint32_t test_results_buf_addr_arg = get_compile_time_arg_val(14); constexpr uint32_t test_results_buf_size_bytes = get_compile_time_arg_val(15); @@ -86,7 +86,7 @@ constexpr uint32_t output_depacketize_downstream_sem = (output_depacketize_info constexpr uint32_t output_depacketize_local_sem = (output_depacketize_info >> 16) & 0xFF; constexpr bool output_depacketize_remove_header = (output_depacketize_info >> 24) & 0x1; -constexpr uint32_t input_packetize[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(19) >> 0) & 0x1, (get_compile_time_arg_val(20) >> 0) & 0x1, @@ -94,7 +94,7 @@ constexpr uint32_t input_packetize[tt::packet_queue::MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(22) >> 0) & 0x1 }; -constexpr uint32_t input_packetize_log_page_size[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize_log_page_size[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(19) >> 8) & 0xFF, (get_compile_time_arg_val(20) >> 8) & 0xFF, @@ -102,7 +102,7 @@ constexpr uint32_t input_packetize_log_page_size[tt::packet_queue::MAX_SWITCH_FA (get_compile_time_arg_val(22) >> 8) & 0xFF }; -constexpr uint32_t input_packetize_upstream_sem[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize_upstream_sem[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(19) >> 16) & 0xFF, (get_compile_time_arg_val(20) >> 16) & 0xFF, @@ -110,7 +110,7 @@ constexpr uint32_t input_packetize_upstream_sem[tt::packet_queue::MAX_SWITCH_FAN (get_compile_time_arg_val(22) >> 16) & 0xFF }; -constexpr uint32_t input_packetize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize_local_sem[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(19) >> 24) & 0xFF, (get_compile_time_arg_val(20) >> 24) & 0xFF, @@ -118,7 +118,7 @@ constexpr uint32_t input_packetize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_IN (get_compile_time_arg_val(22) >> 24) & 0xFF }; -constexpr uint32_t input_packetize_src_endpoint[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize_src_endpoint[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(23) >> 0) & 0xFF, (get_compile_time_arg_val(23) >> 8) & 0xFF, @@ -126,7 +126,7 @@ constexpr uint32_t input_packetize_src_endpoint[tt::packet_queue::MAX_SWITCH_FAN (get_compile_time_arg_val(23) >> 24) & 0xFF }; -constexpr uint32_t input_packetize_dest_endpoint[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint32_t input_packetize_dest_endpoint[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(24) >> 0) & 0xFF, (get_compile_time_arg_val(24) >> 8) & 0xFF, @@ -134,27 +134,15 @@ constexpr uint32_t input_packetize_dest_endpoint[tt::packet_queue::MAX_SWITCH_FA (get_compile_time_arg_val(24) >> 24) & 0xFF }; -tt::packet_queue::packet_input_queue_state_t input_queues[tt::packet_queue::MAX_SWITCH_FAN_IN]; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN]; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; -tt::packet_queue::packet_output_queue_state_t output_queue; -using output_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using output_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_output_queue_state_t output_queue; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; void kernel_main() { - using tt::packet_queue::PACKET_QUEUE_TEST_STARTED; - using tt::packet_queue::PQ_TEST_STATUS_INDEX; - using tt::packet_queue::PQ_TEST_MISC_INDEX; - using tt::packet_queue::PQ_TEST_WORD_CNT_INDEX; - using tt::packet_queue::PQ_TEST_CYCLES_INDEX; - using tt::packet_queue::PQ_TEST_ITER_INDEX; - using tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT; - using tt::packet_queue::write_test_results; - using tt::packet_queue::get_timestamp; - using tt::packet_queue::get_timestamp_32b; - using tt::packet_queue::set_64b_result; - write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000); write_test_results(test_results, PQ_TEST_MISC_INDEX+1, 0xaa000000 | mux_fan_in); @@ -174,7 +162,7 @@ void kernel_main() { output_depacketize_downstream_sem, output_depacketize_local_sem, output_depacketize_remove_header); - if (!tt::packet_queue::wait_all_input_output_ready< + if (!wait_all_input_output_ready< input_queue_network_sequence, input_queue_cb_mode_sequence, output_queue_network_sequence, @@ -207,7 +195,7 @@ void kernel_main() { } } - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { if (curr_input_partial_packet_sent && partial_packet_sent_index != sequence_i) return true; if (input_queues[sequence_i].template get_curr_packet_valid()) { @@ -248,7 +236,7 @@ void kernel_main() { if (!timeout) { write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000003); - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { input_queues[sequence_i].template send_remote_finished_notification(); return true; }); @@ -261,7 +249,7 @@ void kernel_main() { if (timeout) { write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); } else { - write_test_results(test_results, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_PASS); + write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/kernels/packet_queue.hpp b/tt_metal/impl/dispatch/kernels/packet_queue.hpp index bd9cef011c7..38d54fc9be5 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue.hpp @@ -16,9 +16,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "debug/dprint.h" -#define ENABLE_DPRINTS false - -namespace tt::packet_queue { +#define ENABLE_DPRINTS true constexpr ProgrammableCoreType fd_core_type = static_cast(FD_CORE_TYPE); @@ -1101,5 +1099,3 @@ bool wait_all_input_output_ready(packet_input_queue_state_t* input_queue_array, } return true; } - -} // namespace tt::packet_queue diff --git a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp index 3cabb1f44d7..f7be23a8d36 100644 --- a/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp +++ b/tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp @@ -5,8 +5,6 @@ #pragma once #include -namespace tt::packet_queue { - constexpr uint32_t PACKET_WORD_SIZE_BYTES = 16; constexpr uint32_t MAX_SWITCH_FAN_IN = 4; constexpr uint32_t MAX_SWITCH_FAN_OUT = 4; @@ -104,5 +102,3 @@ inline uint64_t packet_switch_dest_pack(uint32_t* dest_output_map_array, uint32_ } return result; } - -} // namespace tt::packet_queue diff --git a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp index 0056fc4b1a3..e61bfb2a3bb 100644 --- a/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp @@ -5,18 +5,16 @@ #include "dataflow_api.h" #include "tt_metal/impl/dispatch/kernels/packet_queue.hpp" -using tt::packet_queue::DispatchRemoteNetworkType; - constexpr uint32_t endpoint_id_start_index = get_compile_time_arg_val(0); constexpr uint32_t tunnel_lanes = get_compile_time_arg_val(1); constexpr uint32_t in_queue_start_addr_words = get_compile_time_arg_val(2); constexpr uint32_t in_queue_size_words = get_compile_time_arg_val(3); -constexpr uint32_t in_queue_size_bytes = in_queue_size_words * tt::packet_queue::PACKET_WORD_SIZE_BYTES; +constexpr uint32_t in_queue_size_bytes = in_queue_size_words * PACKET_WORD_SIZE_BYTES; static_assert(is_power_of_2(in_queue_size_words), "in_queue_size_words must be a power of 2"); -static_assert(tunnel_lanes <= tt::packet_queue::MAX_TUNNEL_LANES, "cannot have more than 2 tunnel directions."); +static_assert(tunnel_lanes <= MAX_TUNNEL_LANES, "cannot have more than 2 tunnel directions."); static_assert(tunnel_lanes, "tunnel directions cannot be 0. 1 => Unidirectional. 2 => Bidirectional"); -constexpr uint32_t remote_receiver_x[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_receiver_x[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(4) & 0xFF), (get_compile_time_arg_val(5) & 0xFF), @@ -30,7 +28,7 @@ constexpr uint32_t remote_receiver_x[tt::packet_queue::MAX_TUNNEL_LANES] = (get_compile_time_arg_val(13) & 0xFF) }; -constexpr uint32_t remote_receiver_y[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_receiver_y[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(4) >> 8) & 0xFF, (get_compile_time_arg_val(5) >> 8) & 0xFF, @@ -44,7 +42,7 @@ constexpr uint32_t remote_receiver_y[tt::packet_queue::MAX_TUNNEL_LANES] = (get_compile_time_arg_val(13) >> 8) & 0xFF }; -constexpr uint32_t remote_receiver_queue_id[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_receiver_queue_id[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(4) >> 16) & 0xFF, (get_compile_time_arg_val(5) >> 16) & 0xFF, @@ -58,7 +56,7 @@ constexpr uint32_t remote_receiver_queue_id[tt::packet_queue::MAX_TUNNEL_LANES] (get_compile_time_arg_val(13) >> 16) & 0xFF }; -constexpr DispatchRemoteNetworkType remote_receiver_network_type[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr DispatchRemoteNetworkType remote_receiver_network_type[MAX_TUNNEL_LANES] = { static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), @@ -72,7 +70,7 @@ constexpr DispatchRemoteNetworkType remote_receiver_network_type[tt::packet_queu static_cast((get_compile_time_arg_val(13) >> 24) & 0xFF) }; -constexpr uint32_t remote_receiver_queue_start_addr_words[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_receiver_queue_start_addr_words[MAX_TUNNEL_LANES] = { get_compile_time_arg_val(14), get_compile_time_arg_val(16), @@ -86,7 +84,7 @@ constexpr uint32_t remote_receiver_queue_start_addr_words[tt::packet_queue::MAX_ get_compile_time_arg_val(32) }; -constexpr uint32_t remote_receiver_queue_size_words[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_receiver_queue_size_words[MAX_TUNNEL_LANES] = { get_compile_time_arg_val(15), get_compile_time_arg_val(17), @@ -111,7 +109,7 @@ static_assert(is_power_of_2(remote_receiver_queue_size_words[7]), "remote_receiv static_assert(is_power_of_2(remote_receiver_queue_size_words[8]), "remote_receiver_queue_size_words must be a power of 2"); static_assert(is_power_of_2(remote_receiver_queue_size_words[9]), "remote_receiver_queue_size_words must be a power of 2"); -constexpr uint32_t remote_sender_x[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_sender_x[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(34) & 0xFF), (get_compile_time_arg_val(35) & 0xFF), @@ -125,7 +123,7 @@ constexpr uint32_t remote_sender_x[tt::packet_queue::MAX_TUNNEL_LANES] = (get_compile_time_arg_val(43) & 0xFF) }; -constexpr uint32_t remote_sender_y[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_sender_y[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(34) >> 8) & 0xFF, (get_compile_time_arg_val(35) >> 8) & 0xFF, @@ -139,7 +137,7 @@ constexpr uint32_t remote_sender_y[tt::packet_queue::MAX_TUNNEL_LANES] = (get_compile_time_arg_val(43) >> 8) & 0xFF }; -constexpr uint32_t remote_sender_queue_id[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr uint32_t remote_sender_queue_id[MAX_TUNNEL_LANES] = { (get_compile_time_arg_val(34) >> 16) & 0xFF, (get_compile_time_arg_val(35) >> 16) & 0xFF, @@ -153,7 +151,7 @@ constexpr uint32_t remote_sender_queue_id[tt::packet_queue::MAX_TUNNEL_LANES] = (get_compile_time_arg_val(43) >> 16) & 0xFF }; -constexpr tt::packet_queue::DispatchRemoteNetworkType remote_sender_network_type[tt::packet_queue::MAX_TUNNEL_LANES] = +constexpr DispatchRemoteNetworkType remote_sender_network_type[MAX_TUNNEL_LANES] = { static_cast((get_compile_time_arg_val(34) >> 24) & 0xFF), static_cast((get_compile_time_arg_val(35) >> 24) & 0xFF), @@ -177,8 +175,8 @@ tt_l1_ptr uint32_t* const kernel_status = reinterpret_cast( constexpr uint32_t timeout_cycles = get_compile_time_arg_val(46); constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(47); -tt::packet_queue::packet_input_queue_state_t input_queues[tt::packet_queue::MAX_TUNNEL_LANES]; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; -tt::packet_queue::packet_output_queue_state_t output_queues[tt::packet_queue::MAX_TUNNEL_LANES]; -using output_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using output_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence(input_queues, output_queues, timeout_cycles)) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT); + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } @@ -290,9 +281,9 @@ void kernel_main() { iter++; switch_counter++; all_outputs_finished = switch_counter >= SWITCH_THRESHOLD; - tt::packet_queue::process_queues([&](auto) -> bool { - using remote_input_networks = tt::packet_queue::NetworkTypeSequence; - using remote_input_cb_modes = tt::packet_queue::CBModeTypeSequence; + process_queues([&](auto) -> bool { + using remote_input_networks = NetworkTypeSequence; + using remote_input_cb_modes = CBModeTypeSequence; if (input_queues[sequence_i].template get_curr_packet_valid()) { bool full_packet_sent; @@ -341,10 +332,10 @@ void kernel_main() { timeout = false; write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { // inputs for this output queue - using remote_input_networks = tt::packet_queue::NetworkTypeSequence; - using remote_input_cb_modes = tt::packet_queue::CBModeTypeSequence; + using remote_input_networks = NetworkTypeSequence; + using remote_input_cb_modes = CBModeTypeSequence; if (!output_queues[sequence_i].template output_barrier(timeout_cycles)) { timeout = true; @@ -356,10 +347,10 @@ void kernel_main() { uint64_t cycles_elapsed = get_timestamp() - start_timestamp; write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); - tt::packet_queue::set_64b_result(kernel_status, data_words_sent, tt::packet_queue::PQ_TEST_WORD_CNT_INDEX); - tt::packet_queue::set_64b_result(kernel_status, cycles_elapsed, tt::packet_queue::PQ_TEST_CYCLES_INDEX); - tt::packet_queue::set_64b_result(kernel_status, iter, tt::packet_queue::PQ_TEST_ITER_INDEX); + set_64b_result(kernel_status, data_words_sent, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(kernel_status, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(kernel_status, iter, PQ_TEST_ITER_INDEX); - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_PASS); + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp b/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp index 1f1c095d965..fe85768570b 100644 --- a/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp +++ b/tt_metal/impl/dispatch/kernels/vc_packet_router.cpp @@ -8,19 +8,19 @@ constexpr uint32_t rx_queue_start_addr_words = get_compile_time_arg_val(1); constexpr uint32_t rx_queue_size_words = get_compile_time_arg_val(2); -constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words * tt::packet_queue::PACKET_WORD_SIZE_BYTES; +constexpr uint32_t rx_queue_size_bytes = rx_queue_size_words*PACKET_WORD_SIZE_BYTES; static_assert(is_power_of_2(rx_queue_size_words), "rx_queue_size_words must be a power of 2"); constexpr uint32_t router_lanes = get_compile_time_arg_val(3); // FIXME imatosevic - is there a way to do this without explicit indexes? -static_assert(router_lanes > 0 && router_lanes <= tt::packet_queue::MAX_SWITCH_FAN_OUT, - "demux fan-out 0 or higher than tt::packet_queue::MAX_SWITCH_FAN_OUT"); -static_assert(tt::packet_queue::MAX_SWITCH_FAN_OUT == 4, - "tt::packet_queue::MAX_SWITCH_FAN_OUT must be 4 for the initialization below to work"); +static_assert(router_lanes > 0 && router_lanes <= MAX_SWITCH_FAN_OUT, + "demux fan-out 0 or higher than MAX_SWITCH_FAN_OUT"); +static_assert(MAX_SWITCH_FAN_OUT == 4, + "MAX_SWITCH_FAN_OUT must be 4 for the initialization below to work"); -constexpr uint8_t remote_tx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_tx_x[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) & 0xFF), (get_compile_time_arg_val(5) & 0xFF), @@ -28,7 +28,7 @@ constexpr uint8_t remote_tx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) & 0xFF) }; -constexpr uint8_t remote_tx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_tx_y[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) >> 8) & 0xFF, (get_compile_time_arg_val(5) >> 8) & 0xFF, @@ -36,7 +36,7 @@ constexpr uint8_t remote_tx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) >> 8) & 0xFF }; -constexpr uint8_t remote_tx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_tx_queue_id[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(4) >> 16) & 0xFF, (get_compile_time_arg_val(5) >> 16) & 0xFF, @@ -44,15 +44,15 @@ constexpr uint8_t remote_tx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(7) >> 16) & 0xFF }; -constexpr tt::packet_queue::DispatchRemoteNetworkType remote_tx_network_type[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr DispatchRemoteNetworkType remote_tx_network_type[MAX_SWITCH_FAN_OUT] = { - static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) + static_cast((get_compile_time_arg_val(4) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(5) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(6) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(7) >> 24) & 0xFF) }; -constexpr uint32_t remote_tx_queue_start_addr_words[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_queue_start_addr_words[MAX_SWITCH_FAN_OUT] = { get_compile_time_arg_val(8), get_compile_time_arg_val(10), @@ -60,7 +60,7 @@ constexpr uint32_t remote_tx_queue_start_addr_words[tt::packet_queue::MAX_SWITCH get_compile_time_arg_val(14) }; -constexpr uint32_t remote_tx_queue_size_words[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint32_t remote_tx_queue_size_words[MAX_SWITCH_FAN_OUT] = { get_compile_time_arg_val(9), get_compile_time_arg_val(11), @@ -73,7 +73,7 @@ static_assert((router_lanes < 2) || is_power_of_2(remote_tx_queue_size_words[1]) static_assert((router_lanes < 3) || is_power_of_2(remote_tx_queue_size_words[2]), "remote_tx_queue_size_words must be a power of 2"); static_assert((router_lanes < 4) || is_power_of_2(remote_tx_queue_size_words[3]), "remote_tx_queue_size_words must be a power of 2"); -constexpr uint8_t remote_rx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_rx_x[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(16) & 0xFF), (get_compile_time_arg_val(17) & 0xFF), @@ -81,7 +81,7 @@ constexpr uint8_t remote_rx_x[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(19) & 0xFF) }; -constexpr uint8_t remote_rx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_rx_y[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(16) >> 8) & 0xFF, (get_compile_time_arg_val(17) >> 8) & 0xFF, @@ -89,7 +89,7 @@ constexpr uint8_t remote_rx_y[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(19) >> 8) & 0xFF }; -constexpr uint8_t remote_rx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t remote_rx_queue_id[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(16) >> 16) & 0xFF, (get_compile_time_arg_val(17) >> 16) & 0xFF, @@ -97,12 +97,12 @@ constexpr uint8_t remote_rx_queue_id[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(19) >> 16) & 0xFF }; -constexpr tt::packet_queue::DispatchRemoteNetworkType remote_rx_network_type[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr DispatchRemoteNetworkType remote_rx_network_type[MAX_SWITCH_FAN_OUT] = { - static_cast((get_compile_time_arg_val(16) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(17) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(18) >> 24) & 0xFF), - static_cast((get_compile_time_arg_val(19) >> 24) & 0xFF) + static_cast((get_compile_time_arg_val(16) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(17) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(18) >> 24) & 0xFF), + static_cast((get_compile_time_arg_val(19) >> 24) & 0xFF) }; constexpr uint32_t kernel_status_buf_addr_arg = get_compile_time_arg_val(22); @@ -114,7 +114,7 @@ tt_l1_ptr uint32_t* const kernel_status = constexpr uint32_t timeout_cycles = get_compile_time_arg_val(24); -constexpr bool output_depacketize[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr bool output_depacketize[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(25) >> 0) & 0x1, (get_compile_time_arg_val(25) >> 1) & 0x1, @@ -122,7 +122,7 @@ constexpr bool output_depacketize[tt::packet_queue::MAX_SWITCH_FAN_OUT] = (get_compile_time_arg_val(25) >> 3) & 0x1 }; -constexpr uint8_t output_depacketize_log_page_size[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t output_depacketize_log_page_size[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 0) & 0xFF, (get_compile_time_arg_val(27) >> 0) & 0xFF, @@ -130,7 +130,7 @@ constexpr uint8_t output_depacketize_log_page_size[tt::packet_queue::MAX_SWITCH_ (get_compile_time_arg_val(29) >> 0) & 0xFF }; -constexpr uint8_t output_depacketize_downstream_sem[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t output_depacketize_downstream_sem[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 8) & 0xFF, (get_compile_time_arg_val(27) >> 8) & 0xFF, @@ -138,7 +138,7 @@ constexpr uint8_t output_depacketize_downstream_sem[tt::packet_queue::MAX_SWITCH (get_compile_time_arg_val(29) >> 8) & 0xFF }; -constexpr uint8_t output_depacketize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t output_depacketize_local_sem[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 16) & 0xFF, (get_compile_time_arg_val(27) >> 16) & 0xFF, @@ -146,7 +146,7 @@ constexpr uint8_t output_depacketize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_ (get_compile_time_arg_val(29) >> 16) & 0xFF }; -constexpr uint8_t output_depacketize_remove_header[tt::packet_queue::MAX_SWITCH_FAN_OUT] = +constexpr uint8_t output_depacketize_remove_header[MAX_SWITCH_FAN_OUT] = { (get_compile_time_arg_val(26) >> 24) & 0x1, (get_compile_time_arg_val(27) >> 24) & 0x1, @@ -154,7 +154,7 @@ constexpr uint8_t output_depacketize_remove_header[tt::packet_queue::MAX_SWITCH_ (get_compile_time_arg_val(29) >> 24) & 0x1 }; -constexpr uint8_t input_packetize[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(30) >> 0) & 0x1, (get_compile_time_arg_val(31) >> 0) & 0x1, @@ -162,7 +162,7 @@ constexpr uint8_t input_packetize[tt::packet_queue::MAX_SWITCH_FAN_IN] = (get_compile_time_arg_val(33) >> 0) & 0x1 }; -constexpr uint8_t input_packetize_log_page_size[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize_log_page_size[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(30) >> 8) & 0xFF, (get_compile_time_arg_val(31) >> 8) & 0xFF, @@ -170,7 +170,7 @@ constexpr uint8_t input_packetize_log_page_size[tt::packet_queue::MAX_SWITCH_FAN (get_compile_time_arg_val(33) >> 8) & 0xFF }; -constexpr uint8_t input_packetize_upstream_sem[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize_upstream_sem[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(30) >> 16) & 0xFF, (get_compile_time_arg_val(31) >> 16) & 0xFF, @@ -178,7 +178,7 @@ constexpr uint8_t input_packetize_upstream_sem[tt::packet_queue::MAX_SWITCH_FAN_ (get_compile_time_arg_val(33) >> 16) & 0xFF }; -constexpr uint8_t input_packetize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize_local_sem[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(30) >> 24) & 0xFF, (get_compile_time_arg_val(31) >> 24) & 0xFF, @@ -186,7 +186,7 @@ constexpr uint8_t input_packetize_local_sem[tt::packet_queue::MAX_SWITCH_FAN_IN] (get_compile_time_arg_val(33) >> 24) & 0xFF }; -constexpr uint8_t input_packetize_src_endpoint[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize_src_endpoint[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(34) >> 0) & 0xFF, (get_compile_time_arg_val(34) >> 8) & 0xFF, @@ -194,7 +194,7 @@ constexpr uint8_t input_packetize_src_endpoint[tt::packet_queue::MAX_SWITCH_FAN_ (get_compile_time_arg_val(34) >> 24) & 0xFF }; -constexpr uint8_t input_packetize_dest_endpoint[tt::packet_queue::MAX_SWITCH_FAN_IN] = +constexpr uint8_t input_packetize_dest_endpoint[MAX_SWITCH_FAN_IN] = { (get_compile_time_arg_val(35) >> 0) & 0xFF, (get_compile_time_arg_val(35) >> 8) & 0xFF, @@ -202,23 +202,15 @@ constexpr uint8_t input_packetize_dest_endpoint[tt::packet_queue::MAX_SWITCH_FAN (get_compile_time_arg_val(35) >> 24) & 0xFF }; -tt::packet_queue::packet_input_queue_state_t input_queues[tt::packet_queue::MAX_SWITCH_FAN_IN]; -using input_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using input_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_input_queue_state_t input_queues[MAX_SWITCH_FAN_IN]; +using input_queue_network_sequence = NetworkTypeSequence; +using input_queue_cb_mode_sequence = CBModeTypeSequence; -tt::packet_queue::packet_output_queue_state_t output_queues[tt::packet_queue::MAX_SWITCH_FAN_OUT]; -using output_queue_network_sequence = tt::packet_queue::NetworkTypeSequence; -using output_queue_cb_mode_sequence = tt::packet_queue::CBModeTypeSequence; +packet_output_queue_state_t output_queues[MAX_SWITCH_FAN_OUT]; +using output_queue_network_sequence = NetworkTypeSequence; +using output_queue_cb_mode_sequence = CBModeTypeSequence; void kernel_main() { - using tt::packet_queue::PACKET_QUEUE_TEST_STARTED; - using tt::packet_queue::PQ_TEST_STATUS_INDEX; - using tt::packet_queue::PQ_TEST_MISC_INDEX; - using tt::packet_queue::write_kernel_status; - using tt::packet_queue::get_timestamp; - using tt::packet_queue::get_timestamp_32b; - using tt::packet_queue::set_64b_result; - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000000); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX+1, 0xbb000000 | router_lanes); @@ -238,11 +230,11 @@ void kernel_main() { output_depacketize_remove_header[i]); } - if (!tt::packet_queue::wait_all_input_output_ready(input_queues, output_queues, timeout_cycles)) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT); + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); return; } @@ -266,7 +258,7 @@ void kernel_main() { } // Loop through router lanes - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { iter++; if (input_queues[sequence_i].template get_curr_packet_valid()) { bool full_packet_sent; @@ -282,7 +274,7 @@ void kernel_main() { } // Flush for all inputs of this output queue (only 1 input) - output_queues[sequence_i].template prev_words_in_flight_check_flush, tt::packet_queue::CBModeTypeSequence>(); + output_queues[sequence_i].template prev_words_in_flight_check_flush, CBModeTypeSequence>(); if ((iter & 0xFF) == 0) { @@ -298,7 +290,7 @@ void kernel_main() { if (!timeout) { write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000002); - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { if (!output_queues[sequence_i].template output_barrier(timeout_cycles)) { timeout = true; } @@ -309,20 +301,20 @@ void kernel_main() { uint64_t cycles_elapsed = get_timestamp() - start_timestamp; if (!timeout) { write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff000003); - tt::packet_queue::process_queues([&](auto) -> bool { + process_queues([&](auto) -> bool { input_queues[sequence_i].template send_remote_finished_notification(); return true; }); } - set_64b_result(kernel_status, data_words_sent, tt::packet_queue::PQ_TEST_WORD_CNT_INDEX); - set_64b_result(kernel_status, cycles_elapsed, tt::packet_queue::PQ_TEST_CYCLES_INDEX); - set_64b_result(kernel_status, iter, tt::packet_queue::PQ_TEST_ITER_INDEX); + set_64b_result(kernel_status, data_words_sent, PQ_TEST_WORD_CNT_INDEX); + set_64b_result(kernel_status, cycles_elapsed, PQ_TEST_CYCLES_INDEX); + set_64b_result(kernel_status, iter, PQ_TEST_ITER_INDEX); if (timeout) { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_TIMEOUT); + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_TIMEOUT); } else { - write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, tt::packet_queue::PACKET_QUEUE_TEST_PASS); + write_kernel_status(kernel_status, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_PASS); write_kernel_status(kernel_status, PQ_TEST_MISC_INDEX, 0xff00005); } diff --git a/tt_metal/impl/dispatch/topology.cpp b/tt_metal/impl/dispatch/topology.cpp index ae96a31a4ed..07477f4047b 100644 --- a/tt_metal/impl/dispatch/topology.cpp +++ b/tt_metal/impl/dispatch/topology.cpp @@ -3,13 +3,10 @@ // SPDX-License-Identifier: Apache-2.0 #include "topology.hpp" -#include "dispatch_core_common.hpp" #include "kernel_config/fd_kernel.hpp" #include -#include #include #include -#include #include "kernel_config/fd_kernel.hpp" #include "kernel_config/prefetch.hpp" #include "kernel_config/dispatch.hpp" @@ -20,8 +17,6 @@ #include "kernel_config/eth_tunneler.hpp" #include "fabric_host_interface.h" -#include "program_impl.hpp" -#include "rtoptions.hpp" #include "tt_cluster.hpp" namespace tt::tt_metal { @@ -86,25 +81,6 @@ static const std::vector two_chip_arch_1cq = { {13, 1, x, 0, PACKET_ROUTER_DEMUX, {11, x, x, x}, {8, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, }; -static const std::vector two_chip_arch_1cq_fabric = { - {0, 0, 0, 0, PREFETCH_HD, /*up*/ {x, x, x, x}, /*down*/ {1, 2, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {1, 0, 0, 0, DISPATCH_HD, {0, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {2, 0, 0, 0, DISPATCH_S, {0, x, x, x}, {1, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, - - {3, 0, 1, 0, PREFETCH_H, {x, x, x, x}, {7, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {4, 0, 1, 0, DISPATCH_H, {8, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - - // Sender path PREFETCH_H -> PREFETCH_D - {5, 0, x, 0, FABRIC_ROUTER_VC, {3, x, x, x}, {7, x, x, x}}, - - // Return path DISPATCH_D -> DISPATCH_H - {6, 0, x, 0, FABRIC_ROUTER_VC, {8, x, x, x}, {4, x, x, x}}, - - {7, 1, 1, 0, PREFETCH_D, {3, x, x, x}, {8, 9, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, - {8, 1, 1, 0, DISPATCH_D, {7, x, x, x}, {9, 4, x, x}, NOC::NOC_0, NOC::NOC_1, NOC::NOC_0}, - {9, 1, 1, 0, DISPATCH_S, {7, x, x, x}, {8, x, x, x}, NOC::NOC_1, NOC::NOC_1, NOC::NOC_1}, -}; - static const std::vector two_chip_arch_2cq = { {0, 0, 0, 0, PREFETCH_HD, {x, x, x, x}, {2, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, {1, 0, 0, 1, PREFETCH_HD, {x, x, x, x}, {3, x, x, x}, NOC::NOC_0, NOC::NOC_0, NOC::NOC_0}, @@ -407,7 +383,6 @@ static const std::vector galaxy_nine_chip_arch_2cq = { }; std::vector node_id_to_kernel; -std::unordered_map> command_queue_pgms; // Helper function to automatically generate dispatch nodes given devices + num hw CQs + detection of card type. std::vector generate_nodes(const std::set& device_ids, uint32_t num_hw_cqs) { @@ -495,16 +470,8 @@ std::vector generate_nodes(const std::set& device TT_ASSERT( mmio_devices.size() == remote_devices.size() or remote_devices.empty(), "N300/T3K expects devices in mmio/remote pairs."); - std::vector nodes_for_one_mmio; - // TODO: Put this in a better place - if (llrt::RunTimeOptions::get_instance().get_fd_fabric()) { - TT_FATAL(num_hw_cqs == 1, "Only 1 CQ is supported at this time for FD on Fabric"); - // Must call tt::tt_metal::detail::InitializeFabricConfig upstream - nodes_for_one_mmio = two_chip_arch_1cq_fabric; - } else { - nodes_for_one_mmio = (num_hw_cqs == 1) ? two_chip_arch_1cq : two_chip_arch_2cq; - } - + const std::vector* nodes_for_one_mmio = + (num_hw_cqs == 1) ? &two_chip_arch_1cq : &two_chip_arch_2cq; uint32_t index_offset = 0; for (auto mmio_device_id : mmio_devices) { // Find the corresponding remote chip @@ -520,7 +487,7 @@ std::vector generate_nodes(const std::set& device TT_ASSERT(found_remote, "Couldn't find paired remote chip for device {}", mmio_device_id); // Add dispatch kernels for the mmio/remote pair - for (DispatchKernelNode node : nodes_for_one_mmio) { + for (DispatchKernelNode node : *nodes_for_one_mmio) { TT_ASSERT(node.device_id == 0 || node.device_id == 1); if (node.device_id == 0) { node.device_id = mmio_device_id; @@ -535,7 +502,7 @@ std::vector generate_nodes(const std::set& device increment_node_ids(node, index_offset); nodes.push_back(node); } - index_offset += nodes_for_one_mmio.size(); + index_offset += nodes_for_one_mmio->size(); } } } @@ -546,18 +513,9 @@ std::vector generate_nodes(const std::set& device // Populate node_id_to_kernel and set up kernel objects. Do this once at the beginning since they (1) don't need a valid // Device until fields are populated, (2) need to be connected to kernel objects for devices that aren't created yet, // and (3) the table to choose depends on total number of devices, not know at Device creation. -void populate_fd_kernels(const std::vector& devices, uint32_t num_hw_cqs) { - std::set device_ids; - for (const auto& device : devices) { - device_ids.insert(device->id()); - } - populate_fd_kernels(generate_nodes(device_ids, num_hw_cqs)); -} - void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_cqs) { populate_fd_kernels(generate_nodes(device_ids, num_hw_cqs)); } - void populate_fd_kernels(const std::vector& nodes) { // If we already had nodes from a previous run, clear them (since we could have a different # of devices or CQs). if (!node_id_to_kernel.empty()) { @@ -706,53 +664,33 @@ void populate_fd_kernels(const std::vector& nodes) { } } -void populate_cq_static_args(const std::vector& devices) { +std::unique_ptr create_and_compile_cq_program(IDevice* device) { TT_ASSERT( node_id_to_kernel.size() > 0, - "Tried to populate static args on nodes without the nodes populated (need to run populate_fd_kernels()"); - for (const auto& device : devices) { - // First pass, add device/program to all kernels for this device and generate static configs. - auto cq_program_ptr = std::make_unique(); - - for (auto node_and_kernel : node_id_to_kernel) { - if (node_and_kernel->GetDeviceId() == device->id()) { - node_and_kernel->AddDevice(device); - // TODO: Be careful downstream. Using get() on a smart pointer defeats the purpose of using them - // Memory could be changed at that location later. - node_and_kernel->AddProgram(cq_program_ptr.get()); - node_and_kernel->GenerateStaticConfigs(); - } - } - - // Move program into the storage for create_and_compile_cq_program to be called later - command_queue_pgms[device->id()] = std::move(cq_program_ptr); - } -} + "Tried to create CQ program without nodes populated (need to run populate_fd_kernels()"); -std::unique_ptr create_and_compile_cq_program(IDevice* device) { - TT_ASSERT( - command_queue_pgms.contains(device->id()), - "Tried to create and compile CQ program without static args populated (need to run populate_cq_static_args())"); - std::unique_ptr cq_program = std::move(command_queue_pgms[device->id()]); - // Third pass, populate dependent configs and create kernels for each node - for (auto node_and_kernel : node_id_to_kernel) { - if (node_and_kernel->GetDeviceId() == device->id()) { - node_and_kernel->GenerateDependentConfigs(); + // First pass, add device/program to all kernels for this device and generate static configs. + auto cq_program_ptr = std::make_unique(); + // for (auto &node_and_kernel : node_id_to_kernel) { + for (int idx = 0; idx < node_id_to_kernel.size(); idx++) { + if (node_id_to_kernel[idx]->GetDeviceId() == device->id()) { + node_id_to_kernel[idx]->AddDeviceAndProgram(device, cq_program_ptr.get()); + node_id_to_kernel[idx]->GenerateStaticConfigs(); } } - for (auto node_and_kernel : node_id_to_kernel) { - if (node_and_kernel->GetDeviceId() == device->id()) { - node_and_kernel->CreateKernel(); + // Third pass, populate dependent configs and create kernels for each node + // for (auto &node_and_kernel : node_id_to_kernel) { + for (int idx = 0; idx < node_id_to_kernel.size(); idx++) { + if (node_id_to_kernel[idx]->GetDeviceId() == device->id()) { + node_id_to_kernel[idx]->GenerateDependentConfigs(); + node_id_to_kernel[idx]->CreateKernel(); } } // Compile the program and return it so Device can register it - detail::CompileProgram(device, *cq_program, /*fd_bootloader_mode=*/true); - // Erase from map. Note: program in map is no longer valid - // It is returned from this function and the caller will take ownership of it - command_queue_pgms.erase(device->id()); - return cq_program; + detail::CompileProgram(device, *cq_program_ptr, /*fd_bootloader_mode=*/true); + return cq_program_ptr; } void configure_dispatch_cores(IDevice* device) { diff --git a/tt_metal/impl/dispatch/topology.hpp b/tt_metal/impl/dispatch/topology.hpp index 16d9a1b7458..0da7b40472c 100644 --- a/tt_metal/impl/dispatch/topology.hpp +++ b/tt_metal/impl/dispatch/topology.hpp @@ -26,16 +26,10 @@ struct DispatchKernelNode { // Create FD kernels for all given device ids. Creates all objects, but need to call create_and_compile_cq_program() use // a created Device to fill out the settings. First version automatically generates the topology based on devices, num // cqs, and detected board. Second version uses the topology passed in. -void populate_fd_kernels(const std::vector& devices, uint32_t num_hw_cqs); void populate_fd_kernels(const std::set& device_ids, uint32_t num_hw_cqs); void populate_fd_kernels(const std::vector& nodes); -// Populate the static arguments. -// Prerequisites: Must call populate_fd_kernels -void populate_cq_static_args(const std::vector& devices); - // Fill out all settings for FD kernels on the given device, and add them to a Program and return it. -// Prerequisites: Must call populate_cq_static_args std::unique_ptr create_and_compile_cq_program(tt::tt_metal::IDevice* device); // Perform additional configuration (writing to specific L1 addresses, etc.) for FD kernels on this device. diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index fc1c5541854..875196c6fe0 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -4,9 +4,15 @@ #include +#include #include +#include +#include +#include #include +#include #include +#include #include "common/executor.hpp" #include "jit_build/genfiles.hpp" @@ -166,30 +172,20 @@ void JitBuildEnv::init( // Includes // TODO(pgk) this list is insane - // clang-format off - this->includes_ = string("") + - "-I. " + - "-I.. " + - "-I" + this->root_ + " " + - "-I" + this->root_ + "ttnn " + - "-I" + this->root_ + "tt_metal " + - "-I" + this->root_ + "tt_metal/include " + - "-I" + this->root_ + "tt_metal/hw/inc " + - "-I" + this->root_ + "tt_metal/fabric/hw/inc " + - "-I" + this->root_ + "tt_metal/hostdevcommon/api " + - "-I" + this->root_ + "tt_metal/hw/inc/debug " + - "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + " " + - "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/" + this->arch_name_ + "_defines " + - "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/noc " + - "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/common " + - "-I" + this->root_ + "tt_metal/hw/ckernels/" + this->arch_name_ + "/metal/llk_io " + - "-I" + this->root_ + "tt_metal/third_party/tt_llk/tt_llk_" + this->arch_name_ + "/common/inc " + // TODO(fixme) datamovement fw shouldn't read this - "-I" + this->root_ + "tt_metal/api/" + this->aliased_arch_name_ + " " + - "-I" + this->root_ + "tt_metal/api/" + this->aliased_arch_name_ + "/tt-metalium " + - "-I" + this->root_ + "tt_metal/api/tt-metalium/ " + - "-I" + this->root_ + "tt_metal/api/ " + - "-I" + this->root_ + "tt_metal/third_party/tt_llk/tt_llk_" + this->arch_name_ + "/llk_lib "; - // clang-format on + this->includes_ = string("") + "-I. " + "-I.. " + "-I" + this->root_ + " " + "-I" + this->root_ + "ttnn " + "-I" + + this->root_ + "tt_metal " + "-I" + this->root_ + "tt_metal/include " + "-I" + this->root_ + + "tt_metal/hw/inc " + "-I" + this->root_ + "tt_metal/hostdevcommon/api " + "-I" + this->root_ + + "tt_metal/hw/inc/debug " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + + " " + "-I" + this->root_ + "tt_metal/hw/inc/" + this->aliased_arch_name_ + "/" + + this->arch_name_ + "_defines " + "-I" + this->root_ + "tt_metal/hw/inc/" + + this->aliased_arch_name_ + "/noc " + "-I" + this->root_ + "tt_metal/hw/ckernels/" + + this->arch_name_ + "/metal/common " + "-I" + this->root_ + "tt_metal/hw/ckernels/" + + this->arch_name_ + "/metal/llk_io " + "-I" + this->root_ + "tt_metal/third_party/tt_llk/tt_llk_" + + this->arch_name_ + "/common/inc " + // TODO(fixme) datamovement fw shouldn't read this + "-I" + this->root_ + "tt_metal/api/" + this->aliased_arch_name_ + " " + "-I" + this->root_ + + "tt_metal/api/" + this->aliased_arch_name_ + "/tt-metalium " + "-I" + this->root_ + + "tt_metal/api/tt-metalium/ " + "-I" + this->root_ + "tt_metal/api/ " + "-I" + this->root_ + + "tt_metal/third_party/tt_llk/tt_llk_" + this->arch_name_ + "/llk_lib "; this->lflags_ = common_flags; this->lflags_ += "-fno-exceptions -Wl,-z,max-page-size=16 -Wl,-z,common-page-size=16 -nostartfiles "; @@ -277,13 +273,9 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuil this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; this->cflags_ = env_.cflags_ + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops - - // clang-format off - this->includes_ = env_.includes_ + - "-I " + env_.root_ + "tt_metal/hw/firmware/src " + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; - // clang-format on + this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/firmware/src " + "-I " + env_.root_ + + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + "-I " + env_.root_ + + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; this->defines_ = env_.defines_; @@ -363,17 +355,13 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf this->defines_ += "-DDISABLE_L1_DATA_CACHE "; } - // clang-format off - this->includes_ = env_.includes_ + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/inc " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api " + - "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api/llk_sfpu " + - "-I" + env_.root_ + "runtime/sfpi/include " + - "-I" + env_.root_ + "tt_metal/hw/firmware/src " + - "-I" + env_.root_ + "tt_metal/third_party/tt_llk/tt_llk_" + env.arch_name_ + "/llk_lib "; - // clang-format on + this->includes_ = env_.includes_ + "-I" + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/inc " + "-I" + + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + "-I" + env_.root_ + + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io " + "-I" + env_.root_ + + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api " + "-I" + env_.root_ + + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_api/llk_sfpu " + "-I" + env_.root_ + + "runtime/sfpi/include " + "-I" + env_.root_ + "tt_metal/hw/firmware/src " + "-I" + env_.root_ + + "tt_metal/third_party/tt_llk/tt_llk_" + env.arch_name_ + "/llk_lib "; if (this->is_fw_) { this->srcs_.push_back("tt_metal/hw/firmware/src/trisc.cc"); @@ -448,12 +436,9 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; - // clang-format off - this->includes_ = env_.includes_ + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io " + - "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; - // clang-format on + this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/common " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/llk_io " + "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; this->defines_ = env_.defines_; uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( @@ -558,11 +543,9 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const JitBuil this->default_linker_opt_level_ = "Os"; this->out_path_ = this->is_fw_ ? env_.out_firmware_root_ : env_.out_kernel_root_; - // clang-format off - this->includes_ = env_.includes_ + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + - "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/llk_io "; - // clang-format on + this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/common " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + + "/metal/llk_io "; this->defines_ = env_.defines_; uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 90403bfd3fa..762a7ee3e88 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -109,9 +109,6 @@ RunTimeOptions::RunTimeOptions() { } } - const char* fb_fabric = getenv("TT_METAL_FD_FABRIC"); - fb_fabric_en = fb_fabric != nullptr; - const char* dispatch_data_collection_str = std::getenv("TT_METAL_DISPATCH_DATA_COLLECTION"); if (dispatch_data_collection_str != nullptr) { enable_dispatch_data_collection = true; @@ -401,7 +398,7 @@ void RunTimeOptions::ParseFeaturePrependDeviceCoreRisc(RunTimeDebugFeatures feat // Can't create a DispatchCoreConfig as part of the RTOptions constructor because the DispatchCoreConfig constructor // depends on RTOptions settings. -tt_metal::DispatchCoreConfig RunTimeOptions::get_dispatch_core_config() const { +tt_metal::DispatchCoreConfig RunTimeOptions::get_dispatch_core_config() { tt_metal::DispatchCoreConfig dispatch_core_config = tt_metal::DispatchCoreConfig{}; dispatch_core_config.set_dispatch_core_type(this->dispatch_core_type); return dispatch_core_config;