From 0a727d83f999ca9d30313d9e763389224216d2ed Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 21 Feb 2025 18:09:10 +0000 Subject: [PATCH 1/7] #16541 - device profiler noc tracing feature --- .clangd | 3 + .../tools/profiler/test_device_profiler.py | 18 + tt_metal/api/tt-metalium/profiler.hpp | 72 +++- .../profiler_optional_metadata.hpp | 29 ++ tt_metal/api/tt-metalium/rtoptions.hpp | 4 + tt_metal/api/tt-metalium/tt_metal.hpp | 8 +- tt_metal/hw/firmware/src/brisc.cc | 1 + tt_metal/hw/inc/dataflow_api.h | 77 ++++ tt_metal/hw/inc/ethernet/tunneling.h | 1 + tt_metal/jit_build/build.cpp | 7 + tt_metal/llrt/rtoptions.cpp | 14 +- .../profiler/CMakeLists.txt | 1 + .../kernels/loopback_dram_copy.cpp | 27 ++ .../test_noc_event_profiler.cpp | 110 ++++++ tt_metal/tools/profiler/event_metadata.hpp | 73 ++++ tt_metal/tools/profiler/kernel_profiler.hpp | 20 +- .../tools/profiler/noc_event_profiler.hpp | 153 ++++++++ tt_metal/tools/profiler/profiler.cpp | 342 ++++++++++++++++-- tt_metal/tools/profiler/tt_metal_profiler.cpp | 10 +- ttnn/cpp/pybind11/device.cpp | 7 +- ttnn/tools/profiler/op_profiler.hpp | 47 +++ 21 files changed, 974 insertions(+), 50 deletions(-) create mode 100644 tt_metal/api/tt-metalium/profiler_optional_metadata.hpp create mode 100644 tt_metal/programming_examples/profiler/test_noc_event_profiler/kernels/loopback_dram_copy.cpp create mode 100644 tt_metal/programming_examples/profiler/test_noc_event_profiler/test_noc_event_profiler.cpp create mode 100644 tt_metal/tools/profiler/event_metadata.hpp create mode 100644 tt_metal/tools/profiler/noc_event_profiler.hpp diff --git a/.clangd b/.clangd index 51066a4a270..d749f7fa16d 100644 --- a/.clangd +++ b/.clangd @@ -2,3 +2,6 @@ CompileFlags: Add: - "std=c++20" + - "-DPROFILE_NOC_EVENTS=1" + - "-DPROFILE_KERNEL=1" + - "-DCOMPILE_FOR_BRISC=1" diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index dbb2d6313f8..b93c35864cc 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -351,6 +351,24 @@ def test_timestamped_events(): assert eventCount in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong event count" +def test_noc_event_profiler(): + ENV_VAR_ARCH_NAME = os.getenv("ARCH_NAME") + assert ENV_VAR_ARCH_NAME in ["grayskull", "wormhole_b0", "blackhole"] + + testCommand = f"build/{PROG_EXMP_DIR}/test_noc_event_profiler" + clear_profiler_runtime_artifacts() + nocEventProfilerEnv = "TT_METAL_DEVICE_PROFILER_NOC_EVENTS=1" + profilerRun = os.system(f"cd {TT_METAL_HOME} && {nocEventProfilerEnv} {testCommand}") + assert profilerRun == 0 + + expected_trace_file = f"{PROFILER_LOGS_DIR}/noc_trace_dev0_ID0.json" + assert os.path.isfile(expected_trace_file) + + with open(expected_trace_file, "r") as nocTraceJson: + noc_trace_data = json.load(nocTraceJson) + assert len(noc_trace_data) == 8 + + def test_sub_device_profiler(): ARCH_NAME = os.getenv("ARCH_NAME") run_gtest_profiler_test( diff --git a/tt_metal/api/tt-metalium/profiler.hpp b/tt_metal/api/tt-metalium/profiler.hpp index 098943a7b6c..70c9f37a148 100644 --- a/tt_metal/api/tt-metalium/profiler.hpp +++ b/tt_metal/api/tt-metalium/profiler.hpp @@ -14,9 +14,12 @@ #include "program_impl.hpp" #include "profiler_state.hpp" #include "common.hpp" +#include "profiler_optional_metadata.hpp" #include "tracy/TracyTTDevice.hpp" #include "common/TracyTTDeviceData.hpp" +#include + using std::chrono::duration; using std::chrono::duration_cast; using std::chrono::nanoseconds; @@ -61,10 +64,24 @@ class DeviceProfiler { // Iterate through all zone source locations and generate hash void generateZoneSourceLocationsHashes(); + // serialize all noc trace data into per-op json trace files + void serializeJsonNocTraces( + const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, int device_id); + + void emitCSVHeader( + std::ofstream& log_file_ofs, const tt::ARCH& device_architecture, int device_core_frequency) const; + + // translates potentially-virtual coordinates recorded on Device into physical coordinates + CoreCoord getPhysicalAddressFromVirtual(const IDevice* device, const CoreCoord& c) const; + // Dumping profile result to file - void dumpResultToFile( + void logPacketData( + const IDevice* device, + std::ofstream& log_file_ofs, + nlohmann::ordered_json& noc_trace_json_log, uint32_t runID, uint32_t runHostID, + const std::string& opname, int device_id, CoreCoord core, int core_flat, @@ -73,9 +90,51 @@ class DeviceProfiler { uint32_t timer_id, uint64_t timestamp); + // logs packet data to CSV file + void logPacketDataToCSV( + const IDevice* device, + std::ofstream& log_file_ofs, + int device_id, + int core_x, + int core_y, + const std::string_view risc_name, + uint32_t timer_id, + uint64_t timestamp, + uint64_t data, + uint32_t run_id, + uint32_t run_host_id, + const std::string_view opname, + const std::string_view zone_name, + kernel_profiler::PacketTypes packet_type, + uint64_t source_line, + const std::string_view source_file); + + // dump noc trace related profile data to json file + void logNocTracePacketDataToJson( + const IDevice* device, + nlohmann::ordered_json& noc_trace_json_log, + int device_id, + int core_x, + int core_y, + const std::string_view risc_name, + uint32_t timer_id, + uint64_t timestamp, + uint64_t data, + uint32_t run_id, + uint32_t run_host_id, + const std::string_view opname, + const std::string_view zone_name, + kernel_profiler::PacketTypes packet_type, + uint64_t source_line, + const std::string_view source_file); + // Helper function for reading risc profile results void readRiscProfilerResults( - IDevice* device, CoreCoord& worker_core); + IDevice* device, + const CoreCoord& worker_core, + const std::optional& metadata, + std::ofstream& log_file_ofs, + nlohmann::ordered_json& noc_trace_json_log); // Push device results to tracy void pushTracyDeviceResults(); @@ -100,17 +159,17 @@ class DeviceProfiler { // DRAM Vector std::vector profile_buffer; - //Device events + // Device events std::set device_events; std::set device_sync_events; std::set device_sync_new_events; - //shift + // shift int64_t shift = 0; - //frequency scale + // frequency scale double freqScale = 1.0; uint32_t my_device_id = 0; @@ -128,7 +187,8 @@ class DeviceProfiler { void dumpResults( IDevice* device, const std::vector& worker_cores, - ProfilerDumpState state = ProfilerDumpState::NORMAL); + ProfilerDumpState state = ProfilerDumpState::NORMAL, + const std::optional& metadata = {}); }; } // namespace tt_metal diff --git a/tt_metal/api/tt-metalium/profiler_optional_metadata.hpp b/tt_metal/api/tt-metalium/profiler_optional_metadata.hpp new file mode 100644 index 00000000000..00982b86162 --- /dev/null +++ b/tt_metal/api/tt-metalium/profiler_optional_metadata.hpp @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "map" + +class ProfilerOptionalMetadata { + using DeviceID = uint32_t; + using RuntimeID = uint32_t; + +public: + ProfilerOptionalMetadata(std::map, std::string>&& runtime_map) : + runtime_id_to_opname(std::move(runtime_map)) {} + + const std::string& getOpName(DeviceID device_id, RuntimeID runtime_id) const { + static const std::string empty_string; + auto key = std::make_pair(device_id, runtime_id); + auto it = runtime_id_to_opname.find(key); + if (it != runtime_id_to_opname.end()) { + return it->second; + } + return empty_string; + } + +private: + std::map, std::string> runtime_id_to_opname; +}; diff --git a/tt_metal/api/tt-metalium/rtoptions.hpp b/tt_metal/api/tt-metalium/rtoptions.hpp index 18cb16a8f81..1bc3d3615a3 100644 --- a/tt_metal/api/tt-metalium/rtoptions.hpp +++ b/tt_metal/api/tt-metalium/rtoptions.hpp @@ -111,6 +111,8 @@ class RunTimeOptions { bool profile_dispatch_cores = false; bool profiler_sync_enabled = false; bool profiler_buffer_usage_enabled = false; + bool profiler_noc_events_enabled = false; + std::string profiler_noc_events_report_path; bool null_kernels = false; @@ -287,6 +289,8 @@ class RunTimeOptions { inline bool get_profiler_do_dispatch_cores() { return profile_dispatch_cores; } inline bool get_profiler_sync_enabled() { return profiler_sync_enabled; } inline bool get_profiler_buffer_usage_enabled() { return profiler_buffer_usage_enabled; } + inline bool get_profiler_noc_events_enabled() { return profiler_noc_events_enabled; } + inline std::string get_profiler_noc_events_report_path() { return profiler_noc_events_report_path; } inline void set_kernels_nullified(bool v) { null_kernels = v; } inline bool get_kernels_nullified() { return null_kernels; } diff --git a/tt_metal/api/tt-metalium/tt_metal.hpp b/tt_metal/api/tt-metalium/tt_metal.hpp index 0c8ee8dabb4..9de85184a75 100644 --- a/tt_metal/api/tt-metalium/tt_metal.hpp +++ b/tt_metal/api/tt-metalium/tt_metal.hpp @@ -14,6 +14,7 @@ #include "buffer.hpp" #include "profiler.hpp" #include "llrt/tt_cluster.hpp" +#include "profiler_optional_metadata.hpp" namespace tt::tt_metal { inline namespace v0 { @@ -220,7 +221,10 @@ void ProfilerSync(ProfilerSyncState state); * | satate | Dumpprofiler various states | ProfilerDumpState | | False | * */ void DumpDeviceProfileResults( - IDevice* device, std::vector& worker_cores, ProfilerDumpState = ProfilerDumpState::NORMAL); + IDevice* device, + std::vector& worker_cores, + ProfilerDumpState = ProfilerDumpState::NORMAL, + const std::optional& metadata = {}); /** * Traverse all cores and read device side profiler data and dump results into device side CSV log @@ -232,7 +236,7 @@ void DumpDeviceProfileResults( * | device | The device holding the program being profiled. | Device * | | True | * | satate | Dumpprofiler various states | ProfilerDumpState | | False | * */ -void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState = ProfilerDumpState::NORMAL); +void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState = ProfilerDumpState::NORMAL, const std::optional& metadata = {}); /** * Set the directory for device-side CSV logs produced by the profiler instance in the tt-metal module diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 03e78e1be33..cdd1083ad55 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -7,6 +7,7 @@ #include // clang-format off +#undef PROFILE_NOC_EVENTS #include "risc_common.h" #include "tensix.h" #include "tensix_types.h" diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index cd1e33c417c..c3bd17e9fa0 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -28,6 +28,7 @@ #include "dev_msgs.h" #include "dataflow_api_common.h" #include "dataflow_api_addrgen.h" +#include "tools/profiler/kernel_profiler.hpp" // clang-format off /** @@ -500,6 +501,9 @@ inline void noc_async_read( Read requests - use static VC Read responses - assigned VCs dynamically */ + + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ,src_noc_addr,size, -1); + if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { noc_async_read_one_packet(src_noc_addr, dst_local_l1_addr, size, noc); } else { @@ -519,6 +523,8 @@ void noc_async_read_one_packet_set_state(std::uint64_t src_noc_addr, std::uint32 Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_SET_STATE, src_noc_addr, size, -1); + WAYPOINT("RP3W"); while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("RP3D"); @@ -554,6 +560,8 @@ FORCE_INLINE void noc_async_read_one_packet_with_state( Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_WITH_STATE, static_cast(src_noc_addr), 0, -1); + WAYPOINT("RP4W"); while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("RP4D"); @@ -586,6 +594,8 @@ void noc_async_read_set_state(std::uint64_t src_noc_addr, uint8_t noc = noc_inde Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_SET_STATE,src_noc_addr,0,-1); + WAYPOINT("RP5W"); while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("RP5D"); @@ -618,6 +628,8 @@ FORCE_INLINE void noc_async_read_with_state( Read requests - use static VC Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_WITH_STATE,src_noc_addr,size,-1); + WAYPOINT("NAVW"); // In order to sanitize, need to grab full noc addr + xfer size from state. @@ -678,6 +690,8 @@ void noc_async_read_inc_num_issued(std::uint32_t num_issued_reads_inc, uint8_t n FORCE_INLINE void noc_async_write_one_packet( std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) { + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_,dst_noc_addr,size,NOC_UNICAST_WRITE_VC); + WAYPOINT("NWPW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); while (!noc_cmd_buf_ready(noc, write_cmd_buf)); @@ -722,6 +736,9 @@ void noc_async_write_multicast_one_packet( bool linked = false, bool multicast_path_reserve = true, uint8_t noc = noc_index) { + + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_MULTICAST,dst_noc_addr_multicast,size, NOC_MULTICAST_WRITE_VC); + WAYPOINT("NWPW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); while (!noc_cmd_buf_ready(noc, write_cmd_buf)); @@ -760,6 +777,8 @@ void noc_async_write_multicast_one_packet( template FORCE_INLINE void noc_async_write_one_packet_set_state( std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index, uint8_t vc = NOC_UNICAST_WRITE_VC) { + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_SET_STATE, dst_noc_addr, size, vc); + WAYPOINT("NWPW"); while (!noc_cmd_buf_ready(noc, write_cmd_buf)); WAYPOINT("NWPD"); @@ -787,6 +806,8 @@ FORCE_INLINE void noc_async_write_one_packet_set_state( template FORCE_INLINE void noc_async_write_one_packet_with_state( std::uint32_t src_local_l1_addr, std::uint32_t dst_noc_addr, uint8_t noc = noc_index) { + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_WITH_STATE, 0ull, 0, -1); + WAYPOINT("NWPW"); while (!noc_cmd_buf_ready(noc, write_cmd_buf)); WAYPOINT("NWPD"); @@ -820,6 +841,8 @@ FORCE_INLINE void noc_async_read_page( Read requests - use static VC Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ID(NocEventType::READ, id, s.page_size, -1); + s.noc_async_read_page(id, dst_local_l1_addr, offset, noc); } @@ -834,6 +857,8 @@ FORCE_INLINE void noc_async_read_tile( Read requests - use static VC Read responses - assigned VCs dynamically */ + RECORD_NOC_EVENT_WITH_ID(NocEventType::READ, id, s.page_size, -1); + s.noc_async_read_tile(id, dst_local_l1_addr, offset, noc); } @@ -863,6 +888,9 @@ inline void noc_async_write( if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size, noc); } else { + + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_, dst_noc_addr, size, NOC_UNICAST_WRITE_VC); + WAYPOINT("NAWW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); ncrisc_noc_fast_write_any_len( @@ -877,6 +905,8 @@ FORCE_INLINE void noc_async_write_tile( const InterleavedAddrGenFast& s, std::uint32_t src_local_l1_addr, uint8_t noc = noc_index) { + RECORD_NOC_EVENT_WITH_ID(NocEventType::WRITE_, id, s.page_size, NOC_UNICAST_WRITE_VC); + s.noc_async_write_tile(id, src_local_l1_addr, noc); } @@ -952,6 +982,7 @@ inline void noc_async_write_multicast( } else { WAYPOINT("NMWW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_MULTICAST, dst_noc_addr_multicast, size, NOC_MULTICAST_WRITE_VC); ncrisc_noc_fast_write_any_len( noc, write_cmd_buf, @@ -1164,6 +1195,9 @@ inline void noc_async_write_multicast_exclude_region( * Return value: None */ void noc_async_read_barrier(uint8_t noc = noc_index) { + + RECORD_NOC_EVENT(NocEventType::READ_BARRIER_START); + WAYPOINT("NRBW"); if constexpr (noc_mode == DM_DYNAMIC_NOC) { while (!ncrisc_dynamic_noc_reads_flushed(noc)) { @@ -1174,6 +1208,8 @@ void noc_async_read_barrier(uint8_t noc = noc_index) { } invalidate_l1_cache(); WAYPOINT("NRBD"); + + RECORD_NOC_EVENT(NocEventType::READ_BARRIER_END); } /** @@ -1186,6 +1222,9 @@ void noc_async_read_barrier(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_write_barrier(uint8_t noc = noc_index) { + + RECORD_NOC_EVENT(NocEventType::WRITE_BARRIER_START); + WAYPOINT("NWBW"); if constexpr (noc_mode == DM_DYNAMIC_NOC) { while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc)) { @@ -1196,6 +1235,8 @@ void noc_async_write_barrier(uint8_t noc = noc_index) { } invalidate_l1_cache(); WAYPOINT("NWBD"); + + RECORD_NOC_EVENT(NocEventType::WRITE_BARRIER_END); } /** @@ -1205,6 +1246,9 @@ void noc_async_write_barrier(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_writes_flushed(uint8_t noc = noc_index) { + + RECORD_NOC_EVENT(NocEventType::WRITE_FLUSH); + WAYPOINT("NWFW"); if constexpr (noc_mode == DM_DYNAMIC_NOC) { while (!ncrisc_dynamic_noc_nonposted_writes_sent(noc)) { @@ -1246,6 +1290,9 @@ void noc_async_posted_writes_flushed(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { + + RECORD_NOC_EVENT(NocEventType::ATOMIC_BARRIER); + WAYPOINT("NABW"); if constexpr (noc_mode == DM_DYNAMIC_NOC) { while (!ncrisc_dynamic_noc_nonposted_atomics_flushed(noc_idx)) { @@ -1269,6 +1316,7 @@ void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { FORCE_INLINE void noc_async_full_barrier(uint8_t noc_idx = noc_index) { invalidate_l1_cache(); + RECORD_NOC_EVENT(NocEventType::FULL_BARRIER); if constexpr (noc_mode == DM_DYNAMIC_NOC) { WAYPOINT("NFBW"); while (!ncrisc_dynamic_noc_reads_flushed(noc_idx)); @@ -1313,6 +1361,9 @@ void noc_async_full_barrier(uint8_t noc_idx = noc_index) { // clang-format on FORCE_INLINE void noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { + + RECORD_NOC_EVENT(NocEventType::SEMAPHORE_WAIT); + WAYPOINT("NSW"); do { invalidate_l1_cache(); @@ -1337,6 +1388,9 @@ void noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { // clang-format on FORCE_INLINE void noc_semaphore_wait_min(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { + + RECORD_NOC_EVENT(NocEventType::SEMAPHORE_WAIT); + WAYPOINT("NSMW"); do { invalidate_l1_cache(); @@ -1361,6 +1415,9 @@ void noc_semaphore_wait_min(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) // clang-format on FORCE_INLINE void noc_semaphore_set(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { + + RECORD_NOC_EVENT(NocEventType::SEMAPHORE_SET); + // set semaphore value to val (*sem_addr) = val; } @@ -1389,6 +1446,9 @@ void noc_semaphore_set(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { // clang-format on FORCE_INLINE void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t noc = noc_index) { + + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_INLINE, addr, 32, NOC_UNICAST_WRITE_VC); + WAYPOINT("NWIW"); DEBUG_SANITIZE_NOC_ADDR(noc, addr, 4); noc_fast_write_dw_inline( @@ -1425,6 +1485,9 @@ void noc_semaphore_inc(uint64_t addr, uint32_t incr, uint8_t noc_id = noc_index) [REFER TO grayskull/noc/noc.h for the documentation of noc_atomic_increment()] Generic increment with 32-bit wrap. */ + + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::SEMAPHORE_INC,addr,0, NOC_UNICAST_WRITE_VC); + WAYPOINT("NSIW"); DEBUG_SANITIZE_NOC_ADDR(noc_id, addr, 4); DEBUG_INSERT_DELAY(TransactionAtomic); @@ -1464,6 +1527,9 @@ FORCE_INLINE uint32_t noc_async_read_tile_dram_sharded_set_state( src_addr_ = bank_base_address + bank_to_dram_offset[bank_id]; src_noc_xy = dram_bank_to_noc_xy[noc][bank_id]; + RECORD_NOC_EVENT_WITH_ADDR( + NocEventType::READ_DRAM_SHARDED_SET_STATE, uint64_t(src_noc_xy) << 32, page_size, (use_vc) ? vc : -1); + WAYPOINT("NRTW"); while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("NRTD"); @@ -1483,6 +1549,8 @@ FORCE_INLINE uint32_t noc_async_read_tile_dram_sharded_set_state( FORCE_INLINE void noc_async_read_tile_dram_sharded_with_state( uint32_t src_base_addr, uint32_t src_addr, uint32_t dest_addr, uint32_t trid = 0, uint8_t noc = noc_index) { + RECORD_NOC_EVENT(NocEventType::READ_DRAM_SHARDED_WITH_STATE); + uint32_t src_addr_; src_addr_ = src_base_addr + src_addr; @@ -1504,6 +1572,8 @@ void noc_async_read_tile_dram_sharded_with_state( FORCE_INLINE void noc_async_read_tile_dram_sharded_with_state_with_trid( uint32_t src_base_addr, uint32_t src_addr, uint32_t dest_addr, uint32_t trid = 0, uint8_t noc = noc_index) { + RECORD_NOC_EVENT(NocEventType::READ_DRAM_SHARDED_WITH_STATE); + WAYPOINT("NRDW"); #ifndef ARCH_GRAYSKULL ncrisc_noc_fast_read_with_transaction_id(noc, read_cmd_buf, src_base_addr, src_addr, dest_addr, trid); @@ -1513,6 +1583,9 @@ void noc_async_read_tile_dram_sharded_with_state_with_trid( FORCE_INLINE void noc_async_read_tile_dram_sharded_set_trid(uint32_t trid = 0, uint8_t noc = noc_index) { + + RECORD_NOC_EVENT(NocEventType::READ_SET_TRID); + WAYPOINT("NSTW"); #ifndef ARCH_GRAYSKULL ncrisc_noc_set_transaction_id(noc, read_cmd_buf, trid); @@ -1523,6 +1596,7 @@ void noc_async_read_tile_dram_sharded_set_trid(uint32_t trid = 0, uint8_t noc = FORCE_INLINE void noc_async_read_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) { WAYPOINT("NBTW"); + RECORD_NOC_EVENT(NocEventType::READ_BARRIER_WITH_TRID); #ifndef ARCH_GRAYSKULL while (!ncrisc_noc_read_with_transaction_id_flushed(noc, trid)); #endif @@ -1534,6 +1608,7 @@ FORCE_INLINE void noc_async_write_one_packet_with_trid_set_state( std::uint64_t dst_noc_addr, uint8_t cmd_buf = write_cmd_buf, uint8_t noc = noc_index) { #ifndef ARCH_GRAYSKULL WAYPOINT("NAWW"); + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_WITH_TRID_SET_STATE, dst_noc_addr, 0, NOC_UNICAST_WRITE_VC); while (!noc_cmd_buf_ready(noc, cmd_buf)); WAYPOINT("NAWD"); uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(NOC_UNICAST_WRITE_VC) | @@ -1560,6 +1635,7 @@ FORCE_INLINE void noc_async_write_one_packet_with_trid_with_state( uint8_t noc = noc_index) { #ifndef ARCH_GRAYSKULL WAYPOINT("NWPW"); + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_WITH_TRID_WITH_STATE, 0ull, size, -1); while (!noc_cmd_buf_ready(noc, cmd_buf)); WAYPOINT("NWPD"); @@ -1580,6 +1656,7 @@ FORCE_INLINE void noc_async_write_one_packet_with_trid( std::uint32_t trid, uint8_t noc = noc_index) { WAYPOINT("NAWW"); + RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_WITH_TRID, dst_noc_addr, size, -1); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); #ifndef ARCH_GRAYSKULL ncrisc_noc_fast_write_any_len( diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index 2a77e284bdb..e090a75cedb 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -8,6 +8,7 @@ #include "erisc.h" #include "eth_l1_address_map.h" #include "noc_nonblocking_api.h" +#include "hw/inc/dataflow_api.h" inline void RISC_POST_STATUS(uint32_t status) { volatile uint32_t* ptr = (volatile uint32_t*)(NOC_CFG(ROUTER_CFG_2)); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 53df41da5c9..f2f566cdae8 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -141,6 +141,13 @@ void JitBuildEnv::init( this->defines_ += "-DPROFILE_KERNEL=1 "; } } + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_noc_events_enabled()) { + // force profiler on if noc events are being profiled + if (not tt::tt_metal::getDeviceProfilerState()) { + this->defines_ += "-DPROFILE_KERNEL=1 "; + } + this->defines_ += "-DPROFILE_NOC_EVENTS=1 "; + } if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { this->defines_ += "-DWATCHER_ENABLED "; diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 762a7ee3e88..72f418f6680 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -72,7 +72,19 @@ RunTimeOptions::RunTimeOptions() { profiler_sync_enabled = true; } } - const char* profile_buffer_usage_str = std::getenv("TT_METAL_MEM_PROFILER"); + + const char *profiler_noc_events_str = std::getenv("TT_METAL_DEVICE_PROFILER_NOC_EVENTS"); + if (profiler_noc_events_str != nullptr && profiler_noc_events_str[0] == '1') { + profiler_enabled = true; + profiler_noc_events_enabled = true; + } + + const char *profiler_noc_events_report_path_str = std::getenv("TT_METAL_DEVICE_PROFILER_NOC_EVENTS_RPT_PATH"); + if (profiler_noc_events_report_path_str != nullptr) { + profiler_noc_events_report_path = profiler_noc_events_report_path_str; + } + + const char *profile_buffer_usage_str = std::getenv("TT_METAL_MEM_PROFILER"); if (profile_buffer_usage_str != nullptr && profile_buffer_usage_str[0] == '1') { profiler_buffer_usage_enabled = true; } diff --git a/tt_metal/programming_examples/profiler/CMakeLists.txt b/tt_metal/programming_examples/profiler/CMakeLists.txt index d2eb27d21c6..1adebb3c1e5 100644 --- a/tt_metal/programming_examples/profiler/CMakeLists.txt +++ b/tt_metal/programming_examples/profiler/CMakeLists.txt @@ -5,6 +5,7 @@ set(PROFILER_EXAMPLES_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_multi_op/test_multi_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_cores/test_dispatch_cores.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_timestamped_events/test_timestamped_events.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_noc_event_profiler/test_noc_event_profiler.cpp ) CREATE_PGM_EXAMPLES_EXE("${PROFILER_EXAMPLES_SRCS}" "profiler") diff --git a/tt_metal/programming_examples/profiler/test_noc_event_profiler/kernels/loopback_dram_copy.cpp b/tt_metal/programming_examples/profiler/test_noc_event_profiler/kernels/loopback_dram_copy.cpp new file mode 100644 index 00000000000..301034c7a86 --- /dev/null +++ b/tt_metal/programming_examples/profiler/test_noc_event_profiler/kernels/loopback_dram_copy.cpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +void kernel_main() { + std::uint32_t l1_buffer_addr = get_arg_val(0); + + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_buffer_src_bank = get_arg_val(2); + + std::uint32_t dram_buffer_dst_addr = get_arg_val(3); + std::uint32_t dram_buffer_dst_bank = get_arg_val(4); + + std::uint32_t dram_buffer_size = get_arg_val(5); + + std::uint64_t dram_buffer_src_noc_addr = + get_noc_addr_from_bank_id(dram_buffer_src_bank, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); + noc_async_read_barrier(); + + std::uint64_t dram_buffer_dst_noc_addr = + get_noc_addr_from_bank_id(dram_buffer_dst_bank, dram_buffer_dst_addr); + noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); + noc_async_write_barrier(); +} diff --git a/tt_metal/programming_examples/profiler/test_noc_event_profiler/test_noc_event_profiler.cpp b/tt_metal/programming_examples/profiler/test_noc_event_profiler/test_noc_event_profiler.cpp new file mode 100644 index 00000000000..2d5f8ac09a1 --- /dev/null +++ b/tt_metal/programming_examples/profiler/test_noc_event_profiler/test_noc_event_profiler.cpp @@ -0,0 +1,110 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +/* + * 1. Host writes data to buffer in DRAM + * 2. dram_copy kernel on logical core {0, 0} BRISC copies data from buffer + * in step 1. to buffer in L1 and back to another buffer in DRAM + * 3. Host reads from buffer written to in step 2. + */ + +using namespace tt::tt_metal; + +int main(int argc, char** argv) { + if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) { + TT_THROW("Test not supported w/ slow dispatch, exiting"); + } + + bool pass = true; + + try { + /* + * Silicon accelerator setup + */ + constexpr int device_id = 0; + IDevice* device = CreateDevice(device_id); + + /* + * Setup program and command queue to execute along with its buffers and kernels to use + */ + CommandQueue& cq = device->command_queue(); + Program program = CreateProgram(); + + constexpr CoreCoord core = {0, 0}; + + KernelHandle dram_copy_kernel_id = CreateKernel( + program, + "tt_metal/programming_examples/profiler/test_noc_event_profiler/kernels/loopback_dram_copy.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + constexpr uint32_t single_tile_size = 2 * (32 * 32); + constexpr uint32_t num_tiles = 5; + constexpr uint32_t dram_buffer_size = single_tile_size * num_tiles; + + tt::tt_metal::InterleavedBufferConfig dram_config{ + .device = device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::DRAM}; + tt::tt_metal::InterleavedBufferConfig l1_config{ + .device = device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1}; + + auto l1_buffer = CreateBuffer(l1_config); + + auto input_dram_buffer = CreateBuffer(dram_config); + const uint32_t input_dram_buffer_addr = input_dram_buffer->address(); + + auto output_dram_buffer = CreateBuffer(dram_config); + const uint32_t output_dram_buffer_addr = output_dram_buffer->address(); + + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + + /* + * Create input data and runtime arguments, then execute + */ + std::vector input_vec = create_random_vector_of_bfloat16( + dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + EnqueueWriteBuffer(cq, input_dram_buffer, input_vec, false); + + const std::vector runtime_args = { + l1_buffer->address(), + input_dram_buffer->address(), + input_bank_id, + output_dram_buffer->address(), + output_bank_id, + l1_buffer->size()}; + + SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + + EnqueueProgram(cq, program, false); + Finish(cq); + + DumpDeviceProfileResults(device, program); + pass &= CloseDevice(device); + + } catch (const std::exception& e) { + tt::log_error(tt::LogTest, "Test failed with exception!"); + tt::log_error(tt::LogTest, "{}", e.what()); + + throw; + } + + if (pass) { + tt::log_info(tt::LogTest, "Test Passed"); + } else { + TT_THROW("Test Failed"); + } + + return 0; +} diff --git a/tt_metal/tools/profiler/event_metadata.hpp b/tt_metal/tools/profiler/event_metadata.hpp new file mode 100644 index 00000000000..c77c6d71e27 --- /dev/null +++ b/tt_metal/tools/profiler/event_metadata.hpp @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include // for std::memcpy + +struct alignas(uint64_t) KernelProfilerNocEventMetadata { + enum class NocEventType : unsigned char { + UNDEF = 0, + READ, + READ_SET_STATE, + READ_SET_TRID, + READ_WITH_STATE, + READ_WITH_STATE_AND_TRID, + READ_BARRIER_START, + READ_BARRIER_END, + READ_BARRIER_WITH_TRID, + READ_DRAM_SHARDED_SET_STATE, + READ_DRAM_SHARDED_WITH_STATE, + + WRITE_, + WRITE_WITH_TRID, + WRITE_INLINE, + WRITE_MULTICAST, + WRITE_SET_STATE, + WRITE_WITH_STATE, + WRITE_WITH_TRID_SET_STATE, + WRITE_WITH_TRID_WITH_STATE, + WRITE_BARRIER_START, + WRITE_BARRIER_END, + WRITE_BARRIER_WITH_TRID, + WRITE_FLUSH, + + FULL_BARRIER, + + ATOMIC_BARRIER, + SEMAPHORE_INC, + SEMAPHORE_WAIT, + SEMAPHORE_SET, + + UNSUPPORTED + }; + enum class NocType : unsigned char { UNDEF = 0, NOC_0 = 1, NOC_1 = 2 }; + using NocVirtualChannel = int8_t; + static constexpr int8_t INVALID_COORD_VAL = -1; + + KernelProfilerNocEventMetadata() = default; + + // used during deserialization + explicit KernelProfilerNocEventMetadata(const uint64_t raw_data) { + std::memcpy(this, &raw_data, sizeof(KernelProfilerNocEventMetadata)); + } + + // these can be compressed to bit-fields if needed, but byte orientated has less overhead + int8_t dst_x = INVALID_COORD_VAL; + int8_t dst_y = INVALID_COORD_VAL; + int8_t mcast_end_dst_x = INVALID_COORD_VAL; + int8_t mcast_end_dst_y = INVALID_COORD_VAL; + NocEventType noc_xfer_type; + NocType noc_type : 4; + NocVirtualChannel noc_vc : 4; + uint16_t num_bytes; + + uint64_t asU64() const { + uint64_t ret; + std::memcpy(&ret, this, sizeof(uint64_t)); + return ret; + } +}; +static_assert(sizeof(KernelProfilerNocEventMetadata) == sizeof(uint64_t)); diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 817f5e1847d..e6e23c33a18 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -268,8 +268,9 @@ __attribute__((noinline)) void finish_profiler() { } __attribute__((noinline)) void quick_push() { -#if defined(DISPATCH_KERNEL) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) && \ - (defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_IDLE_ERISC)) +#if ( \ + defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC) || \ + defined(COMPILE_FOR_IDLE_ERISC)) SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); mark_time_at_index_inlined(wIndex, hash); @@ -381,6 +382,14 @@ struct profileScopeAccumulate { } }; +// performs quick push to DRAM if buffers appear full +template +inline __attribute__((always_inline)) void flush_to_dram_if_full() { + if (not bufferHasRoom()) { + quick_push(); + } +} + template inline __attribute__((always_inline)) void timeStampedData(uint64_t data) { if (bufferHasRoom()) { @@ -400,6 +409,8 @@ inline __attribute__((always_inline)) void recordEvent(uint16_t event_id) { } } // namespace kernel_profiler +#include "noc_event_profiler.hpp" + // Not dispatch #if ( \ !defined(DISPATCH_KERNEL) || \ @@ -485,4 +496,9 @@ inline __attribute__((always_inline)) void recordEvent(uint16_t event_id) { #define DeviceRecordEvent(event_id) +// null macros when noc tracing is disabled +#define RECORD_NOC_EVENT_WITH_ADDR(type, noc_addr, num_bytes, vc) +#define RECORD_NOC_EVENT_WITH_ID(type, noc_id, num_bytes, vc) +#define RECORD_NOC_EVENT(type) + #endif diff --git a/tt_metal/tools/profiler/noc_event_profiler.hpp b/tt_metal/tools/profiler/noc_event_profiler.hpp new file mode 100644 index 00000000000..8835e21d3c8 --- /dev/null +++ b/tt_metal/tools/profiler/noc_event_profiler.hpp @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#if defined(PROFILE_NOC_EVENTS) && (defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || \ + defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_IDLE_ERISC)) + +#include +#include +#include "event_metadata.hpp" +#include "risc_attribs.h" +#include + +namespace noc_event_profiler { + +FORCE_INLINE +std::pair decode_noc_coord_reg_to_coord(uint16_t noc_xy_bits) { + constexpr uint32_t NOC_COORD_MASK = 0x3F; + uint32_t x = noc_xy_bits & NOC_COORD_MASK; + uint32_t y = (noc_xy_bits >> NOC_ADDR_NODE_ID_BITS) & NOC_COORD_MASK; + return {x, y}; +} + +FORCE_INLINE +std::pair decode_noc_xy_to_coord(uint32_t noc_xy) { + // shift so that coordinate is in LSB + return decode_noc_coord_reg_to_coord(noc_xy >> NOC_COORD_REG_OFFSET); +} + +FORCE_INLINE +std::pair decode_noc_addr_to_coord(uint64_t noc_addr) { + return decode_noc_coord_reg_to_coord(noc_addr >> NOC_ADDR_LOCAL_BITS); +} + +FORCE_INLINE +std::tuple decode_noc_addr_to_multicast_coord(uint64_t noc_addr) { + // coordinates are stored as two packed pairs. End coordinate is in lower + // bits like normal noc address; Start coordinate is in higher bits + auto [xend, yend] = decode_noc_coord_reg_to_coord(noc_addr >> NOC_ADDR_LOCAL_BITS); + auto [xstart, ystart] = + decode_noc_coord_reg_to_coord(noc_addr >> NOC_ADDR_LOCAL_BITS + (2 * NOC_ADDR_NODE_ID_BITS)); + + return {xstart, ystart, xend, yend}; +} + +template +FORCE_INLINE std::pair decode_noc_id_into_coord(uint32_t id, uint8_t noc = noc_index) { + uint32_t bank_offset_index = interleaved_addr_gen::get_bank_offset_index(id); + uint32_t bank_index = interleaved_addr_gen::get_bank_index(id, bank_offset_index); + return decode_noc_xy_to_coord(interleaved_addr_gen::get_noc_xy(bank_index, noc)); +} + +template +FORCE_INLINE void recordNocEvent( + KernelProfilerNocEventMetadata::NocEventType noc_event_type, + int32_t dst_x = -1, + int32_t dst_y = -1, + uint32_t num_bytes = 0, + int8_t vc = -1, + uint8_t noc = noc_index) { + KernelProfilerNocEventMetadata ev_md; + ev_md.dst_x = dst_x; + ev_md.dst_y = dst_y; + ev_md.noc_xfer_type = noc_event_type; + ev_md.num_bytes = std::min(std::numeric_limits::max(), num_bytes); + ev_md.noc_vc = vc; + ev_md.noc_type = + (noc == 1) ? KernelProfilerNocEventMetadata::NocType::NOC_1 : KernelProfilerNocEventMetadata::NocType::NOC_0; + + kernel_profiler::flush_to_dram_if_full(); + kernel_profiler::timeStampedData(ev_md.asU64()); +} + +template +FORCE_INLINE void recordMulticastNocEvent( + KernelProfilerNocEventMetadata::NocEventType noc_event_type, + int32_t mcast_dst_start_x, + int32_t mcast_dst_start_y, + int32_t mcast_dst_end_x, + int32_t mcast_dst_end_y, + uint32_t num_bytes, + int8_t vc = -1, + uint8_t noc = noc_index) { + KernelProfilerNocEventMetadata ev_md; + ev_md.dst_x = mcast_dst_start_x; + ev_md.dst_y = mcast_dst_start_y; + ev_md.mcast_end_dst_x = mcast_dst_end_x; + ev_md.mcast_end_dst_y = mcast_dst_end_y; + + ev_md.noc_xfer_type = noc_event_type; + ev_md.num_bytes = std::min(std::numeric_limits::max(), num_bytes); + + ev_md.noc_vc = vc; + ev_md.noc_type = + (noc == 1) ? KernelProfilerNocEventMetadata::NocType::NOC_1 : KernelProfilerNocEventMetadata::NocType::NOC_0; + + kernel_profiler::flush_to_dram_if_full(); + kernel_profiler::timeStampedData(ev_md.asU64()); +} + +template +inline void recordNocEventWithID( + KernelProfilerNocEventMetadata::NocEventType noc_event_type, NocIDU32 noc_id, uint32_t num_bytes, int8_t vc) { + static_assert(std::is_same_v); + auto [decoded_x, decoded_y] = decode_noc_id_into_coord(noc_id); + recordNocEvent(noc_event_type, decoded_x, decoded_y, num_bytes, vc); +} + +template +inline void recordNocEventWithAddr( + KernelProfilerNocEventMetadata::NocEventType noc_event_type, NocAddrU64 noc_addr, uint32_t num_bytes, int8_t vc) { + static_assert(std::is_same_v); + auto [decoded_x, decoded_y] = decode_noc_addr_to_coord(noc_addr); + recordNocEvent(noc_event_type, decoded_x, decoded_y, num_bytes, vc); +} + +} // namespace noc_event_profiler + +#define RECORD_NOC_EVENT_WITH_ADDR(event_type, noc_addr, num_bytes, vc) \ + { \ + using NocEventType = KernelProfilerNocEventMetadata::NocEventType; \ + if constexpr (event_type != NocEventType::WRITE_MULTICAST) { \ + noc_event_profiler::recordNocEventWithAddr(event_type, noc_addr, num_bytes, vc); \ + } else { \ + auto [mcast_dst_start_x, mcast_dst_start_y, mcast_dst_end_x, mcast_dst_end_y] = \ + noc_event_profiler::decode_noc_addr_to_multicast_coord(noc_addr); \ + noc_event_profiler::recordMulticastNocEvent( \ + event_type, mcast_dst_start_x, mcast_dst_start_y, mcast_dst_end_x, mcast_dst_end_y, num_bytes, vc); \ + } \ + } + +#define RECORD_NOC_EVENT_WITH_ID(event_type, noc_id, num_bytes, vc) \ + { \ + using NocEventType = KernelProfilerNocEventMetadata::NocEventType; \ + noc_event_profiler::recordNocEventWithID(event_type, noc_id, num_bytes, vc); \ + } + +#define RECORD_NOC_EVENT(event_type) \ + { \ + using NocEventType = KernelProfilerNocEventMetadata::NocEventType; \ + noc_event_profiler::recordNocEvent(event_type); \ + } + +#else + +// null macros when noc tracing is disabled +#define RECORD_NOC_EVENT_WITH_ADDR(type, noc_addr, num_bytes, vc) +#define RECORD_NOC_EVENT_WITH_ID(type, noc_id, num_bytes, vc) +#define RECORD_NOC_EVENT(type) + +#endif diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 537c5bd0779..ab9af3a1ed9 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -17,6 +17,7 @@ #include #include "tracy/Tracy.hpp" #include +#include "event_metadata.hpp" #include "llrt.hpp" @@ -29,7 +30,11 @@ static kernel_profiler::PacketTypes get_packet_type(uint32_t timer_id) { } void DeviceProfiler::readRiscProfilerResults( - IDevice* device, CoreCoord& worker_core) { + IDevice* device, + const CoreCoord& worker_core, + const std::optional& metadata, + std::ofstream& log_file_ofs, + nlohmann::ordered_json& noc_trace_json_log) { ZoneScoped; auto device_id = device->id(); @@ -66,6 +71,14 @@ void DeviceProfiler::readRiscProfilerResults( return; } + // helper function to lookup opname from runtime id if metadata is available + auto getOpNameIfAvailable = [&metadata](auto device_id, auto runtime_id) { + return (metadata.has_value()) ? metadata->getOpName(device_id, runtime_id) : ""; + }; + + // translate worker core virtual coord to phys coordinates + auto phys_coord = getPhysicalAddressFromVirtual(device, worker_core); + int riscNum = 0; for (int riscEndIndex = 0; riscEndIndex < riscCount; riscEndIndex++) { uint32_t bufferEndIndex = control_buffer[riscEndIndex]; @@ -99,6 +112,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t opTime_H = 0; uint32_t opTime_L = 0; + std::string opname; for (int index = bufferRiscShift; index < (bufferRiscShift + bufferEndIndex); index += kernel_profiler::PROFILER_L1_MARKER_UINT32_SIZE) { if (!newRunStart && profile_buffer[index] == 0 && profile_buffer[index + 1] == 0) { @@ -114,6 +128,8 @@ void DeviceProfiler::readRiscProfilerResults( runCounterRead = profile_buffer[index + 1] & 0xFFFF; runHostCounterRead = (profile_buffer[index + 1] >> 16) & 0xFFFF; + opname = getOpNameIfAvailable(device_id, runHostCounterRead); + } else { uint32_t timer_id = (profile_buffer[index] >> 12) & 0x7FFFF; kernel_profiler::PacketTypes packet_type = get_packet_type(timer_id); @@ -149,11 +165,15 @@ void DeviceProfiler::readRiscProfilerResults( worker_core.y, runCounterRead); - dumpResultToFile( + logPacketData( + device, + log_file_ofs, + noc_trace_json_log, runCounterRead, runHostCounterRead, + opname, device_id, - worker_core, + phys_coord, coreFlatID, riscType, 0, @@ -166,11 +186,15 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_H = opTime_H; uint32_t time_L = opTime_L; - dumpResultToFile( + logPacketData( + device, + log_file_ofs, + noc_trace_json_log, runCounterRead, runHostCounterRead, + opname, device_id, - worker_core, + phys_coord, coreFlatID, riscType, sum, @@ -185,11 +209,15 @@ void DeviceProfiler::readRiscProfilerResults( index += kernel_profiler::PROFILER_L1_MARKER_UINT32_SIZE; uint32_t data_H = profile_buffer[index]; uint32_t data_L = profile_buffer[index + 1]; - dumpResultToFile( + logPacketData( + device, + log_file_ofs, + noc_trace_json_log, runCounterRead, runHostCounterRead, + opname, device_id, - worker_core, + phys_coord, coreFlatID, riscType, (uint64_t(data_H) << 32) | data_L, @@ -200,11 +228,15 @@ void DeviceProfiler::readRiscProfilerResults( case kernel_profiler::TS_EVENT: { uint32_t time_H = profile_buffer[index] & 0xFFF; uint32_t time_L = profile_buffer[index + 1]; - dumpResultToFile( + logPacketData( + device, + log_file_ofs, + noc_trace_json_log, runCounterRead, runHostCounterRead, + opname, device_id, - worker_core, + phys_coord, coreFlatID, riscType, 0, @@ -234,9 +266,13 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp) { } } -void DeviceProfiler::dumpResultToFile( +void DeviceProfiler::logPacketData( + const IDevice* device, + std::ofstream& log_file_ofs, + nlohmann::ordered_json& noc_trace_json_log, uint32_t run_id, uint32_t run_host_id, + const std::string& opname, int device_id, CoreCoord core, int core_flat, @@ -244,10 +280,6 @@ void DeviceProfiler::dumpResultToFile( uint64_t data, uint32_t timer_id, uint64_t timestamp) { - std::pair deviceCore = {device_id, core}; - std::filesystem::path log_path = output_dir / DEVICE_SIDE_LOG; - std::ofstream log_file; - kernel_profiler::PacketTypes packet_type = get_packet_type(timer_id); uint32_t t_id = timer_id & 0xFFFF; std::string zone_name = ""; @@ -292,20 +324,27 @@ void DeviceProfiler::dumpResultToFile( firstTimestamp(timestamp); - if (!std::filesystem::exists(log_path)) { - log_file.open(log_path); - log_file << "ARCH: " << get_string_lowercase(device_architecture) - << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl; - log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], data, run ID, " - "run host ID, zone name, type, source line, source file" - << std::endl; - } else { - log_file.open(log_path, std::ios_base::app); - } + logPacketDataToCSV( + device, + log_file_ofs, + device_id, + core.x, + core.y, + tracy::riscName[risc_num], + t_id, + timestamp, + data, + run_id, + run_host_id, + opname, + zone_name, + packet_type, + source_line, + source_file); - // log_file << fmt::format("{:4},{:3},{:3},{:>7},{:7},{:15},{:15},{:5},{:>25},{:>6},{:6},{}", - log_file << fmt::format( - "{},{},{},{},{},{},{},{},{},{},{},{},{}", + logNocTracePacketDataToJson( + device, + noc_trace_json_log, device_id, core.x, core.y, @@ -315,12 +354,216 @@ void DeviceProfiler::dumpResultToFile( data, run_id, run_host_id, + opname, zone_name, - magic_enum::enum_name(packet_type), + packet_type, source_line, source_file); - log_file << std::endl; - log_file.close(); +} + +void DeviceProfiler::logPacketDataToCSV( + const IDevice* device, + std::ofstream& log_file_ofs, + int device_id, + int core_x, + int core_y, + const std::string_view risc_name, + uint32_t timer_id, + uint64_t timestamp, + uint64_t data, + uint32_t run_id, + uint32_t run_host_id, + const std::string_view opname, + const std::string_view zone_name, + kernel_profiler::PacketTypes packet_type, + uint64_t source_line, + const std::string_view source_file) { + log_file_ofs << fmt::format( + "{},{},{},{},{},{},{},{},{},{},{},{},{}", + device_id, + core_x, + core_y, + risc_name, + timer_id, + timestamp, + data, + run_id, + run_host_id, + zone_name, + magic_enum::enum_name(packet_type), + source_line, + source_file) + << std::endl; +} + +void DeviceProfiler::logNocTracePacketDataToJson( + const IDevice* device, + nlohmann::ordered_json& noc_trace_json_log, + int device_id, + int core_x, + int core_y, + const std::string_view risc_name, + uint32_t timer_id, + uint64_t timestamp, + uint64_t data, + uint32_t run_id, + uint32_t run_host_id, + const std::string_view opname, + const std::string_view zone_name, + kernel_profiler::PacketTypes packet_type, + uint64_t source_line, + const std::string_view source_file) { + if (packet_type == kernel_profiler::ZONE_START || packet_type == kernel_profiler::ZONE_END) { + if ((risc_name == "NCRISC" || risc_name == "BRISC") && + (zone_name.starts_with("TRUE-KERNEL-END") || zone_name.ends_with("-KERNEL"))) { + tracy::TTDeviceEventPhase zone_phase = (packet_type == kernel_profiler::ZONE_END) + ? tracy::TTDeviceEventPhase::end + : tracy::TTDeviceEventPhase::begin; + noc_trace_json_log.push_back(nlohmann::ordered_json{ + {"run_id", run_id}, + {"run_host_id", run_host_id}, + {"op_name", opname}, + {"proc", risc_name}, + {"zone", zone_name}, + {"zone_phase", magic_enum::enum_name(zone_phase)}, + {"sx", core_x}, + {"sy", core_y}, + {"timestamp", timestamp}, + }); + } + + } else if (packet_type == kernel_profiler::TS_DATA) { + KernelProfilerNocEventMetadata ev_md(data); + + nlohmann::ordered_json data = { + {"run_id", run_id}, + {"run_host_id", run_host_id}, + {"op_name", opname}, + {"proc", risc_name}, + {"noc", magic_enum::enum_name(ev_md.noc_type)}, + {"vc", int(ev_md.noc_vc)}, + {"sx", core_x}, + {"sy", core_y}, + {"num_bytes", uint32_t(ev_md.num_bytes)}, + {"type", magic_enum::enum_name(ev_md.noc_xfer_type)}, + {"timestamp", timestamp}, + }; + + // handle dst coordinates correctly for different NocEventType + if (ev_md.dst_x == -1 || ev_md.dst_y == -1 || + ev_md.noc_xfer_type == KernelProfilerNocEventMetadata::NocEventType::READ_WITH_STATE || + ev_md.noc_xfer_type == KernelProfilerNocEventMetadata::NocEventType::WRITE_WITH_STATE) { + // DO NOT emit destination coord; it isn't meaningful + + } else if (ev_md.noc_xfer_type == KernelProfilerNocEventMetadata::NocEventType::WRITE_MULTICAST) { + auto phys_start_coord = getPhysicalAddressFromVirtual(device, {ev_md.dst_x, ev_md.dst_y}); + data["mcast_start_x"] = phys_start_coord.x; + data["mcast_start_y"] = phys_start_coord.y; + auto phys_end_coord = getPhysicalAddressFromVirtual(device, {ev_md.mcast_end_dst_x, ev_md.mcast_end_dst_y}); + data["mcast_end_x"] = phys_end_coord.x; + data["mcast_end_y"] = phys_end_coord.y; + } else { + auto phys_coord = getPhysicalAddressFromVirtual(device, {ev_md.dst_x, ev_md.dst_y}); + data["dx"] = phys_coord.x; + data["dy"] = phys_coord.y; + } + + noc_trace_json_log.push_back(std::move(data)); + } +} + +void DeviceProfiler::emitCSVHeader( + std::ofstream& log_file_ofs, const tt::ARCH& device_architecture, int device_core_frequency) const { + log_file_ofs << "ARCH: " << get_string_lowercase(device_architecture) + << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl; + log_file_ofs << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], data, run ID, " + "run host ID, zone name, type, source line, source file" + << std::endl; +} + +void DeviceProfiler::serializeJsonNocTraces( + const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, int device_id) { + // create output directory if it does not exist + std::filesystem::create_directories(output_dir); + if (!std::filesystem::is_directory(output_dir)) { + log_error( + "Could not write noc event json trace to '{}' because the directory path could not be created!", + output_dir); + return; + } + + // bin events by runtime id + using RuntimeID = uint32_t; + std::unordered_map events_by_opname; + for (auto& json_event : noc_trace_json_log) { + RuntimeID runtime_id = json_event.value("run_host_id", -1); + events_by_opname[runtime_id].push_back(json_event); + } + + // sort events in each opname group by proc first, then timestamp + for (auto& [runtime_id, events] : events_by_opname) { + std::sort(events.begin(), events.end(), [](const auto& a, const auto& b) { + auto sx_a = a.value("sx", 0); + auto sy_a = a.value("sy", 0); + auto sx_b = b.value("sx", 0); + auto sy_b = b.value("sy", 0); + auto proc_a = a.value("proc", ""); + auto proc_b = b.value("proc", ""); + auto timestamp_a = a.value("timestamp", 0); + auto timestamp_b = b.value("timestamp", 0); + return std::tie(sx_a, sy_a, proc_a, timestamp_a) < std::tie(sx_b, sy_b, proc_b, timestamp_b); + }); + } + + // for each opname in events_by_opname, adjust timestamps to be relative to the smallest timestamp within the group + // with identical sx,sy,proc + for (auto& [runtime_id, events] : events_by_opname) { + std::tuple reference_event_loc; + uint64_t reference_timestamp = 0; + for (auto& event : events) { + std::string zone = event.value("zone", ""); + std::string zone_phase = event.value("zone_phase", ""); + uint64_t curr_timestamp = event.value("timestamp", 0); + // if -KERNEL::begin event is found, reset the reference timestamp + if (zone.ends_with("-KERNEL") && zone_phase == "begin") { + reference_timestamp = curr_timestamp; + } + + // fix timestamp to be relative to reference_timestamp + event["timestamp"] = curr_timestamp - reference_timestamp; + } + } + + log_info("Writing profiler noc traces to '{}'", output_dir); + for (auto& [runtime_id, events] : events_by_opname) { + // dump events to a json file inside directory output_dir named after the opname + std::filesystem::path rpt_path = output_dir; + std::string op_name = events.front().value("op_name", "UnknownOP"); + if (!op_name.empty()) { + rpt_path /= fmt::format("noc_trace_dev{}_{}_ID{}.json", device_id, op_name, runtime_id); + } else { + rpt_path /= fmt::format("noc_trace_dev{}_ID{}.json", device_id, runtime_id); + } + std::ofstream rpt_ofs(rpt_path); + if (!rpt_ofs) { + log_error("Could not write noc event json trace to '{}'", rpt_path); + return; + } + rpt_ofs << nlohmann::json(std::move(events)).dump(4) << std::endl; + } +} + +CoreCoord DeviceProfiler::getPhysicalAddressFromVirtual(const IDevice* device, const CoreCoord& c) const { + if (c.x >= hal.get_virtual_worker_start_x() && c.y >= hal.get_virtual_worker_start_y()) { + auto logical_x = c.x - hal.get_virtual_worker_start_x(); + auto logical_y = c.y - hal.get_virtual_worker_start_y(); + + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device->id()); + // if the core has an address in the 'virtual' space, it must be CoreType::WORKER + return soc_desc.get_physical_core_from_logical_core({logical_x, logical_y}, CoreType::WORKER); + } else { + return c; + } } DeviceProfiler::DeviceProfiler(const bool new_logs) { @@ -393,7 +636,11 @@ void DeviceProfiler::generateZoneSourceLocationsHashes() { } } -void DeviceProfiler::dumpResults(IDevice* device, const std::vector& worker_cores, ProfilerDumpState state) { +void DeviceProfiler::dumpResults( + IDevice* device, + const std::vector& worker_cores, + ProfilerDumpState state, + const std::optional& metadata) { #if defined(TRACY_ENABLE) ZoneScoped; @@ -418,8 +665,37 @@ void DeviceProfiler::dumpResults(IDevice* device, const std::vector& } } - for (auto worker_core : worker_cores) { - readRiscProfilerResults(device, worker_core); + // open CSV log file + std::filesystem::path log_path = output_dir / DEVICE_SIDE_LOG; + std::ofstream log_file_ofs; + + // append to existing CSV log file if it already exists + if (std::filesystem::exists(log_path)) { + log_file_ofs.open(log_path, std::ios_base::app); + } else { + log_file_ofs.open(log_path); + emitCSVHeader(log_file_ofs, device_architecture, device_core_frequency); + } + + // create nlohmann json log object + nlohmann::ordered_json noc_trace_json_log = nlohmann::json::array(); + + if (!log_file_ofs) { + log_error("Could not open kernel profiler dump file '{}'", log_path); + } else { + for (const auto& worker_core : worker_cores) { + readRiscProfilerResults(device, worker_core, metadata, log_file_ofs, noc_trace_json_log); + } + + // if defined, used profiler_noc_events_report_path to write json log. otherwise use output_dir + auto rpt_path = tt::llrt::RunTimeOptions::get_instance().get_profiler_noc_events_report_path(); + if (rpt_path.empty()) { + rpt_path = output_dir; + } + + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_noc_events_enabled()) { + serializeJsonNocTraces(noc_trace_json_log, rpt_path, device_id); + } } } else { log_warning("DRAM profiler buffer is not initialized"); diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 308367b5bd2..7b4c800c231 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -625,7 +625,7 @@ void InitDeviceProfiler(IDevice* device) { #endif } -void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState state) { +void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState state, const std::optional& metadata) { #if defined(TRACY_ENABLE) ZoneScoped; std::vector workerCores; @@ -640,8 +640,8 @@ void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState state) { auto virtualCore = device->virtual_core_from_logical_core(core, CoreType::ETH); workerCores.push_back(virtualCore); } - device->push_work([device, workerCores, state]() mutable { - DumpDeviceProfileResults(device, workerCores, state); + device->push_work([device, workerCores, state, metadata]() mutable { + DumpDeviceProfileResults(device, workerCores, state, metadata); if (deviceDeviceTimePair.find(device->id()) != deviceDeviceTimePair.end() and state == ProfilerDumpState::CLOSE_DEVICE_SYNC) { for (auto& connected_device : deviceDeviceTimePair.at(device->id())) { @@ -654,7 +654,7 @@ void DumpDeviceProfileResults(IDevice* device, ProfilerDumpState state) { #endif } -void DumpDeviceProfileResults(IDevice* device, std::vector& worker_cores, ProfilerDumpState state) { +void DumpDeviceProfileResults(IDevice* device, std::vector& worker_cores, ProfilerDumpState state, const std::optional& metadata) { #if defined(TRACY_ENABLE) ZoneScoped; std::string name = fmt::format("Device Dump {}", device->id()); @@ -740,7 +740,7 @@ void DumpDeviceProfileResults(IDevice* device, std::vector& worker_co } } tt_metal_device_profiler_map.at(device_id).setDeviceArchitecture(device->arch()); - tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores, state); + tt_metal_device_profiler_map.at(device_id).dumpResults(device, worker_cores, state, metadata); if (state == ProfilerDumpState::LAST_CLOSE_DEVICE) { // Process is ending, no more device dumps are coming, reset your ref on the buffer so deallocate is the diff --git a/ttnn/cpp/pybind11/device.cpp b/ttnn/cpp/pybind11/device.cpp index dfd65d61d4f..ddebc507f9c 100644 --- a/ttnn/cpp/pybind11/device.cpp +++ b/ttnn/cpp/pybind11/device.cpp @@ -20,12 +20,17 @@ #include #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include +#include "tools/profiler/op_profiler.hpp" + using namespace tt::tt_metal; namespace py = pybind11; namespace { -inline void DumpDeviceProfiler(IDevice* device) { tt::tt_metal::detail::DumpDeviceProfileResults(device); } +inline void DumpDeviceProfiler(IDevice* device) { + ProfilerOptionalMetadata prof_metadata(tt::tt_metal::op_profiler::runtime_id_to_opname.exportMap()); + tt::tt_metal::detail::DumpDeviceProfileResults(device, ProfilerDumpState::NORMAL, prof_metadata); +} } // namespace namespace ttnn { diff --git a/ttnn/tools/profiler/op_profiler.hpp b/ttnn/tools/profiler/op_profiler.hpp index 597e077d9b3..024e97387ab 100644 --- a/ttnn/tools/profiler/op_profiler.hpp +++ b/ttnn/tools/profiler/op_profiler.hpp @@ -108,6 +108,46 @@ inline auto compute_program_hash( } #endif +class thread_safe_runtime_id_to_ops_map { + using DEVICE_ID = uint32_t; + using RUNTIME_ID = uint32_t; + using KEY_TYPE = std::pair; + using VAL_TYPE = std::string; + using RUNTIME_ID_TO_OP_MAP = std::map; + +public: + RUNTIME_ID_TO_OP_MAP::iterator find(DEVICE_ID device_id, RUNTIME_ID runtime_id) { + std::scoped_lock lock(map_mutex); + return map.find({device_id, runtime_id}); + } + RUNTIME_ID_TO_OP_MAP::iterator begin() { + std::scoped_lock lock(map_mutex); + return map.begin(); + } + RUNTIME_ID_TO_OP_MAP::iterator end() { + std::scoped_lock lock(map_mutex); + return map.end(); + } + VAL_TYPE at(DEVICE_ID device_id, RUNTIME_ID runtime_id) { + std::scoped_lock lock(map_mutex); + return map.at({device_id, runtime_id}); + } + void emplace(KEY_TYPE key, VAL_TYPE opname) { + std::scoped_lock lock(map_mutex); + map.emplace(key, opname); + } + RUNTIME_ID_TO_OP_MAP exportMap() { + std::scoped_lock lock(map_mutex); + return map; + } + +private: + std::mutex map_mutex; + RUNTIME_ID_TO_OP_MAP map; +}; + +inline thread_safe_runtime_id_to_ops_map runtime_id_to_opname{}; + static void start_tracy_zone(const string& source, const string& functName, uint32_t lineNum, uint32_t color = 0) { #if defined(TRACY_ENABLE) auto tracySrcLoc = @@ -375,6 +415,13 @@ inline std::string op_meta_data_serialized_json( const bool useCachedOps = std::getenv("TT_METAL_PROFILER_NO_CACHE_OP_INFO") == nullptr; auto program_hash = compute_program_hash(operation_attributes, tensor_args); + auto as_string = [](std::string_view v) -> std::string { return {v.data(), v.size()}; }; + std::string opName = as_string(tt::stl::get_type_name()); + if constexpr (requires { device_operation_t::get_type_name(operation_attributes); }) { + opName = device_operation_t::get_type_name(operation_attributes); + } + runtime_id_to_opname.emplace({device_id, program.get_runtime_id()}, opName); + if (!useCachedOps || (cached_ops.find(device_id) == cached_ops.end()) || (cached_ops.at(device_id).find(program_hash) == cached_ops.at(device_id).end())) { auto j = From 99c4b315b6577176751790c408e140f1fa7f05ad Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 7 Mar 2025 15:41:04 +0000 Subject: [PATCH 2/7] remove redundant IDevice* passed into logPacketData --- tt_metal/api/tt-metalium/profiler.hpp | 5 +---- tt_metal/tools/profiler/profiler.cpp | 22 +++++++--------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/tt_metal/api/tt-metalium/profiler.hpp b/tt_metal/api/tt-metalium/profiler.hpp index 70c9f37a148..0fdd5e4a8a5 100644 --- a/tt_metal/api/tt-metalium/profiler.hpp +++ b/tt_metal/api/tt-metalium/profiler.hpp @@ -72,11 +72,10 @@ class DeviceProfiler { std::ofstream& log_file_ofs, const tt::ARCH& device_architecture, int device_core_frequency) const; // translates potentially-virtual coordinates recorded on Device into physical coordinates - CoreCoord getPhysicalAddressFromVirtual(const IDevice* device, const CoreCoord& c) const; + CoreCoord getPhysicalAddressFromVirtual(int device_id, const CoreCoord& c) const; // Dumping profile result to file void logPacketData( - const IDevice* device, std::ofstream& log_file_ofs, nlohmann::ordered_json& noc_trace_json_log, uint32_t runID, @@ -92,7 +91,6 @@ class DeviceProfiler { // logs packet data to CSV file void logPacketDataToCSV( - const IDevice* device, std::ofstream& log_file_ofs, int device_id, int core_x, @@ -111,7 +109,6 @@ class DeviceProfiler { // dump noc trace related profile data to json file void logNocTracePacketDataToJson( - const IDevice* device, nlohmann::ordered_json& noc_trace_json_log, int device_id, int core_x, diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index ab9af3a1ed9..b551485fa1e 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -77,7 +77,7 @@ void DeviceProfiler::readRiscProfilerResults( }; // translate worker core virtual coord to phys coordinates - auto phys_coord = getPhysicalAddressFromVirtual(device, worker_core); + auto phys_coord = getPhysicalAddressFromVirtual(device_id, worker_core); int riscNum = 0; for (int riscEndIndex = 0; riscEndIndex < riscCount; riscEndIndex++) { @@ -166,7 +166,6 @@ void DeviceProfiler::readRiscProfilerResults( runCounterRead); logPacketData( - device, log_file_ofs, noc_trace_json_log, runCounterRead, @@ -187,7 +186,6 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_H = opTime_H; uint32_t time_L = opTime_L; logPacketData( - device, log_file_ofs, noc_trace_json_log, runCounterRead, @@ -210,7 +208,6 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t data_H = profile_buffer[index]; uint32_t data_L = profile_buffer[index + 1]; logPacketData( - device, log_file_ofs, noc_trace_json_log, runCounterRead, @@ -229,7 +226,6 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_H = profile_buffer[index] & 0xFFF; uint32_t time_L = profile_buffer[index + 1]; logPacketData( - device, log_file_ofs, noc_trace_json_log, runCounterRead, @@ -267,7 +263,6 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp) { } void DeviceProfiler::logPacketData( - const IDevice* device, std::ofstream& log_file_ofs, nlohmann::ordered_json& noc_trace_json_log, uint32_t run_id, @@ -325,7 +320,6 @@ void DeviceProfiler::logPacketData( firstTimestamp(timestamp); logPacketDataToCSV( - device, log_file_ofs, device_id, core.x, @@ -343,7 +337,6 @@ void DeviceProfiler::logPacketData( source_file); logNocTracePacketDataToJson( - device, noc_trace_json_log, device_id, core.x, @@ -362,7 +355,6 @@ void DeviceProfiler::logPacketData( } void DeviceProfiler::logPacketDataToCSV( - const IDevice* device, std::ofstream& log_file_ofs, int device_id, int core_x, @@ -397,7 +389,6 @@ void DeviceProfiler::logPacketDataToCSV( } void DeviceProfiler::logNocTracePacketDataToJson( - const IDevice* device, nlohmann::ordered_json& noc_trace_json_log, int device_id, int core_x, @@ -456,14 +447,15 @@ void DeviceProfiler::logNocTracePacketDataToJson( // DO NOT emit destination coord; it isn't meaningful } else if (ev_md.noc_xfer_type == KernelProfilerNocEventMetadata::NocEventType::WRITE_MULTICAST) { - auto phys_start_coord = getPhysicalAddressFromVirtual(device, {ev_md.dst_x, ev_md.dst_y}); + auto phys_start_coord = getPhysicalAddressFromVirtual(device_id, {ev_md.dst_x, ev_md.dst_y}); data["mcast_start_x"] = phys_start_coord.x; data["mcast_start_y"] = phys_start_coord.y; - auto phys_end_coord = getPhysicalAddressFromVirtual(device, {ev_md.mcast_end_dst_x, ev_md.mcast_end_dst_y}); + auto phys_end_coord = + getPhysicalAddressFromVirtual(device_id, {ev_md.mcast_end_dst_x, ev_md.mcast_end_dst_y}); data["mcast_end_x"] = phys_end_coord.x; data["mcast_end_y"] = phys_end_coord.y; } else { - auto phys_coord = getPhysicalAddressFromVirtual(device, {ev_md.dst_x, ev_md.dst_y}); + auto phys_coord = getPhysicalAddressFromVirtual(device_id, {ev_md.dst_x, ev_md.dst_y}); data["dx"] = phys_coord.x; data["dy"] = phys_coord.y; } @@ -553,12 +545,12 @@ void DeviceProfiler::serializeJsonNocTraces( } } -CoreCoord DeviceProfiler::getPhysicalAddressFromVirtual(const IDevice* device, const CoreCoord& c) const { +CoreCoord DeviceProfiler::getPhysicalAddressFromVirtual(int device_id, const CoreCoord& c) const { if (c.x >= hal.get_virtual_worker_start_x() && c.y >= hal.get_virtual_worker_start_y()) { auto logical_x = c.x - hal.get_virtual_worker_start_x(); auto logical_y = c.y - hal.get_virtual_worker_start_y(); - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device->id()); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); // if the core has an address in the 'virtual' space, it must be CoreType::WORKER return soc_desc.get_physical_core_from_logical_core({logical_x, logical_y}, CoreType::WORKER); } else { From 556813e6b216382423885b39d541abd91565fdce Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 7 Mar 2025 15:47:16 +0000 Subject: [PATCH 3/7] cache opname using {device_id,program_hash} within op_profiler --- ttnn/tools/profiler/op_profiler.hpp | 41 ++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/ttnn/tools/profiler/op_profiler.hpp b/ttnn/tools/profiler/op_profiler.hpp index 024e97387ab..8fc6b2b4873 100644 --- a/ttnn/tools/profiler/op_profiler.hpp +++ b/ttnn/tools/profiler/op_profiler.hpp @@ -148,6 +148,34 @@ class thread_safe_runtime_id_to_ops_map { inline thread_safe_runtime_id_to_ops_map runtime_id_to_opname{}; +class thread_safe_device_program_hash_to_opname_map { + using DEVICE_ID = uint32_t; + using PROGRAM_HASH = tt::stl::hash::hash_t; + using KEY_TYPE = std::pair; + using VAL_TYPE = std::string; + +public: + VAL_TYPE find_if_exists(const KEY_TYPE& key) { + std::scoped_lock lock(map_mutex); + auto it = map.find(key); + if (it != map.end()) { + return it->second; + } else { + return ""; + } + } + void emplace(const KEY_TYPE& key, VAL_TYPE&& opname) { + std::scoped_lock lock(map_mutex); + map.emplace(key, opname); + } + +private: + std::mutex map_mutex; + std::map map; +}; + +inline thread_safe_device_program_hash_to_opname_map device_and_program_hash_to_opname{}; + static void start_tracy_zone(const string& source, const string& functName, uint32_t lineNum, uint32_t color = 0) { #if defined(TRACY_ENABLE) auto tracySrcLoc = @@ -415,13 +443,6 @@ inline std::string op_meta_data_serialized_json( const bool useCachedOps = std::getenv("TT_METAL_PROFILER_NO_CACHE_OP_INFO") == nullptr; auto program_hash = compute_program_hash(operation_attributes, tensor_args); - auto as_string = [](std::string_view v) -> std::string { return {v.data(), v.size()}; }; - std::string opName = as_string(tt::stl::get_type_name()); - if constexpr (requires { device_operation_t::get_type_name(operation_attributes); }) { - opName = device_operation_t::get_type_name(operation_attributes); - } - runtime_id_to_opname.emplace({device_id, program.get_runtime_id()}, opName); - if (!useCachedOps || (cached_ops.find(device_id) == cached_ops.end()) || (cached_ops.at(device_id).find(program_hash) == cached_ops.at(device_id).end())) { auto j = @@ -431,6 +452,10 @@ inline std::string op_meta_data_serialized_json( j["op_hash"] = program_hash; j["kernel_info"] = get_kernels_json(device_id, program); + auto opname = j["op_code"].template get(); + runtime_id_to_opname.emplace({device_id, program.get_runtime_id()}, opname); + device_and_program_hash_to_opname.emplace({device_id, program_hash}, std::move(opname)); + j["optional_input_tensors"] = std::vector{}; auto perfModel = [&]() { @@ -459,6 +484,8 @@ inline std::string op_meta_data_serialized_json( std::string ser = j.dump(4); return fmt::format("{}{} ->\n{}`", short_str, operation_id, ser); } else { + auto opname = device_and_program_hash_to_opname.find_if_exists({device_id, program_hash}); + runtime_id_to_opname.emplace({device_id, program.get_runtime_id()}, std::move(opname)); return fmt::format("{}{}`", cached_ops.at(device_id).at(program_hash), operation_id); } } From b53dfc2b84fb064c5892d88e421ac9214a46b70f Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 7 Mar 2025 19:41:55 +0000 Subject: [PATCH 4/7] fix .clangd --- .clangd | 3 --- 1 file changed, 3 deletions(-) diff --git a/.clangd b/.clangd index d749f7fa16d..51066a4a270 100644 --- a/.clangd +++ b/.clangd @@ -2,6 +2,3 @@ CompileFlags: Add: - "std=c++20" - - "-DPROFILE_NOC_EVENTS=1" - - "-DPROFILE_KERNEL=1" - - "-DCOMPILE_FOR_BRISC=1" From 42bd078adfe706282d0225d4297b6534a100368c Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Fri, 7 Mar 2025 20:18:50 +0000 Subject: [PATCH 5/7] try removing dataflow_api.h include from tunneling.h --- tt_metal/hw/inc/ethernet/tunneling.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index e090a75cedb..2a77e284bdb 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -8,7 +8,6 @@ #include "erisc.h" #include "eth_l1_address_map.h" #include "noc_nonblocking_api.h" -#include "hw/inc/dataflow_api.h" inline void RISC_POST_STATUS(uint32_t status) { volatile uint32_t* ptr = (volatile uint32_t*)(NOC_CFG(ROUTER_CFG_2)); From 9ba8a49ac118a24c527792b7e4edfdd60119ff63 Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Sat, 8 Mar 2025 18:37:46 +0000 Subject: [PATCH 6/7] fix irregular spacing in dataflow_api.h --- tt_metal/hw/inc/dataflow_api.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index c3bd17e9fa0..99e761bfe01 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -501,7 +501,6 @@ inline void noc_async_read( Read requests - use static VC Read responses - assigned VCs dynamically */ - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ,src_noc_addr,size, -1); if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { @@ -522,7 +521,6 @@ void noc_async_read_one_packet_set_state(std::uint64_t src_noc_addr, std::uint32 Read requests - use static VC Read responses - assigned VCs dynamically */ - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_SET_STATE, src_noc_addr, size, -1); WAYPOINT("RP3W"); @@ -559,7 +557,6 @@ FORCE_INLINE void noc_async_read_one_packet_with_state( Read requests - use static VC Read responses - assigned VCs dynamically */ - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_WITH_STATE, static_cast(src_noc_addr), 0, -1); WAYPOINT("RP4W"); @@ -593,7 +590,6 @@ void noc_async_read_set_state(std::uint64_t src_noc_addr, uint8_t noc = noc_inde Read requests - use static VC Read responses - assigned VCs dynamically */ - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::READ_SET_STATE,src_noc_addr,0,-1); WAYPOINT("RP5W"); @@ -736,7 +732,6 @@ void noc_async_write_multicast_one_packet( bool linked = false, bool multicast_path_reserve = true, uint8_t noc = noc_index) { - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_MULTICAST,dst_noc_addr_multicast,size, NOC_MULTICAST_WRITE_VC); WAYPOINT("NWPW"); @@ -888,7 +883,6 @@ inline void noc_async_write( if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size, noc); } else { - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_, dst_noc_addr, size, NOC_UNICAST_WRITE_VC); WAYPOINT("NAWW"); @@ -1195,7 +1189,6 @@ inline void noc_async_write_multicast_exclude_region( * Return value: None */ void noc_async_read_barrier(uint8_t noc = noc_index) { - RECORD_NOC_EVENT(NocEventType::READ_BARRIER_START); WAYPOINT("NRBW"); @@ -1222,7 +1215,6 @@ void noc_async_read_barrier(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_write_barrier(uint8_t noc = noc_index) { - RECORD_NOC_EVENT(NocEventType::WRITE_BARRIER_START); WAYPOINT("NWBW"); @@ -1246,7 +1238,6 @@ void noc_async_write_barrier(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_writes_flushed(uint8_t noc = noc_index) { - RECORD_NOC_EVENT(NocEventType::WRITE_FLUSH); WAYPOINT("NWFW"); @@ -1290,7 +1281,6 @@ void noc_async_posted_writes_flushed(uint8_t noc = noc_index) { */ FORCE_INLINE void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { - RECORD_NOC_EVENT(NocEventType::ATOMIC_BARRIER); WAYPOINT("NABW"); @@ -1361,7 +1351,6 @@ void noc_async_full_barrier(uint8_t noc_idx = noc_index) { // clang-format on FORCE_INLINE void noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { - RECORD_NOC_EVENT(NocEventType::SEMAPHORE_WAIT); WAYPOINT("NSW"); @@ -1388,7 +1377,6 @@ void noc_semaphore_wait(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { // clang-format on FORCE_INLINE void noc_semaphore_wait_min(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { - RECORD_NOC_EVENT(NocEventType::SEMAPHORE_WAIT); WAYPOINT("NSMW"); @@ -1415,7 +1403,6 @@ void noc_semaphore_wait_min(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) // clang-format on FORCE_INLINE void noc_semaphore_set(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { - RECORD_NOC_EVENT(NocEventType::SEMAPHORE_SET); // set semaphore value to val @@ -1446,7 +1433,6 @@ void noc_semaphore_set(volatile tt_l1_ptr uint32_t* sem_addr, uint32_t val) { // clang-format on FORCE_INLINE void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t noc = noc_index) { - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::WRITE_INLINE, addr, 32, NOC_UNICAST_WRITE_VC); WAYPOINT("NWIW"); @@ -1485,7 +1471,6 @@ void noc_semaphore_inc(uint64_t addr, uint32_t incr, uint8_t noc_id = noc_index) [REFER TO grayskull/noc/noc.h for the documentation of noc_atomic_increment()] Generic increment with 32-bit wrap. */ - RECORD_NOC_EVENT_WITH_ADDR(NocEventType::SEMAPHORE_INC,addr,0, NOC_UNICAST_WRITE_VC); WAYPOINT("NSIW"); @@ -1583,7 +1568,6 @@ void noc_async_read_tile_dram_sharded_with_state_with_trid( FORCE_INLINE void noc_async_read_tile_dram_sharded_set_trid(uint32_t trid = 0, uint8_t noc = noc_index) { - RECORD_NOC_EVENT(NocEventType::READ_SET_TRID); WAYPOINT("NSTW"); From 42bc4d86a76b46dfd0fe0309ac9ddb7a0ef6b877 Mon Sep 17 00:00:00 2001 From: Brett Grady Date: Sat, 8 Mar 2025 18:45:44 +0000 Subject: [PATCH 7/7] use chip_id_t for device id within profiler --- tt_metal/api/tt-metalium/profiler.hpp | 12 +++++------- tt_metal/tools/profiler/profiler.cpp | 17 ++++++++--------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tt_metal/api/tt-metalium/profiler.hpp b/tt_metal/api/tt-metalium/profiler.hpp index 0fdd5e4a8a5..17ebb2a3c31 100644 --- a/tt_metal/api/tt-metalium/profiler.hpp +++ b/tt_metal/api/tt-metalium/profiler.hpp @@ -66,13 +66,13 @@ class DeviceProfiler { // serialize all noc trace data into per-op json trace files void serializeJsonNocTraces( - const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, int device_id); + const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, chip_id_t device_id); void emitCSVHeader( std::ofstream& log_file_ofs, const tt::ARCH& device_architecture, int device_core_frequency) const; // translates potentially-virtual coordinates recorded on Device into physical coordinates - CoreCoord getPhysicalAddressFromVirtual(int device_id, const CoreCoord& c) const; + CoreCoord getPhysicalAddressFromVirtual(chip_id_t device_id, const CoreCoord& c) const; // Dumping profile result to file void logPacketData( @@ -81,7 +81,7 @@ class DeviceProfiler { uint32_t runID, uint32_t runHostID, const std::string& opname, - int device_id, + chip_id_t device_id, CoreCoord core, int core_flat, int risc_num, @@ -92,7 +92,7 @@ class DeviceProfiler { // logs packet data to CSV file void logPacketDataToCSV( std::ofstream& log_file_ofs, - int device_id, + chip_id_t device_id, int core_x, int core_y, const std::string_view risc_name, @@ -110,7 +110,7 @@ class DeviceProfiler { // dump noc trace related profile data to json file void logNocTracePacketDataToJson( nlohmann::ordered_json& noc_trace_json_log, - int device_id, + chip_id_t device_id, int core_x, int core_y, const std::string_view risc_name, @@ -169,8 +169,6 @@ class DeviceProfiler { // frequency scale double freqScale = 1.0; - uint32_t my_device_id = 0; - // Freshen device logs void freshDeviceLog(); diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index b551485fa1e..6392a92c845 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -36,9 +36,8 @@ void DeviceProfiler::readRiscProfilerResults( std::ofstream& log_file_ofs, nlohmann::ordered_json& noc_trace_json_log) { ZoneScoped; - auto device_id = device->id(); + chip_id_t device_id = device->id(); - my_device_id = device_id; HalProgrammableCoreType CoreType; int riscCount; @@ -268,7 +267,7 @@ void DeviceProfiler::logPacketData( uint32_t run_id, uint32_t run_host_id, const std::string& opname, - int device_id, + chip_id_t device_id, CoreCoord core, int core_flat, int risc_num, @@ -356,7 +355,7 @@ void DeviceProfiler::logPacketData( void DeviceProfiler::logPacketDataToCSV( std::ofstream& log_file_ofs, - int device_id, + chip_id_t device_id, int core_x, int core_y, const std::string_view risc_name, @@ -390,7 +389,7 @@ void DeviceProfiler::logPacketDataToCSV( void DeviceProfiler::logNocTracePacketDataToJson( nlohmann::ordered_json& noc_trace_json_log, - int device_id, + chip_id_t device_id, int core_x, int core_y, const std::string_view risc_name, @@ -474,7 +473,7 @@ void DeviceProfiler::emitCSVHeader( } void DeviceProfiler::serializeJsonNocTraces( - const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, int device_id) { + const nlohmann::ordered_json& noc_trace_json_log, const std::filesystem::path& output_dir, chip_id_t device_id) { // create output directory if it does not exist std::filesystem::create_directories(output_dir); if (!std::filesystem::is_directory(output_dir)) { @@ -545,7 +544,7 @@ void DeviceProfiler::serializeJsonNocTraces( } } -CoreCoord DeviceProfiler::getPhysicalAddressFromVirtual(int device_id, const CoreCoord& c) const { +CoreCoord DeviceProfiler::getPhysicalAddressFromVirtual(chip_id_t device_id, const CoreCoord& c) const { if (c.x >= hal.get_virtual_worker_start_x() && c.y >= hal.get_virtual_worker_start_y()) { auto logical_x = c.x - hal.get_virtual_worker_start_x(); auto logical_y = c.y - hal.get_virtual_worker_start_y(); @@ -713,7 +712,7 @@ void DeviceProfiler::pushTracyDeviceResults() { static uint64_t cpuTime = 0; for (auto& device_core : device_cores) { - int device_id = device_core.first; + chip_id_t device_id = device_core.first; CoreCoord worker_core = device_core.second; if (device_core_sync_info.find(worker_core) != device_core_sync_info.end()) { @@ -730,7 +729,7 @@ void DeviceProfiler::pushTracyDeviceResults() { } for (auto& device_core : device_cores) { - int device_id = device_core.first; + chip_id_t device_id = device_core.first; CoreCoord worker_core = device_core.second; if (delay == 0.0 || frequency == 0.0) {