From 3027b5afc13b3b0ba702128fe0d9ce1a6171b792 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 18 Mar 2024 23:43:44 -0700 Subject: [PATCH] !wip feat(tracing): add nvtx provider Hook nvtx_push()/nvtx_pop() on existing lttng macros. WIP: Currently, this builds correctly but is untested. Will remain WIP until we figure out how to structure this in a way that aligns the required usages of NCCL_OFI_TRACE_POP() for nvtx with cases like NCCL_OFI_TRACE_SEND_WRITE_SEG_COMPLETE/START. Probably, we should just redo all the lttng macros so that they all wrap a workload, rather than today where the majority just signal that an event took place. I would also like to support a separate coarse-grained type of probe definition within this module. lttng and nvtx are best-suited for fine-grained/range-based eventing around program behavior (not quite what we have today, but where we want to get: things like wrapping an entire event and/or supplying rich metadata around that event.) For this, we need to support: 1. NVTX because of the ecosystem this plugin exists in. 2. Something that's cheaper than userspace uprobe (see bpftime below) and in-process or nearly so. Some candidates: perfetto, redoing the existing lttng support, etc. Separate from this, we should also support builds with coarse entry/exit USDT probes for basically all nontrivial functions. This can be a lot more useful for development and for building debug tools. Some tooling that this would enable: + very generic and allows for cross-dependency analysis + https://github.com/eunomia-bpf/bpftime + bpftrace or bcc makes this cheap + certain `linux perf` calls can benefit from this. + potential to profile kernel via kprobes in the same script. + offcpu analysis These are just nop sleds and have zero runtime overhead, so they can be enabled on default/release builds. (See: [1] for how others use this) It's surprisingly difficult to do this in a way that does not require code changes. 
Can potentially do this with a small out-of-tree llvm pass (and/or a gcc equivalent, see "gcc python plugin" on github) that piggy-backs on -finstrument-functions's __cyg_profile_func_exit and __cyg_profile_func_enter calls. Putting the USDT probe in the __cyg_profile_func_exit impl itself is not viable. Need to dig more. [1]: https://www.brendangregg.com/Slides/reInvent2019_BPF_Performance_Analysis/ --- include/Makefile.am | 1 + include/nccl_ofi_tracepoint.h | 14 ++++++++++++++ include/tracing_impl/nvtx.h | 25 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 include/tracing_impl/nvtx.h diff --git a/include/Makefile.am b/include/Makefile.am index 687a16e9f..3262d2c8d 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -27,6 +27,7 @@ noinst_HEADERS = \ nccl_ofi_ofiutils.h \ nccl_ofi_tracepoint.h \ tracing_impl/lttng.h \ + tracing_impl/nvtx.h \ nccl-headers/net.h \ nccl-headers/error.h \ nccl-headers/nvidia/err.h \ diff --git a/include/nccl_ofi_tracepoint.h b/include/nccl_ofi_tracepoint.h index 17f569cab..511354c98 100644 --- a/include/nccl_ofi_tracepoint.h +++ b/include/nccl_ofi_tracepoint.h @@ -6,55 +6,69 @@ #pragma once #include "config.h" +#include "tracing_impl/nvtx.h" #include "tracing_impl/lttng.h" #define NCCL_OFI_TRACE_SEND(dev, size, comm, msg_seq_num, request, nccl_req) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Send, dev, size, comm, msg_seq_num, request, nccl_req); \ + nvtx_push("Send"); \ } while(0) #define NCCL_OFI_TRACE_SEND_CTRL_RECV(dev, rail_id, comm, msg_seq_num) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Send_ctrl_recv, dev, rail_id, comm, msg_seq_num); \ + nvtx_push("Send_ctrl_recv"); \ } while (0) #define NCCL_OFI_TRACE_SEND_WRITE_SEG_START(dev, rail_id, size, comm, msg_seq_num, request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_start, dev, rail_id, size, comm, msg_seq_num, request); \ + nvtx_push("Send_write_segment_start"); \ } while(0) #define
NCCL_OFI_TRACE_SEND_WRITE_SEG_COMPLETE(dev, rail_id, comm, msg_seq_num, request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Send_write_segment_complete, dev, rail_id, comm, msg_seq_num, request); \ + nvtx_push("Send_write_segment_complete"); \ } while(0) #define NCCL_OFI_TRACE_RECV(dev, tag, size, request, nccl_req) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Recv, dev, tag, size, request, nccl_req); \ + nvtx_push("Recv"); \ } while(0) #define NCCL_OFI_TRACE_RECV_CTRL_SEND_COMPLETE(request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Recv_ctrl_send_complete, request); \ + nvtx_push("Recv_ctrl_send_complete"); \ } while(0) #define NCCL_OFI_TRACE_RECV_SEGMENT_COMPLETE(dev, rail_id, size, request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Recv_segment_complete, dev, rail_id, size, request); \ + nvtx_push("Recv_segment_complete"); \ } while(0) #define NCCL_OFI_TRACE_EAGER_RECV(dev, rail_id, comm, msg_seq_num) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Eager_recv, dev, rail_id, comm, msg_seq_num); \ + nvtx_push("Eager_recv"); \ } while(0) #define NCCL_OFI_TRACE_COMPLETIONS(request,ctx) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, ProcessCompletions, request,ctx); \ + nvtx_push("ProcessCompletions"); \ } while(0) #define NCCL_OFI_TRACE_FLUSH(request, nccl_req) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Flush, request, nccl_req); \ + nvtx_push("Flush"); \ } while(0) #define NCCL_OFI_TRACE_PENDING_INSERT(request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_insert, request); \ + nvtx_push("Pending_queue_insert"); \ } while(0) #define NCCL_OFI_TRACE_PENDING_REMOVE(request) do { \ lttng_ust_tracepoint(nccl_ofi_plugin, Pending_queue_remove, request); \ + nvtx_push("Pending_queue_remove"); \ } while(0) #define NCCL_OFI_TRACE_POP(...)
do { \ + nvtx_pop(); \ } while(0) diff --git a/include/tracing_impl/nvtx.h b/include/tracing_impl/nvtx.h new file mode 100644 index 000000000..5f8680bec --- /dev/null +++ b/include/tracing_impl/nvtx.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2022-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + */ + +#pragma once +#if HAVE_NVTX_TRACING +#include "nvToolsExt.h" +static inline void nvtx_push(const char* name) { /* pushes an NVTX range whose ASCII message is `name`, via nvtxRangePushEx() */ + const nvtxEventAttributes_t eventAttrib = { + .version = NVTX_VERSION, + .size = NVTX_EVENT_ATTRIB_STRUCT_SIZE, + .colorType = NVTX_COLOR_ARGB, + .color = 0xffeb9234, /* ARGB: top byte is alpha (0xff), then RGB eb/92/34 */ + .messageType = NVTX_MESSAGE_TYPE_ASCII, + .message = { .ascii = name }, + }; + nvtxRangePushEx(&eventAttrib); +} +static inline void nvtx_pop(void) { /* forwards to nvtxRangePop(); each call is expected to pair with an earlier nvtx_push() */ + nvtxRangePop(); +} +#else /* HAVE_NVTX_TRACING is zero or undefined: no-op stubs so the tracepoint macros still compile */ +static inline void nvtx_push(const char* name){ (void)name; } +static inline void nvtx_pop(void){} +#endif