From a64f45913a4af7c70434cb51fe544dbe71cde224 Mon Sep 17 00:00:00 2001 From: Vinit1085 Date: Mon, 16 Dec 2024 18:44:03 +0530 Subject: [PATCH] EDGEML-8777 - [Linux NPU Driver]: Support firmware log buffer [Why] Log buffer support is required to enhance debugging support on NPU stack [How] 1. Allocate log buffer and share with firmware via 'start_event_trace' message Config start_event_trace request param, send via mgmt channel. Handle the send buffer resp. 2. Receive notifications about log buffer fullness, via interrupt from firmware. Handle channel interrupt, process further buffer data and prepare new buffer. 3. Add stop_event_trace_send api and it's handle to stop logging when aie2 shutdown. Signed-off-by: Vinit Shukla --- src/driver/amdxdna/Kbuild | 1 + src/driver/amdxdna/aie2_event_trace.c | 228 ++++++++++++++++++++ src/driver/amdxdna/aie2_message.c | 39 +++- src/driver/amdxdna/aie2_msg_priv.h | 50 +++++ src/driver/amdxdna/aie2_pci.c | 22 +- src/driver/amdxdna/aie2_pci.h | 12 ++ src/driver/amdxdna/amdxdna_mailbox.c | 15 +- src/driver/amdxdna/amdxdna_mailbox.h | 3 + src/driver/amdxdna/amdxdna_mailbox_helper.h | 17 ++ 9 files changed, 382 insertions(+), 5 deletions(-) create mode 100644 src/driver/amdxdna/aie2_event_trace.c diff --git a/src/driver/amdxdna/Kbuild b/src/driver/amdxdna/Kbuild index 31e4237..af17843 100644 --- a/src/driver/amdxdna/Kbuild +++ b/src/driver/amdxdna/Kbuild @@ -30,6 +30,7 @@ amdxdna-y := \ aie2_error.o \ aie2_debugfs.o \ aie2_message.o \ + aie2_event_trace.o \ aie2_pm.o \ aie2_pci.o \ npu1_regs.o \ diff --git a/src/driver/amdxdna/aie2_event_trace.c b/src/driver/amdxdna/aie2_event_trace.c new file mode 100644 index 0000000..0e03440 --- /dev/null +++ b/src/driver/amdxdna/aie2_event_trace.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025, Advanced Micro Devices, Inc. + */ + +#include +#include +#include +#include +#include "aie2_msg_priv.h" +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" + +#define LOG_BUFFER_IRQ 27 +#define MPIPU_IOHUB_INT_27_ALIAS 0xD7008 +#define LOG_BUFFER_MAILBOX MPIPU_IOHUB_INT_27_ALIAS + +uint8_t g_fwLogBuf[TRACE_EVENT_BUFFER_SIZE]; + +struct trace_event_metadata +{ + uint64_t tail_offset; + uint64_t head_offset; + uint32_t padding[12]; +}; + +struct trace_event_log_data +{ + uint64_t counter; + uint16_t payload_hi; + uint16_t type; + uint32_t payload_low; +}; + +struct event_trace_req_buf +{ + struct amdxdna_dev_hdl *ndev; + struct start_event_trace_req trace_req; + int log_ch_irq; + u8 *buf; +}; + +uint32_t get_trace_event_Content(struct event_trace_req_buf *trace_req_buf, uint8_t *kern_buf) +{ + u8 *sysBuf; + uint32_t rd_ptr, wr_ptr, wr_ptr_wrap; + uint32_t total_cnt = 0, cnt = 0; + uint32_t rb_size, offset = 0; + struct trace_event_metadata *trace_metadata; + + sysBuf = (u8 *)trace_req_buf->buf; + rb_size = TRACE_EVENT_BUFFER_SIZE - 64; + if (rb_size == 0) + { + return 0; + } + + trace_metadata = (struct trace_event_metadata *)(sysBuf + rb_size); + if (!trace_metadata) + { + printk(KERN_ERR "FW trace buffer metadata is null!"); + return 0; + } + + // Get the ring buffer read and write pointers, update the ring buffer content size + rd_ptr = (uint32_t)(trace_metadata->head_offset % rb_size); + wr_ptr = (uint32_t)(trace_metadata->tail_offset); + wr_ptr_wrap = wr_ptr % rb_size; + + do + { + if (wr_ptr_wrap > rd_ptr) + cnt = wr_ptr_wrap - rd_ptr; + else if (wr_ptr_wrap < rd_ptr) + cnt = rb_size - rd_ptr; + else + return 0; + + if (cnt > rb_size) + { + printk(KERN_ERR "counter > rb_size"); + ; + return 0; + } + // Copy the ring buffer content to the kernel buffer + memcpy(kern_buf + offset, (u8 *)(sysBuf + rd_ptr), cnt); + + offset += cnt; + total_cnt += cnt; + rd_ptr = (rd_ptr + cnt) % rb_size; + } while (rd_ptr < wr_ptr_wrap); // If the buffer is rolling over, continue to copy the content + + // Update the Ring Buffer read pointer + trace_metadata->head_offset = wr_ptr; + + return total_cnt; +} + +void print_trace_event_log(struct amdxdna_dev_hdl *ndev) +{ + uint32_t cnt; + uint64_t payload; + struct event_trace_req_buf *trace_req_buf; + struct trace_event_log_data *log_content; + trace_req_buf = ndev->event_trace_req; + + if (!trace_req_buf) + { + XDNA_ERR(ndev->xdna, "FW resp trace buffer is null!"); + return; + } + + cnt = get_trace_event_Content(trace_req_buf, g_fwLogBuf); + + if (cnt) + { + uint64_t fwTicks; + char *str = (char *)g_fwLogBuf; + char *end = ((char *)str + cnt); + g_fwLogBuf[cnt] = 0; + + // Print the raw counter value from NPU first + while (str < end) + { + log_content = (struct trace_event_log_data *)str; + payload = (uint64_t)((uint64_t)(log_content->payload_hi) << 32 | log_content->payload_low); + fwTicks = log_content->counter; + printk(KERN_ERR "[%llu][FW] type: 0x%04x payload:0x%016llx", fwTicks, log_content->type, payload); + str += MAX_ONE_TIME_LOG_INFO_LEN; + } + } +} + +static irqreturn_t log_buffer_irq_handler(int irq, void *data) +{ + struct amdxdna_dev_hdl *ndev = (struct amdxdna_dev_hdl *)data; + + if (!ndev) + { + printk(KERN_ERR "xdna dev is null !\n"); + return IRQ_NONE; + } + + print_trace_event_log(ndev); + return clear_logbuff_irq(irq, LOG_BUFFER_MAILBOX, ndev); +} + +int aie2_stop_event_trace_send(struct amdxdna_dev_hdl *ndev) +{ + struct amdxdna_dev *xdna = ndev->xdna; + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + return aie2_stop_event_trace(ndev); +} + +int aie2_event_trace_alloc(struct amdxdna_dev_hdl *ndev) +{ + int ret; + struct amdxdna_dev *xdna = ndev->xdna; + struct event_trace_req_buf *req_buf; + + req_buf = kzalloc(sizeof(struct event_trace_req_buf), GFP_KERNEL); + if (!req_buf) + return -ENOMEM; + + req_buf->buf = dma_alloc_noncoherent(xdna->ddev.dev, TRACE_EVENT_BUFFER_SIZE, (dma_addr_t *)&req_buf->trace_req.dram_buffer_address, + DMA_BIDIRECTIONAL, GFP_KERNEL); + if (!req_buf->buf) + { + ret = -ENOMEM; + goto free_event_trace_req_buf; + } + req_buf->trace_req.dram_buffer_size = TRACE_EVENT_BUFFER_SIZE; + ndev->event_trace_req = req_buf; + req_buf->ndev = ndev; + + req_buf->log_ch_irq = pci_irq_vector(to_pci_dev(xdna->ddev.dev), LOG_BUFFER_IRQ); + ret = request_irq(req_buf->log_ch_irq, log_buffer_irq_handler, 0, "LOG_BUFFER", ndev); + if (ret) + { + printk(KERN_INFO "Failed to request irq %d ret %d", LOG_BUFFER_IRQ, ret); + } + + XDNA_INFO(xdna, "Start event trace buf addr: 0x%llx size 0x%x", + req_buf->trace_req.dram_buffer_address, req_buf->trace_req.dram_buffer_size); + return 0; + +free_event_trace_req_buf: + kfree(req_buf); + return ret; +} + +void aie2_event_trace_free(struct amdxdna_dev_hdl *ndev) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct event_trace_req_buf *req_buf = ndev->event_trace_req; + + if (!req_buf) + return; + + dma_free_noncoherent(xdna->ddev.dev, req_buf->trace_req.dram_buffer_size, req_buf->buf, + (dma_addr_t)req_buf->trace_req.dram_buffer_address, DMA_BIDIRECTIONAL); + free_irq(req_buf->log_ch_irq, ndev); + kfree(req_buf); +} + +int aie2_start_event_trace_send(struct amdxdna_dev_hdl *ndev) +{ + int ret; + struct event_trace_req_buf *trace_req_buf = NULL; + struct amdxdna_dev *xdna = ndev->xdna; + + ret = aie2_event_trace_alloc(ndev); + + if (!ret) + { + trace_req_buf = ndev->event_trace_req; + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + drm_clflush_virt_range(trace_req_buf->buf, trace_req_buf->trace_req.dram_buffer_size); + ret = aie2_start_event_trace(ndev, trace_req_buf->trace_req.dram_buffer_address, + trace_req_buf->trace_req.dram_buffer_size, &trace_req_buf->trace_req); + } + else + { + XDNA_ERR(xdna, "Failed to allocate event trace buffer"); + } + + return ret; +} diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c index 89a62f6..d2bbf11 100644 --- a/src/driver/amdxdna/aie2_message.c +++ b/src/driver/amdxdna/aie2_message.c @@ -16,6 +16,9 @@ #define DECLARE_AIE2_MSG(name, op) \ DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) +#define DECLARE_AIE2_STOP_EVENT_TRACE_MSG(name, op) \ + DECLARE_XDNA_STOP_EVENT_TRACE_MSG(name, op, MAX_AIE2_STATUS_CODE) + #define aie2_send_mgmt_msg_wait(ndev, msg) \ aie2_send_mgmt_msg_wait_offset(ndev, msg, 0) @@ -64,8 +67,19 @@ aie2_send_mgmt_msg_wait_offset(struct amdxdna_dev_hdl *ndev, if (!ret && hdl->data[offset] != AIE2_STATUS_SUCCESS) { XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", - msg->opcode, *hdl->data); + msg->opcode, *hdl->data); ret = -EINVAL; + } else if (ret) { + XDNA_ERR(xdna, "Send message failed, ret %d", ret); + } else { + if (msg->opcode == MSG_OP_START_EVENT_TRACE) { + struct start_event_trace_resp *resp = (struct start_event_trace_resp *)hdl->data; + XDNA_INFO(xdna, "Start event trace status %d msi %u ts 0x%lld", + resp->status, resp->msi_idx, resp->current_timestamp); + } else if (msg->opcode == MSG_OP_STOP_EVENT_TRACE) { + struct stop_event_trace_resp *resp = (struct stop_event_trace_resp *)hdl->data; + XDNA_INFO(xdna, "Stop event trace status %d \n", resp->status); + } } return ret; @@ -255,6 +269,29 @@ int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev, return 0; } +int aie2_start_event_trace(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, + u32 size, void *handle) +{ + DECLARE_AIE2_MSG(start_event_trace, MSG_OP_START_EVENT_TRACE); + + req.dram_buffer_address = addr; + req.dram_buffer_size = size; + req.event_trace_dest = EVENT_TRACE_DEST_DRAM; + req.event_trace_categories = 0xFFFFFFFF; + req.event_trace_timestamp = EVENT_TRACE_TIMESTAMP_CPU_CCOUNT; + + XDNA_INFO(ndev->xdna, "send start event trace msg"); + return aie2_send_mgmt_msg_wait(ndev, &msg); +} + +int aie2_stop_event_trace(struct amdxdna_dev_hdl *ndev) +{ + DECLARE_AIE2_STOP_EVENT_TRACE_MSG(stop_event_trace, MSG_OP_STOP_EVENT_TRACE); + + XDNA_INFO(ndev->xdna, "send stop event trace msg"); + return aie2_send_mgmt_msg_wait(ndev, &msg); +} + int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx) { DECLARE_AIE2_MSG(create_ctx, MSG_OP_CREATE_CONTEXT); diff --git a/src/driver/amdxdna/aie2_msg_priv.h b/src/driver/amdxdna/aie2_msg_priv.h index b56a8f0..5f768d5 100644 --- a/src/driver/amdxdna/aie2_msg_priv.h +++ b/src/driver/amdxdna/aie2_msg_priv.h @@ -40,6 +40,8 @@ enum aie2_msg_opcode { MSG_OP_SET_RUNTIME_CONFIG = 0x10A, MSG_OP_GET_RUNTIME_CONFIG = 0x10B, MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C, + MSG_OP_START_EVENT_TRACE = 0x10F, + MSG_OP_STOP_EVENT_TRACE = 0x110, MSG_OP_MAX_DRV_OPCODE, MSG_OP_GET_PROTOCOL_VERSION = 0x301, MSG_OP_MAX_OPCODE @@ -387,6 +389,54 @@ struct async_event_msg_resp { enum async_event_type type; } __packed; +/***************Start of event tracing data struct ****************/ +#define TRACE_EVENT_BUFFER_SIZE 0x2000 +#define MAX_ONE_TIME_LOG_INFO_LEN 16 + +enum event_trace_destination { + EVENT_TRACE_DEST_DEBUG_BUS, + EVENT_TRACE_DEST_DRAM, + EVENT_TRACE_DEST_COUNT +}; + +enum event_trace_timestamp { + EVENT_TRACE_TIMESTAMP_FW_CHRONO, + EVENT_TRACE_TIMESTAMP_CPU_CCOUNT, + EVENT_TRACE_TIMESTAMP_COUNT +}; + +struct start_event_trace_req { + uint32_t event_trace_categories; + enum event_trace_destination event_trace_dest; + enum event_trace_timestamp event_trace_timestamp; + //DRAM log buffer address and size + uint64_t dram_buffer_address; + uint32_t dram_buffer_size; +} __packed; + +struct start_event_trace_resp { + enum aie2_msg_status status; + uint32_t msi_idx; + uint64_t current_timestamp; +} __packed; + +struct stop_event_trace_req { + uint32_t :32; +} __packed; + +struct stop_event_trace_resp { + enum aie2_msg_status status; +} __packed; + +struct set_event_trace_categories_req { + uint32_t event_trace_categories; +}; + +struct set_event_trace_categories_resp { + enum aie2_msg_status status; +}; +/***************End of event tracing data structs ****************/ + #define MAX_CHAIN_CMDBUF_SIZE 0x1000 #define slot_cf_has_space(offset, payload_size) \ (MAX_CHAIN_CMDBUF_SIZE - ((offset) + (payload_size)) > \ diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c index 39f6b86..e3ea488 100644 --- a/src/driver/amdxdna/aie2_pci.c +++ b/src/driver/amdxdna/aie2_pci.c @@ -313,9 +313,11 @@ static int aie2_mgmt_fw_query(struct amdxdna_dev_hdl *ndev) static void aie2_mgmt_fw_fini(struct amdxdna_dev_hdl *ndev) { - if (aie2_suspend_fw(ndev)) + if (aie2_suspend_fw(ndev)) { XDNA_ERR(ndev->xdna, "suspend_fw failed"); - XDNA_DBG(ndev->xdna, "npu firmware suspended"); + } else { + XDNA_DBG(ndev->xdna, "npu firmware suspended"); + } } static int aie2_xrs_set_dft_dpm_level(struct drm_device *ddev, u32 dpm_level) @@ -615,6 +617,14 @@ static int aie2_init(struct amdxdna_dev *xdna) goto async_event_free; } + ret = aie2_start_event_trace_send(ndev); + if(ret) { + XDNA_ERR(xdna, "Send start event trace failed, ret %d", ret); + goto event_trace_free; + } else { + XDNA_INFO(xdna, "vs- Send start event trace success: %d", ret); + } + /* Just to make sure firmware handled async events */ ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); if (ret) { @@ -627,6 +637,8 @@ static int aie2_init(struct amdxdna_dev *xdna) async_event_free: aie2_error_async_events_free(ndev); +event_trace_free: + aie2_event_trace_free(ndev); stop_hw: aie2_hw_stop(xdna); disable_sva: @@ -644,7 +656,13 @@ static void aie2_fini(struct amdxdna_dev *xdna) struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); struct amdxdna_dev_hdl *ndev = xdna->dev_handle; + if (ndev && ndev->dev_status >= AIE2_DEV_START) + { + XDNA_ERR(xdna, "vs- send stop event trace \n"); + aie2_stop_event_trace_send(ndev); + } aie2_hw_stop(xdna); + aie2_event_trace_free(ndev); aie2_error_async_events_free(ndev); #ifdef AMDXDNA_DEVEL if (iommu_mode != AMDXDNA_IOMMU_PASID) diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h index 164ed23..93b5c40 100644 --- a/src/driver/amdxdna/aie2_pci.h +++ b/src/driver/amdxdna/aie2_pci.h @@ -249,6 +249,7 @@ struct amdxdna_dev_hdl { struct mailbox *mbox; struct mailbox_channel *mgmt_chann; struct async_events *async_events; + struct event_trace_req_buf *event_trace_req; u32 dev_status; u32 hwctx_num; @@ -342,6 +343,17 @@ void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev); int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev); int aie2_error_async_msg_thread(void *data); +/* aie2_event.c */ +int aie2_event_trace_alloc(struct amdxdna_dev_hdl *ndev); +void aie2_event_trace_free(struct amdxdna_dev_hdl *ndev); +int aie2_stop_event_trace_send(struct amdxdna_dev_hdl *ndev); +int aie2_start_event_trace_send(struct amdxdna_dev_hdl *ndev); +int aie2_start_event_trace(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, + u32 size, void *handle); +int aie2_stop_event_trace(struct amdxdna_dev_hdl *ndev); +void print_trace_event_log(struct amdxdna_dev_hdl * ndev); +uint32_t get_trace_event_Content(struct event_trace_req_buf *trace_req_buf, uint8_t *kern_buf); + /* aie2_message.c */ int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev); int aie2_resume_fw(struct amdxdna_dev_hdl *ndev); diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c index 66f9dc5..ad5f9e0 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.c +++ b/src/driver/amdxdna/amdxdna_mailbox.c @@ -25,6 +25,7 @@ #include #include "amdxdna_trace.h" #include "amdxdna_mailbox.h" +#include "aie2_pci.h" #define MB_ERR(chann, fmt, args...) \ ({ \ @@ -343,8 +344,6 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade return -EINVAL; } - MB_DBG(mb_chann, "opcode 0x%x size %d id 0x%x", - header->opcode, header->total_size, header->id); ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); if (unlikely(ret)) MB_ERR(mb_chann, "Size %d opcode 0x%x ret %d", @@ -493,6 +492,18 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p) return IRQ_HANDLED; } +irqreturn_t clear_logbuff_irq(int irq, uint32_t mailbox_int_addr, struct amdxdna_dev_hdl *ndev) +{ + struct mailbox_channel *mb_chann = ndev->mgmt_chann; + trace_mbox_irq_handle("LOG_BUFFER", irq); + if (mb_chann->type == MB_CHANNEL_USER_POLL) + return IRQ_HANDLED; + + /* Clear IOHUB register */ + mailbox_reg_write(mb_chann, mailbox_int_addr, 0); + return IRQ_HANDLED; +} + #ifdef AMDXDNA_DEVEL static void mailbox_timer(struct timer_list *t) { diff --git a/src/driver/amdxdna/amdxdna_mailbox.h b/src/driver/amdxdna/amdxdna_mailbox.h index 3b68cb9..9e4cd38 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.h +++ b/src/driver/amdxdna/amdxdna_mailbox.h @@ -8,6 +8,7 @@ struct mailbox; struct mailbox_channel; +struct amdxdna_dev_hdl; /* * xdna_mailbox_msg - message struct @@ -133,6 +134,8 @@ void xdna_mailbox_stop_channel(struct mailbox_channel *mailbox_chann); int xdna_mailbox_send_msg(struct mailbox_channel *mailbox_chann, struct xdna_mailbox_msg *msg, u64 tx_timeout); +irqreturn_t clear_logbuff_irq(int irq, uint32_t mailbox_int_addr, struct amdxdna_dev_hdl *ndev); + #if defined(CONFIG_DEBUG_FS) /* * xdna_mailbox_info_show() -- Show mailbox info for debug diff --git a/src/driver/amdxdna/amdxdna_mailbox_helper.h b/src/driver/amdxdna/amdxdna_mailbox_helper.h index ad1848b..1350eb2 100644 --- a/src/driver/amdxdna/amdxdna_mailbox_helper.h +++ b/src/driver/amdxdna/amdxdna_mailbox_helper.h @@ -37,6 +37,23 @@ struct xdna_notify { .notify_cb = xdna_msg_cb, \ } +#define DECLARE_XDNA_STOP_EVENT_TRACE_MSG(name, op, status) \ + struct name##_req req = {}; \ + struct name##_resp resp = { status }; \ + struct xdna_notify hdl = { \ + .error = 0, \ + .data = (u32 *)&resp, \ + .size = sizeof(resp), \ + .comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp), \ + }; \ + struct xdna_mailbox_msg msg = { \ + .send_data = (u8 *)&req, \ + .send_size = sizeof(req), \ + .handle = &hdl, \ + .opcode = op, \ + .notify_cb = xdna_msg_cb, \ + } + #define XDNA_STATUS_OFFSET(name) (offsetof(struct name##_resp, status) / sizeof(u32)) int xdna_msg_cb(void *handle, const u32 *data, size_t size);