From 0bce5f92fb51cf7d234557a0f81036d4fe63e877 Mon Sep 17 00:00:00 2001 From: William Easton Date: Thu, 13 Feb 2025 15:57:15 +0100 Subject: [PATCH 1/5] New branch for Nvidia GPU Integration --- packages/nvidia_gpu/LICENSE.txt | 93 + packages/nvidia_gpu/_dev/build/docs/README.md | 26 + packages/nvidia_gpu/changelog.yml | 6 + .../stats/agent/stream/stream.yml.hbs | 27 + .../elasticsearch/ingest_pipeline/default.yml | 189 + .../data_stream/stats/fields/base-fields.yml | 12 + .../data_stream/stats/fields/fields.yml | 269 ++ .../nvidia_gpu/data_stream/stats/manifest.yml | 68 + .../data_stream/stats/sample_event.json | 64 + packages/nvidia_gpu/docs/README.md | 144 + packages/nvidia_gpu/img/nvidia_logo.svg | 32 + ...-a3a5759a-1b3d-456d-8ab8-83e97f774030.json | 3521 +++++++++++++++ ...-bac121fe-6cd9-48a7-a349-f52ffa42d56b.json | 3933 +++++++++++++++++ ...-bc9ea2cd-816a-40a9-912e-a97c5781ba0c.json | 14 + packages/nvidia_gpu/manifest.yml | 34 + 15 files changed, 8432 insertions(+) create mode 100644 packages/nvidia_gpu/LICENSE.txt create mode 100644 packages/nvidia_gpu/_dev/build/docs/README.md create mode 100644 packages/nvidia_gpu/changelog.yml create mode 100644 packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs create mode 100644 packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml create mode 100644 packages/nvidia_gpu/data_stream/stats/fields/base-fields.yml create mode 100644 packages/nvidia_gpu/data_stream/stats/fields/fields.yml create mode 100644 packages/nvidia_gpu/data_stream/stats/manifest.yml create mode 100644 packages/nvidia_gpu/data_stream/stats/sample_event.json create mode 100644 packages/nvidia_gpu/docs/README.md create mode 100644 packages/nvidia_gpu/img/nvidia_logo.svg create mode 100644 packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030.json create mode 100644 packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b.json create mode 100644 
packages/nvidia_gpu/kibana/tag/nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c.json create mode 100644 packages/nvidia_gpu/manifest.yml diff --git a/packages/nvidia_gpu/LICENSE.txt b/packages/nvidia_gpu/LICENSE.txt new file mode 100644 index 00000000000..809108b857f --- /dev/null +++ b/packages/nvidia_gpu/LICENSE.txt @@ -0,0 +1,93 @@ +Elastic License 2.0 + +URL: https://www.elastic.co/licensing/elastic-license + +## Acceptance + +By using the software, you agree to all of the terms and conditions below. + +## Copyright License + +The licensor grants you a non-exclusive, royalty-free, worldwide, +non-sublicensable, non-transferable license to use, copy, distribute, make +available, and prepare derivative works of the software, in each case subject to +the limitations and conditions below. + +## Limitations + +You may not provide the software to third parties as a hosted or managed +service, where the service provides users with access to any substantial set of +the features or functionality of the software. + +You may not move, change, disable, or circumvent the license key functionality +in the software, and you may not remove or obscure any functionality in the +software that is protected by the license key. + +You may not alter, remove, or obscure any licensing, copyright, or other notices +of the licensor in the software. Any use of the licensor’s trademarks is subject +to applicable law. + +## Patents + +The licensor grants you a license, under any patent claims the licensor can +license, or becomes able to license, to make, have made, use, sell, offer for +sale, import and have imported the software, in each case subject to the +limitations and conditions in this license. This license does not cover any +patent claims that you cause to be infringed by modifications or additions to +the software. 
If you or your company make any written claim that the software +infringes or contributes to infringement of any patent, your patent license for +the software granted under these terms ends immediately. If your company makes +such a claim, your patent license ends immediately for work on behalf of your +company. + +## Notices + +You must ensure that anyone who gets a copy of any part of the software from you +also gets a copy of these terms. + +If you modify the software, you must include in any modified copies of the +software prominent notices stating that you have modified the software. + +## No Other Rights + +These terms do not imply any licenses other than those expressly granted in +these terms. + +## Termination + +If you use the software in violation of these terms, such use is not licensed, +and your licenses will automatically terminate. If the licensor provides you +with a notice of your violation, and you cease all violation of this license no +later than 30 days after you receive that notice, your licenses will be +reinstated retroactively. However, if you violate these terms after such +reinstatement, any additional violation of these terms will cause your licenses +to terminate automatically and permanently. + +## No Liability + +*As far as the law allows, the software comes as is, without any warranty or +condition, and the licensor will not be liable to you for any damages arising +out of these terms or the use or nature of the software, under any kind of +legal claim.* + +## Definitions + +The **licensor** is the entity offering these terms, and the **software** is the +software the licensor makes available under these terms, including any portion +of it. + +**you** refers to the individual or entity agreeing to these terms. 
+ +**your company** is any legal entity, sole proprietorship, or other kind of +organization that you work for, plus all organizations that have control over, +are under the control of, or are under common control with that +organization. **control** means ownership of substantially all the assets of an +entity, or the power to direct its management and policies by vote, contract, or +otherwise. Control can be direct or indirect. + +**your licenses** are all the licenses granted to you for the software under +these terms. + +**use** means anything you do with the software requiring one of your licenses. + +**trademark** means trademarks, service marks, and similar rights. diff --git a/packages/nvidia_gpu/_dev/build/docs/README.md b/packages/nvidia_gpu/_dev/build/docs/README.md new file mode 100644 index 00000000000..e9abc7bd300 --- /dev/null +++ b/packages/nvidia_gpu/_dev/build/docs/README.md @@ -0,0 +1,26 @@ +# Nvidia GPU Monitoring + +Use the NVIDIA GPU Monitoring integration to monitor the health and performance of your NVIDIA GPUs. The integration collects metrics from the NVIDIA Datacenter GPU Manager and sends them to Elasticsearch. + +## Data streams + +The **stats** data stream gives you insight into the state of the NVIDIA GPUs. +Metric data streams collected by the NVIDIA GPU Monitoring integration include `stats`. See the exported fields reference at the end of this page for details. + +## Requirements + +You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. +You can use our hosted Elasticsearch Service on Elastic Cloud, which is recommended, or self-manage the Elastic Stack on your own hardware. + +You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a Docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default, the DCGM exporter does not expose all available metrics. 
+ +## Setup + +For step-by-step instructions on how to set up an integration, see the +[Getting started](https://www.elastic.co/guide/en/welcome-to-elastic/current/getting-started-observability.html) guide. + +When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. + + +{{event "stats"}} +{{fields "stats"}} \ No newline at end of file diff --git a/packages/nvidia_gpu/changelog.yml b/packages/nvidia_gpu/changelog.yml new file mode 100644 index 00000000000..0e2bf4ddc56 --- /dev/null +++ b/packages/nvidia_gpu/changelog.yml @@ -0,0 +1,6 @@ +# newer versions go on top +- version: "0.1.0" + changes: + - description: Initial introduction of Nvidia GPU Monitoring + type: enhancement + link: https://github.com/elastic/integrations/pull/11931 diff --git a/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs b/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs new file mode 100644 index 00000000000..ea3337eace3 --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs @@ -0,0 +1,27 @@ +hosts: +{{#each hosts}} + - {{this}} +{{/each}} +period: {{period}} +use_types: true +rate_counters: true # required: the ingest pipeline renames prometheus.*.rate fields and then removes their parent objects +username: {{username}} +password: {{password}} +metrics_filters.exclude: +{{#each metrics_filters.exclude}} + - {{this}} +{{/each}} +metrics_filters.include: +{{#each metrics_filters.include}} + - {{this}} +{{/each}} +{{#if ssl.certificate_authorities}} +ssl.certificate_authorities: +{{#each ssl.certificate_authorities}} + - {{this}} +{{/each}} +{{/if}} +{{#if processors}} +processors: +{{processors}} +{{/if}} diff --git a/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml b/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml new file mode 100644 index 00000000000..158d45cb96c --- /dev/null +++ 
b/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml @@ -0,0 +1,189 @@ +--- +description: Pipeline for NVIDIA GPU Metrics +processors: +- rename: + field: prometheus.DCGM_FI_DEV_MEM_COPY_UTIL.value + target_field: gpu.memory.copy_utilization + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_FB_USED.value + target_field: gpu.framebuffer.size.used + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_FB_FREE.value + target_field: gpu.framebuffer.size.free + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_POWER_USAGE.value + target_field: gpu.power.usage + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_SM_CLOCK.value + target_field: gpu.streaming_multiprocessor.frequency + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_ENC_UTIL.value + target_field: gpu.encoder.utilization + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_DEC_UTIL.value + target_field: gpu.decoder.utilization + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_GPU_TEMP.value + target_field: gpu.temperature + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_VGPU_LICENSE_STATUS.value + target_field: gpu.license.vgpu + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_MEM_CLOCK.value + target_field: gpu.memory.frequency + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION.counter + target_field: gpu.energy.total + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL.counter + target_field: gpu.nvlink.bandwidth.total + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_GPU_UTIL.value + target_field: gpu.utilization + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_MEMORY_TEMP.value + target_field: gpu.memory.temperature + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_PCIE_REPLAY_COUNTER.rate + target_field: gpu.pcie.replay + 
ignore_missing: true +- rename: + field: prometheus.labels.modelName + target_field: gpu.device.model + ignore_missing: true +- rename: + field: prometheus.labels.instance + target_field: prometheus.node.name + ignore_missing: true +- rename: + field: prometheus.labels.pci_bus_id + target_field: gpu.pci.bus.id + ignore_missing: true +- rename: + field: prometheus.labels.Hostname + target_field: prometheus.node.hostname + ignore_missing: true +- rename: + field: prometheus.labels.job + target_field: prometheus.node.job + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_DRIVER_VERSION + target_field: gpu.driver.version + ignore_missing: true +- rename: + field: prometheus.labels.UUID + target_field: gpu.device.uuid + ignore_missing: true +- rename: + field: prometheus.labels.device + target_field: gpu.device.name + ignore_missing: true +- rename: + field: prometheus.labels.gpu + target_field: gpu.device.id + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_XID_ERRORS.value + target_field: gpu.error.xid + ignore_missing: true +- rename: + field: prometheus.labels.err_code + target_field: gpu.error.code + ignore_missing: true +- rename: + field: prometheus.labels.err_msg + target_field: gpu.error.message + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL.rate + target_field: gpu.memory.errors.double_bit_persistent + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL.rate + target_field: gpu.memory.errors.single_bit_persistent + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL.rate + target_field: gpu.memory.errors.double_bit_volatile + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL.rate + target_field: gpu.memory.errors.single_bit_volatile + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_SYNC_BOOST_VIOLATION.rate + target_field: gpu.throttling.sync_boost + ignore_missing: true +- rename: + 
field: prometheus.DCGM_FI_DEV_THERMAL_VIOLATION.rate + target_field: gpu.throttling.thermal + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_LOW_UTIL_VIOLATION.rate + target_field: gpu.throttling.low_utilization + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_BOARD_LIMIT_VIOLATION.rate + target_field: gpu.throttling.board_limit + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_POWER_VIOLATION.rate + target_field: gpu.throttling.power + ignore_missing: true +- rename: + field: prometheus.DCGM_FI_DEV_RELIABILITY_VIOLATION.rate + target_field: gpu.throttling.reliability + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_NVML_VERSION + target_field: gpu.driver.nvml_version + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_DEV_OEM_INFOROM_VER + target_field: gpu.device.info_rom.oem_version + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_DEV_INFOROM_IMAGE_VER + target_field: gpu.device.info_rom.version + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_DEV_VBIOS_VERSION + target_field: gpu.device.vbios.version + ignore_missing: true +- rename: + field: prometheus.labels.DCGM_FI_DEV_BRAND + target_field: gpu.device.brand + ignore_missing: true + +- remove: + field: + - "prometheus.DCGM_FI_DEV_PCIE_REPLAY_COUNTER" + - "prometheus.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL" + - "prometheus.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL" + - "prometheus.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL" + - "prometheus.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL" + - "prometheus.DCGM_FI_DEV_SYNC_BOOST_VIOLATION" + - "prometheus.DCGM_FI_DEV_THERMAL_VIOLATION" + - "prometheus.DCGM_FI_DEV_LOW_UTIL_VIOLATION" + - "prometheus.DCGM_FI_DEV_BOARD_LIMIT_VIOLATION" + - "prometheus.DCGM_FI_DEV_POWER_VIOLATION" + - "prometheus.DCGM_FI_DEV_RELIABILITY_VIOLATION" + - "prometheus.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION" + - "prometheus.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL" + ignore_missing: true + ignore_failure: true \ No newline 
at end of file diff --git a/packages/nvidia_gpu/data_stream/stats/fields/base-fields.yml b/packages/nvidia_gpu/data_stream/stats/fields/base-fields.yml new file mode 100644 index 00000000000..7c798f4534c --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/fields/base-fields.yml @@ -0,0 +1,12 @@ +- name: data_stream.type + type: constant_keyword + description: Data stream type. +- name: data_stream.dataset + type: constant_keyword + description: Data stream dataset. +- name: data_stream.namespace + type: constant_keyword + description: Data stream namespace. +- name: '@timestamp' + type: date + description: Event timestamp. diff --git a/packages/nvidia_gpu/data_stream/stats/fields/fields.yml b/packages/nvidia_gpu/data_stream/stats/fields/fields.yml new file mode 100644 index 00000000000..9999108ee60 --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/fields/fields.yml @@ -0,0 +1,269 @@ +--- +- name: gpu + type: group + description: > + Fields related to NVIDIA GPUs. + + fields: + - name: decoder.utilization + type: float + description: > + Utilization of the decoder engine in the GPU. + + - name: device + type: group + fields: + - name: brand + type: keyword + description: > + Brand of the GPU device. + + - name: id + type: keyword + description: > + ID of the GPU device. + + dimension: true + - name: info_rom.oem_version + type: keyword + description: > + OEM version of the info ROM. + + - name: info_rom.version + type: keyword + description: > + Version of the info ROM. + + - name: model + type: keyword + description: > + Model of the GPU device. + + - name: name + type: keyword + description: > + Name of the GPU device. + + - name: uuid + type: keyword + description: > + UUID of the GPU device. + + dimension: true + - name: vbios.version + type: keyword + description: > + Version of the vbios. + + - name: driver + type: group + fields: + - name: nvml_version + type: keyword + description: > + NVML version of the driver. 
+ + - name: version + type: keyword + description: > + Version of the driver. + + - name: encoder.utilization + type: float + description: > + Utilization of the encoder engine in the GPU. + + - name: energy.total + type: long + metric_type: counter + description: > + Total energy consumption of the GPU since boot in millijoules (mJ). + + - name: error + type: group + fields: + - name: code + type: keyword + description: > + Specific Error code for the XID error on the GPU. + + - name: message + type: keyword + description: > + Specific Error message for the XID error on the GPU. + + - name: xid + type: keyword + description: > + The XID code of the error being reported by the GPU. + + - name: framebuffer + type: group + fields: + - name: size.free + type: long + description: > + Free size of the framebuffer. + + - name: size.used + type: long + description: > + Used size of the framebuffer. + + - name: license.vgpu + type: keyword + description: > + License status related to vGPU. + + - name: memory + type: group + fields: + - name: size + type: long + description: > + Size of the GPU memory in MB. + + - name: used + type: long + description: > + Used size of the GPU memory in MB. + + - name: copy_utilization + type: float + description: > + Utilization of the GPU memory copy engine. + + - name: errors.double_bit_persistent + type: long + metric_type: gauge + description: > + Double-bit persistent errors count for GPU memory. + + - name: errors.double_bit_volatile + type: long + metric_type: gauge + description: > + Double-bit volatile errors count for GPU memory. + + - name: errors.single_bit_persistent + type: long + metric_type: gauge + description: > + Single-bit persistent errors count for GPU memory. + + - name: errors.single_bit_volatile + type: long + metric_type: gauge + description: > + Single-bit volatile errors count for GPU memory. + + - name: frequency + type: float + description: > + Clock frequency of the GPU memory. 
+ + - name: temperature + type: float + description: > + Temperature of the GPU memory. + + - name: nvlink.bandwidth.total + type: long + metric_type: counter + description: > + Total bandwidth of NVLink. + + - name: pci.bus.id + type: keyword + description: > + Bus ID of the PCI device. + + - name: pcie.replay + type: long + metric_type: gauge + description: > + Replay counter for the PCIe connection. + + - name: power.usage + type: float + description: > + Current power usage of the GPU in Watts. + + - name: streaming_multiprocessor.frequency + type: float + description: > + Frequency of the streaming multiprocessor. + + - name: temperature + type: float + description: > + Temperature of the GPU. + + - name: throttling + type: group + fields: + - name: board_limit + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to Board limit. + + - name: low_utilization + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to low utilization. + + - name: power + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to power. + + - name: reliability + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to reliability. + + - name: sync_boost + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to Sync Boost. + + - name: thermal + type: float + metric_type: gauge + description: > + Number of microseconds throttled due to thermals. + + - name: utilization + type: float + description: > + Overall utilization of the GPU. + +- name: prometheus + type: group + description: > + Fields related to Prometheus node metadata. + + fields: + - name: node.hostname + type: keyword + description: > + Hostname of the Prometheus node. + + dimension: true + - name: node.job + type: keyword + description: > + Job of the Prometheus node. 
+ + dimension: true + - name: node.id + type: integer + description: > + ID of the Prometheus node. + + dimension: true diff --git a/packages/nvidia_gpu/data_stream/stats/manifest.yml b/packages/nvidia_gpu/data_stream/stats/manifest.yml new file mode 100644 index 00000000000..eeb1a3f507c --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/manifest.yml @@ -0,0 +1,68 @@ +title: Collect GPU Statistics +type: metrics +streams: + - title: NVIDIA GPU Metrics + description: Collects Prometheus Metrics from NVIDIA Datacenter GPU Manager for NVIDIA GPUs + input: prometheus/metrics + template_path: stream.yml.hbs + vars: + - name: hosts + type: text + title: Hosts + multi: true + required: true + show_user: true + default: + - localhost:9400/metrics # 9400 is the dcgm-exporter default port (9090 is the Prometheus server itself) + - name: period + type: text + title: Period + multi: false + required: true + show_user: true + default: 10s + - name: metrics_filters.exclude + type: text + title: Metrics Filters Exclude + multi: true + required: false + show_user: false + default: [] + - name: metrics_filters.include + type: text + title: Metrics Filters Include + multi: true + required: false + show_user: false + default: [] + - name: ssl.certificate_authorities + type: text + title: SSL Certificate Authorities + multi: true + required: false + show_user: false + - name: username + type: text + title: Username + multi: false + required: false + show_user: true + - name: password + type: password + title: Password + secret: true + multi: false + required: false + show_user: true + - name: processors + type: yaml + title: Processors + multi: false + required: false + show_user: false + description: > + Processors are used to reduce the number of fields in the exported event or to enhance the event with metadata. This executes in the agent before the events are shipped. See [Processors](https://www.elastic.co/guide/en/fleet/current/elastic-agent-processor-configuration.html) for details. 
+ +elasticsearch: + source_mode: synthetic + index_mode: time_series diff --git a/packages/nvidia_gpu/data_stream/stats/sample_event.json b/packages/nvidia_gpu/data_stream/stats/sample_event.json new file mode 100644 index 00000000000..ebdf09a000f --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/sample_event.json @@ -0,0 +1,64 @@ +{ + "@timestamp": "2025-02-04T03:58:06.137Z", + "agent": { + "ephemeral_id": "33183a42-1f03-4d37-bf77-2a683c47eec1", + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "name": "4b8c5ec8e940", + "type": "metricbeat", + "version": "8.16.1" + }, + "data_stream": { + "dataset": "nvidia_gpu.stats", + "namespace": "default", + "type": "metrics" + }, + "ecs": { + "version": "8.0.0" + }, + "elastic_agent": { + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "snapshot": false, + "version": "8.16.1" + }, + "event": { + "agent_id_status": "verified", + "dataset": "nvidia_gpu.stats", + "duration": 3729334, + "ingested": "2025-02-04T03:58:16Z", + "module": "prometheus" + }, + "host": { + "architecture": "aarch64", + "containerized": false, + "hostname": "4b8c5ec8e940", + "ip": "172.17.0.3", + "mac": "02-42-AC-11-00-03", + "name": "4b8c5ec8e940", + "os": { + "codename": "noble", + "family": "debian", + "kernel": "6.10.14-linuxkit", + "name": "Ubuntu", + "platform": "ubuntu", + "type": "linux", + "version": "24.04.1 LTS (Noble Numbat)" + } + }, + "metricset": { + "name": "collector", + "period": 10000 + }, + "prometheus": { + "node": { + "job": "prometheus", + "name": "192.168.0.238:9400" + }, + "up": { + "value": 0 + } + }, + "service": { + "address": "http://192.168.0.238:9400/metrics", + "type": "prometheus" + } +} \ No newline at end of file diff --git a/packages/nvidia_gpu/docs/README.md b/packages/nvidia_gpu/docs/README.md new file mode 100644 index 00000000000..323d61d4daa --- /dev/null +++ b/packages/nvidia_gpu/docs/README.md @@ -0,0 +1,144 @@ +# Nvidia GPU Monitoring + +Use the NVIDIA GPU Monitoring integration to monitor the health and 
performance of your NVIDIA GPUs. The integration collects metrics from the NVIDIA Datacenter GPU Manager and sends them to Elasticsearch. + +## Data streams + +The **stats** data stream gives you insight into the state of the NVIDIA GPUs. +Metric data streams collected by the NVIDIA GPU Monitoring integration include `stats`. See the exported fields reference at the end of this page for details. + +## Requirements + +You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. +You can use our hosted Elasticsearch Service on Elastic Cloud, which is recommended, or self-manage the Elastic Stack on your own hardware. + +You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a Docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default, the DCGM exporter does not expose all available metrics. + +## Setup + +For step-by-step instructions on how to set up an integration, see the +[Getting started](https://www.elastic.co/guide/en/welcome-to-elastic/current/getting-started-observability.html) guide. + +When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. 
+ + +An example event for `stats` looks as following: + +```json +{ + "@timestamp": "2025-02-04T03:58:06.137Z", + "agent": { + "ephemeral_id": "33183a42-1f03-4d37-bf77-2a683c47eec1", + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "name": "4b8c5ec8e940", + "type": "metricbeat", + "version": "8.16.1" + }, + "data_stream": { + "dataset": "nvidia_gpu.stats", + "namespace": "default", + "type": "metrics" + }, + "ecs": { + "version": "8.0.0" + }, + "elastic_agent": { + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "snapshot": false, + "version": "8.16.1" + }, + "event": { + "agent_id_status": "verified", + "dataset": "nvidia_gpu.stats", + "duration": 3729334, + "ingested": "2025-02-04T03:58:16Z", + "module": "prometheus" + }, + "host": { + "architecture": "aarch64", + "containerized": false, + "hostname": "4b8c5ec8e940", + "ip": "172.17.0.3", + "mac": "02-42-AC-11-00-03", + "name": "4b8c5ec8e940", + "os": { + "codename": "noble", + "family": "debian", + "kernel": "6.10.14-linuxkit", + "name": "Ubuntu", + "platform": "ubuntu", + "type": "linux", + "version": "24.04.1 LTS (Noble Numbat)" + } + }, + "metricset": { + "name": "collector", + "period": 10000 + }, + "prometheus": { + "node": { + "job": "prometheus", + "name": "192.168.0.238:9400" + }, + "up": { + "value": 0 + } + }, + "service": { + "address": "http://192.168.0.238:9400/metrics", + "type": "prometheus" + } +} +``` +**Exported fields** + +| Field | Description | Type | Metric Type | +|---|---|---|---| +| @timestamp | Event timestamp. | date | | +| data_stream.dataset | Data stream dataset. | constant_keyword | | +| data_stream.namespace | Data stream namespace. | constant_keyword | | +| data_stream.type | Data stream type. | constant_keyword | | +| gpu.decoder.utilization | Utilization of the decoder engine in the GPU. | float | | +| gpu.device.brand | Brand of the GPU device. | keyword | | +| gpu.device.id | ID of the GPU device. 
| keyword | | +| gpu.device.info_rom.oem_version | OEM version of the info ROM. | keyword | | +| gpu.device.info_rom.version | Version of the info ROM. | keyword | | +| gpu.device.model | Model of the GPU device. | keyword | | +| gpu.device.name | Name of the GPU device. | keyword | | +| gpu.device.uuid | UUID of the GPU device. | keyword | | +| gpu.device.vbios.version | Version of the vbios. | keyword | | +| gpu.driver.nvml_version | NVML version of the driver. | keyword | | +| gpu.driver.version | Version of the driver. | keyword | | +| gpu.encoder.utilization | Utilization of the encoder engine in the GPU. | float | | +| gpu.energy.total | Total energy consumption of the GPU since boot in millijoules (mJ). | long | counter | +| gpu.error.code | Specific Error code for the XID error on the GPU. | keyword | | +| gpu.error.message | Specific Error message for the XID error on the GPU. | keyword | | +| gpu.error.xid | The XID code of the error being reported by the GPU. | keyword | | +| gpu.framebuffer.size.free | Free size of the framebuffer. | long | | +| gpu.framebuffer.size.used | Used size of the framebuffer. | long | | +| gpu.license.vgpu | License status related to vGPU. | keyword | | +| gpu.memory.copy_utilization | Utilization of the GPU memory copy engine. | float | | +| gpu.memory.errors.double_bit_persistent | Double-bit persistent errors count for GPU memory. | long | gauge | +| gpu.memory.errors.double_bit_volatile | Double-bit volatile errors count for GPU memory. | long | gauge | +| gpu.memory.errors.single_bit_persistent | Single-bit persistent errors count for GPU memory. | long | gauge | +| gpu.memory.errors.single_bit_volatile | Single-bit volatile errors count for GPU memory. | long | gauge | +| gpu.memory.frequency | Clock frequency of the GPU memory. | float | | +| gpu.memory.size | Size of the GPU memory in MB. | long | | +| gpu.memory.temperature | Temperature of the GPU memory. | float | | +| gpu.memory.used | Used size of the GPU memory in MB. 
| long | | +| gpu.nvlink.bandwidth.total | Total bandwidth of NVLink. | long | counter | +| gpu.pci.bus.id | Bus ID of the PCI device. | keyword | | +| gpu.pcie.replay | Replay counter for the PCIe connection. | long | gauge | +| gpu.power.usage | Current power usage of the GPU in Watts. | float | | +| gpu.streaming_multiprocessor.frequency | Frequency of the streaming multiprocessor. | float | | +| gpu.temperature | Temperature of the GPU. | float | | +| gpu.throttling.board_limit | Number of microseconds throttled due to Board limit. | float | gauge | +| gpu.throttling.low_utilization | Number of microseconds throttled due to low utilization. | float | gauge | +| gpu.throttling.power | Number of microseconds throttled due to power. | float | gauge | +| gpu.throttling.reliability | Number of microseconds throttled due to reliability. | float | gauge | +| gpu.throttling.sync_boost | Number of microseconds throttled due to Sync Boost. | float | gauge | +| gpu.throttling.thermal | Number of microseconds throttled due to thermals. | float | gauge | +| gpu.utilization | Overall utilization of the GPU. | float | | +| prometheus.node.hostname | Hostname of the Prometheus node. | keyword | | +| prometheus.node.id | ID of the Prometheus node. | integer | | +| prometheus.node.job | Job of the Prometheus node. 
| keyword | | diff --git a/packages/nvidia_gpu/img/nvidia_logo.svg b/packages/nvidia_gpu/img/nvidia_logo.svg new file mode 100644 index 00000000000..93930024fb8 --- /dev/null +++ b/packages/nvidia_gpu/img/nvidia_logo.svg @@ -0,0 +1,32 @@ + + + + +generated by pstoedit version:3.44 from NVBadge_2D.eps + + + + diff --git a/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030.json b/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030.json new file mode 100644 index 00000000000..33ad33cd8ee --- /dev/null +++ b/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030.json @@ -0,0 +1,3521 @@ +{ + "attributes": { + "controlGroupInput": { + "chainingSystem": "HIERARCHICAL", + "controlStyle": "oneLine", + "ignoreParentSettingsJSON": { + "ignoreFilters": false, + "ignoreQuery": false, + "ignoreTimerange": false, + "ignoreValidations": false + }, + "panelsJSON": {}, + "showApplySelections": false + }, + "description": "", + "kibanaSavedObjectMeta": { + "searchSourceJSON": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "optionsJSON": { + "hidePanelTitles": false, + "syncColors": false, + "syncCursor": true, + "syncTooltips": false, + "useMargins": true + }, + "panelsJSON": [ + { + "embeddableConfig": { + "attributes": { + "layout": "horizontal", + "links": [ + { + "destinationRefName": "link_41926dec-72cc-42a8-af6a-c51b41d2cf1f_dashboard", + "id": "41926dec-72cc-42a8-af6a-c51b41d2cf1f", + "label": "Overview", + "order": 0, + "type": "dashboardLink" + }, + { + "destinationRefName": "link_38a810ff-17a4-428a-93d2-795f81a727e3_dashboard", + "id": "38a810ff-17a4-428a-93d2-795f81a727e3", + "label": "GPU-Level Metrics", + "order": 1, + "type": "dashboardLink" + } + ] + }, + "enhancements": {} + }, + "gridData": { + "h": 3, + "i": "8215a9c0-1174-4a21-938b-1d21db1d110b", + "w": 48, + "x": 0, + "y": 0 + }, + "panelIndex": 
"8215a9c0-1174-4a21-938b-1d21db1d110b", + "type": "links" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPUs Monitored", + "operationType": "unique_count", + "params": { + "emptyAsNull": true + }, + "scale": "ratio", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "4814e6dd-165f-4d6f-9a9f-5b578d6d7428", + "w": 5, + "x": 0, + "y": 3 + }, + "panelIndex": "4814e6dd-165f-4d6f-9a9f-5b578d6d7428", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + 
"3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Nodes Monitored", + "operationType": "unique_count", + "params": { + "emptyAsNull": true + }, + "scale": "ratio", + "sourceField": "prometheus.node.name" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "5d1fe3c2-5112-4bbe-8557-a7facbb61291", + "w": 5, + "x": 5, + "y": 3 + }, + "panelIndex": "5d1fe3c2-5112-4bbe-8557-a7facbb61291", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Utilization", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "0b64682a-6f5c-4244-8396-34a5742116d1", + "w": 48, + "x": 0, + "y": 7 + }, + "panelIndex": "0b64682a-6f5c-4244-8396-34a5742116d1", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": 
"indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Average GPU Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : 
nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "424db695-de4a-4086-bcf6-2d2e35e759c0", + "w": 10, + "x": 0, + "y": 11 + }, + "panelIndex": "424db695-de4a-4086-bcf6-2d2e35e759c0", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Average Encoder Utilization", + "operationType": "average", + "params": { + 
"emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.encoder.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "b42166d2-0952-44f8-b495-de6b72ae43c0", + "w": 9, + "x": 10, + "y": 11 + }, + "panelIndex": "b42166d2-0952-44f8-b495-de6b72ae43c0", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + 
"caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.decoder.utilization\": *" + }, + "isBucketed": false, + "label": "Average Decoder Utilization", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.decoder.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", 
+ "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "793a21c4-7204-444d-b858-cc63f19425b4", + "w": 9, + "x": 19, + "y": 11 + }, + "panelIndex": "793a21c4-7204-444d-b858-cc63f19425b4", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Average Memory Copy Engine Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.memory.copy_utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, 
+ "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "f6056480-5da9-44b2-a125-8e30ec534ba9", + "w": 10, + "x": 28, + "y": 11 + }, + "panelIndex": "f6056480-5da9-44b2-a125-8e30ec534ba9", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "410f39f0-4944-4b03-920d-42950f279cfd", + "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "410f39f0-4944-4b03-920d-42950f279cfd": { + "dataType": "string", + "isBucketed": true, + "label": "Top values of prometheus.node.name + 1 other", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + 
"include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": true, + "parentFormat": { + "id": "multi_terms" + }, + "secondaryFields": [ + "gpu.device.uuid" + ], + "size": 15 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPU Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + }, + "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + 
}, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "410f39f0-4944-4b03-920d-42950f279cfd", + "xAccessor": "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "6e50d26f-50e0-4f98-adc6-a0d535f58453", + "w": 48, + "x": 0, + "y": 21 + }, + "panelIndex": "6e50d26f-50e0-4f98-adc6-a0d535f58453", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "410f39f0-4944-4b03-920d-42950f279cfd", + "4683e7c9-3714-4fa8-a43b-ed19499398be", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "410f39f0-4944-4b03-920d-42950f279cfd": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Node Name", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "secondaryFields": [], + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "4683e7c9-3714-4fa8-a43b-ed19499398be": { + "customLabel": true, + 
"dataType": "string", + "isBucketed": true, + "label": "GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPU Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "columns": [ + { + "columnId": "410f39f0-4944-4b03-920d-42950f279cfd", + "width": 565.5 + }, + { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + }, + { + "columnId": "4683e7c9-3714-4fa8-a43b-ed19499398be", + "isMetric": false, + "isTransposed": false + } + ], + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsDatatable" + }, + "enhancements": {} + }, + "gridData": { + "h": 11, + "i": "3eb4830a-d697-49a2-b55f-e09a5e964fa0", + "w": 24, + "x": 0, + "y": 31 + }, + "panelIndex": "3eb4830a-d697-49a2-b55f-e09a5e964fa0", + "title": "Most Utilized GPUs", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + 
"id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "410f39f0-4944-4b03-920d-42950f279cfd", + "4683e7c9-3714-4fa8-a43b-ed19499398be", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "410f39f0-4944-4b03-920d-42950f279cfd": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Node Name", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "asc", + "otherBucket": true, + "parentFormat": { + "id": "terms" + }, + "secondaryFields": [], + "size": 100 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "4683e7c9-3714-4fa8-a43b-ed19499398be": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPU Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + } + }, + 
"ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "columns": [ + { + "columnId": "410f39f0-4944-4b03-920d-42950f279cfd", + "width": 565.5 + }, + { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + }, + { + "columnId": "4683e7c9-3714-4fa8-a43b-ed19499398be", + "isMetric": false, + "isTransposed": false + } + ], + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "sorting": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "direction": "asc" + } + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsDatatable" + }, + "enhancements": {} + }, + "gridData": { + "h": 11, + "i": "6f55585b-8a57-472b-a8d9-904f56042d21", + "w": 24, + "x": 24, + "y": 31 + }, + "panelIndex": "6f55585b-8a57-472b-a8d9-904f56042d21", + "title": "Least Utilized GPUs\\", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Energy and Power Consumption", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "c304d2d5-e610-4fc7-81b7-a0849695e1af", + "w": 48, + "x": 0, + "y": 42 + }, + "panelIndex": "c304d2d5-e610-4fc7-81b7-a0849695e1af", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": 
"index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Current Power Consumption", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 2, + "suffix": " Watts" + } + }, + "formula": "last_value(gpu.power.usage)", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.power.usage\": *" + }, + "isBucketed": false, + "label": "Part of Current Power Consumption", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + }, + "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Total Power Consumption", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "fallback": true, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + 
"internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "breakdownByAccessor": "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d", + "collapseFn": "sum", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 5, + "i": "b882721f-9ce6-4214-838e-7535a01fbd5c", + "w": 24, + "x": 0, + "y": 46 + }, + "panelIndex": "b882721f-9ce6-4214-838e-7535a01fbd5c", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Average Power Consumption", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 2, + "suffix": " Watts" + } + }, + "formula": "last_value(gpu.power.usage)", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.power.usage\": *" + }, + "isBucketed": false, + "label": "Part of Current Power Consumption", + "operationType": 
"last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + }, + "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Average Power Consumption", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "fallback": true, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "breakdownByAccessor": "dff8f29d-dae5-40e8-bf4d-99cde54e5d4d", + "collapseFn": "avg", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 5, + "i": "80fb0a69-ef0c-49cd-9254-eee3b254afca", + "w": 24, + "x": 24, + "y": 46 + }, + "panelIndex": "80fb0a69-ef0c-49cd-9254-eee3b254afca", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + 
"0054d1c5-3702-4509-aaf2-21527d53da7d", + "9a3609cd-2ed9-46d8-9e00-5d026efed08c", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "0054d1c5-3702-4509-aaf2-21527d53da7d": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Nvidia GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 100 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPUs by Power Consumption (Watts)", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": " W" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + }, + "9a3609cd-2ed9-46d8-9e00-5d026efed08c": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + 
"labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "0054d1c5-3702-4509-aaf2-21527d53da7d", + "xAccessor": "9a3609cd-2ed9-46d8-9e00-5d026efed08c" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 13, + "i": "6bff0535-3972-413e-a519-eface6353204", + "w": 48, + "x": 0, + "y": 51 + }, + "panelIndex": "6bff0535-3972-413e-a519-eface6353204", + "title": "Power Consumption", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "410f39f0-4944-4b03-920d-42950f279cfd", + "4683e7c9-3714-4fa8-a43b-ed19499398be", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "410f39f0-4944-4b03-920d-42950f279cfd": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Node Name", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, 
+ "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": true, + "parentFormat": { + "id": "terms" + }, + "secondaryFields": [], + "size": 100 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "4683e7c9-3714-4fa8-a43b-ed19499398be": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Power Consumption", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": " Watts" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "columns": [ + { + "columnId": "410f39f0-4944-4b03-920d-42950f279cfd", + "width": 565.5 + }, + { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + }, + { + "columnId": "4683e7c9-3714-4fa8-a43b-ed19499398be", + "isMetric": false, + "isTransposed": false + } + ], + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data" + } + }, + "title": "", + "type": 
"lens", + "visualizationType": "lnsDatatable" + }, + "enhancements": {} + }, + "gridData": { + "h": 11, + "i": "febcbfbe-591c-4d16-b509-d8bd4e909c9d", + "w": 24, + "x": 0, + "y": 64 + }, + "panelIndex": "febcbfbe-591c-4d16-b509-d8bd4e909c9d", + "title": "Highest Power Consumption", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "410f39f0-4944-4b03-920d-42950f279cfd", + "4683e7c9-3714-4fa8-a43b-ed19499398be", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "410f39f0-4944-4b03-920d-42950f279cfd": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Node Name", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "asc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "secondaryFields": [], + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "4683e7c9-3714-4fa8-a43b-ed19499398be": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, 
+ "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Power Consumption", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": " Watts" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "columns": [ + { + "columnId": "410f39f0-4944-4b03-920d-42950f279cfd", + "width": 565.5 + }, + { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + }, + { + "columnId": "4683e7c9-3714-4fa8-a43b-ed19499398be", + "isMetric": false, + "isTransposed": false + } + ], + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsDatatable" + }, + "enhancements": {} + }, + "gridData": { + "h": 11, + "i": "677fec35-8fd8-4bda-ac5f-f6aac442fad3", + "w": 24, + "x": 24, + "y": 64 + }, + "panelIndex": "677fec35-8fd8-4bda-ac5f-f6aac442fad3", + "title": "Lowest Power Consumption", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Errors", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "081a306a-eddd-4fb9-8d68-355f0d4d8810", + "w": 48, + "x": 0, + "y": 75 + }, + "panelIndex": 
"081a306a-eddd-4fb9-8d68-355f0d4d8810", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "9f25a2f8-3098-4223-a60a-6ddc6af505a1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "3be91d1f-e167-4e1e-87c5-d33d0ceb70f8", + "76bc0840-25e3-4334-aec2-72152dad38ba" + ], + "columns": { + "3be91d1f-e167-4e1e-87c5-d33d0ceb70f8": { + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.error.xid\": *" + }, + "isBucketed": false, + "label": "Last value of gpu.error.xid", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.xid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.error.code\": *" + }, + "isBucketed": false, + "label": "Error Code", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.code" + }, + "76bc0840-25e3-4334-aec2-72152dad38ba": { + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.error.message\": *" + }, + "isBucketed": false, + "label": "Last value of gpu.error.message", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.message" + }, + "9f25a2f8-3098-4223-a60a-6ddc6af505a1": { + "dataType": "string", + "isBucketed": true, + "label": "Top values of prometheus.node.name + 2 others", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + 
"includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "fallback": true, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": true, + "parentFormat": { + "id": "multi_terms" + }, + "secondaryFields": [ + "gpu.device.uuid", + "gpu.device.model" + ], + "size": 100 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.error.xid:* and not gpu.error.xid : 0" + }, + "visualization": { + "columns": [ + { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + }, + { + "columnId": "9f25a2f8-3098-4223-a60a-6ddc6af505a1", + "isMetric": false, + "isTransposed": false + }, + { + "columnId": "3be91d1f-e167-4e1e-87c5-d33d0ceb70f8", + "isMetric": true, + "isTransposed": false + }, + { + "columnId": "76bc0840-25e3-4334-aec2-72152dad38ba", + "isMetric": true, + "isTransposed": false + } + ], + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsDatatable" + }, + "enhancements": {} + }, + "gridData": { + "h": 14, + "i": "a225a9ef-4f25-47d3-ad79-fc94c7c8c43c", + "w": 23, + "x": 0, + "y": 79 + }, + "panelIndex": "a225a9ef-4f25-47d3-ad79-fc94c7c8c43c", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": 
[ + "3f2489ca-d389-45c2-bba1-a07e55433958", + "a44b4f5e-18c9-4463-8698-3d257f5f470a" + ], + "columns": { + "3f2489ca-d389-45c2-bba1-a07e55433958": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + }, + "a44b4f5e-18c9-4463-8698-3d257f5f470a": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPUs with Errors", + "operationType": "count", + "params": { + "emptyAsNull": true + }, + "scale": "ratio", + "sourceField": "___records___" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.error.xid:* and not gpu.error.xid : 0" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "a44b4f5e-18c9-4463-8698-3d257f5f470a" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "3f2489ca-d389-45c2-bba1-a07e55433958" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + 
"tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 14, + "i": "5758146e-aca7-4685-8231-29cee9ed461c", + "w": 25, + "x": 23, + "y": 79 + }, + "panelIndex": "5758146e-aca7-4685-8231-29cee9ed461c", + "title": "GPUs with Errors", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Throttle Duration", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "46331776-49d0-43b9-83c5-1308d7462463", + "w": 48, + "x": 0, + "y": 93 + }, + "panelIndex": "46331776-49d0-43b9-83c5-1308d7462463", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "d367fe4e-0e5e-4a8b-ab00-debe249e1878", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Total time spent 
throttled", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 0, + "suffix": " μs" + } + }, + "formula": "sum(gpu.throttling.board_limit)+sum(gpu.throttling.low_utilization) + sum(gpu.throttling.power) + sum(gpu.throttling.reliability) + sum(gpu.throttling.sync_boost) + sum(gpu.throttling.thermal)", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": 
"gpu.throttling.sync_boost" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "sum", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "math", + "params": { + "tinymathAst": { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "location": { + "max": 192, + "min": 0 + }, + "name": "add", + "text": "sum(gpu.throttling.board_limit)+sum(gpu.throttling.low_utilization) + sum(gpu.throttling.power) + sum(gpu.throttling.reliability) + sum(gpu.throttling.sync_boost) + sum(gpu.throttling.thermal)", + "type": "function" + } + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "scale": "ratio" + }, + "d367fe4e-0e5e-4a8b-ab00-debe249e1878": { + "dataType": "string", + "isBucketed": true, + "label": "Top 1000 values of gpu.device.uuid", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + 
"orderBy": { + "fallback": true, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "breakdownByAccessor": "d367fe4e-0e5e-4a8b-ab00-debe249e1878", + "collapseFn": "sum", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "1930c226-5712-46ab-9c6d-22910f00609a", + "w": 8, + "x": 0, + "y": 97 + }, + "panelIndex": "1930c226-5712-46ab-9c6d-22910f00609a", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "1597e54f-56f4-4185-ba74-51c9ef1b6e59", + "6904586d-05b8-4b63-8966-208279e18963", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "columns": { + 
"1597e54f-56f4-4185-ba74-51c9ef1b6e59": { + "dataType": "string", + "isBucketed": true, + "label": "Top values of prometheus.node.name + 1 other", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "fallback": false, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": true, + "parentFormat": { + "id": "multi_terms" + }, + "secondaryFields": [ + "gpu.device.uuid" + ], + "size": 100 + }, + "scale": "ordinal", + "sourceField": "prometheus.node.name" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Time spent throttled", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.board_limit))+(max(gpu.throttling.low_utilization)) + (max(gpu.throttling.power)) + (max(gpu.throttling.reliability)) + (max(gpu.throttling.sync_boost)) + (max(gpu.throttling.thermal))", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + 
"operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.sync_boost" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Time spent throttled", + "operationType": "math", + "params": { + "tinymathAst": { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "location": { + "max": 204, + "min": 0 + }, + "name": "add", + "text": "(max(gpu.throttling.board_limit))+(max(gpu.throttling.low_utilization)) + (max(gpu.throttling.power)) + (max(gpu.throttling.reliability)) + 
(max(gpu.throttling.sync_boost)) + (max(gpu.throttling.thermal))", + "type": "function" + } + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "scale": "ratio" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "1597e54f-56f4-4185-ba74-51c9ef1b6e59", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": "right" 
+ }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "9bd3ebe9-891b-460e-9907-809f17299516", + "w": 40, + "x": 8, + "y": 97 + }, + "panelIndex": "9bd3ebe9-891b-460e-9907-809f17299516", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "afb305a0-07ca-4c36-9279-cf880b3fb78d", + "6904586d-05b8-4b63-8966-208279e18963", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0", + "4da7efe0-7bbc-4b9e-930e-957708588099", + "4da7efe0-7bbc-4b9e-930e-957708588099X0", + "0fdf8523-bae1-4e22-bf82-11c7a39f415b", + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0", + "77b81114-280e-46d2-a299-a436f8007f95", + "77b81114-280e-46d2-a299-a436f8007f95X0", + "c436daa7-959c-4195-9f64-a813e322407f", + "c436daa7-959c-4195-9f64-a813e322407fX0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "columns": { + "0fdf8523-bae1-4e22-bf82-11c7a39f415b": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Reliability Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.reliability)) ", + "isFormulaBroken": false + }, + "references": [ + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0" + ], + "scale": "ratio" + }, + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0": { + "customLabel": true, + 
"dataType": "number", + "isBucketed": false, + "label": "Part of Reliability Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Low Utilization", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.low_utilization))", + "isFormulaBroken": false + }, + "references": [ + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0" + ], + "scale": "ratio", + "timeScale": "s" + }, + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Low Utilization", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "4da7efe0-7bbc-4b9e-930e-957708588099": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Power Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.power))", + "isFormulaBroken": false + }, + "references": [ + "4da7efe0-7bbc-4b9e-930e-957708588099X0" + ], + "scale": "ratio" + }, + "4da7efe0-7bbc-4b9e-930e-957708588099X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Power Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Board Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + 
"formula": "(max(gpu.throttling.board_limit))", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Board Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + }, + "77b81114-280e-46d2-a299-a436f8007f95": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Sync Boost Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.sync_boost))", + "isFormulaBroken": false + }, + "references": [ + "77b81114-280e-46d2-a299-a436f8007f95X0" + ], + "scale": "ratio" + }, + "77b81114-280e-46d2-a299-a436f8007f95X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Sync Boost Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.sync_boost" + }, + "afb305a0-07ca-4c36-9279-cf880b3fb78d": { + "dataType": "string", + "isBucketed": true, + "label": "Top 1000 values of gpu.device.uuid", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "fallback": true, + "type": "alphabetical" + }, + "orderDirection": "asc", + "otherBucket": false, + "parentFormat": { + "id": "terms" + }, + "size": 1000 + }, + "scale": 
"ordinal", + "sourceField": "gpu.device.uuid" + }, + "c436daa7-959c-4195-9f64-a813e322407f": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Thermally Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.thermal))", + "isFormulaBroken": false + }, + "references": [ + "c436daa7-959c-4195-9f64-a813e322407fX0" + ], + "scale": "ratio" + }, + "c436daa7-959c-4195-9f64-a813e322407fX0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Thermally Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45", + "4da7efe0-7bbc-4b9e-930e-957708588099", + "0fdf8523-bae1-4e22-bf82-11c7a39f415b", + "77b81114-280e-46d2-a299-a436f8007f95", + "c436daa7-959c-4195-9f64-a813e322407f" + ], + "collapseFn": "sum", + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + 
}, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "afb305a0-07ca-4c36-9279-cf880b3fb78d", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide", + "yTitle": "Time spent throttled in μs" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 13, + "i": "e44b68c3-b5ff-42f2-8dc1-236d64d07652", + "w": 48, + "x": 0, + "y": 107 + }, + "panelIndex": "e44b68c3-b5ff-42f2-8dc1-236d64d07652", + "title": "Throttle duration by reason", + "type": "lens" + } + ], + "timeRestore": false, + "title": "[Metrics Nvidia GPU] Overview", + "version": 2 + }, + "coreMigrationVersion": "8.8.0", + "created_at": "2025-02-04T03:04:23.167Z", + "id": "nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030", + "managed": false, + "references": [ + { + "id": "metrics-*", + "name": "4814e6dd-165f-4d6f-9a9f-5b578d6d7428:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "5d1fe3c2-5112-4bbe-8557-a7facbb61291:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "424db695-de4a-4086-bcf6-2d2e35e759c0:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "b42166d2-0952-44f8-b495-de6b72ae43c0:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "793a21c4-7204-444d-b858-cc63f19425b4:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", 
+ "name": "f6056480-5da9-44b2-a125-8e30ec534ba9:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "6e50d26f-50e0-4f98-adc6-a0d535f58453:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "3eb4830a-d697-49a2-b55f-e09a5e964fa0:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "6f55585b-8a57-472b-a8d9-904f56042d21:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "b882721f-9ce6-4214-838e-7535a01fbd5c:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "80fb0a69-ef0c-49cd-9254-eee3b254afca:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "6bff0535-3972-413e-a519-eface6353204:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "febcbfbe-591c-4d16-b509-d8bd4e909c9d:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "677fec35-8fd8-4bda-ac5f-f6aac442fad3:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "a225a9ef-4f25-47d3-ad79-fc94c7c8c43c:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "5758146e-aca7-4685-8231-29cee9ed461c:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": 
"1930c226-5712-46ab-9c6d-22910f00609a:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "9bd3ebe9-891b-460e-9907-809f17299516:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "e44b68c3-b5ff-42f2-8dc1-236d64d07652:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c", + "name": "tag-ref-nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c", + "type": "tag" + }, + { + "id": "nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030", + "name": "8215a9c0-1174-4a21-938b-1d21db1d110b:link_41926dec-72cc-42a8-af6a-c51b41d2cf1f_dashboard", + "type": "dashboard" + }, + { + "id": "nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b", + "name": "8215a9c0-1174-4a21-938b-1d21db1d110b:link_38a810ff-17a4-428a-93d2-795f81a727e3_dashboard", + "type": "dashboard" + } + ], + "type": "dashboard", + "typeMigrationVersion": "10.2.0", + "updated_by": "u_24320541_cloud" +} \ No newline at end of file diff --git a/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b.json b/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b.json new file mode 100644 index 00000000000..e26eafd8426 --- /dev/null +++ b/packages/nvidia_gpu/kibana/dashboard/nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b.json @@ -0,0 +1,3933 @@ +{ + "attributes": { + "controlGroupInput": { + "chainingSystem": "HIERARCHICAL", + "controlStyle": "oneLine", + "ignoreParentSettingsJSON": { + "ignoreFilters": false, + "ignoreQuery": false, + "ignoreTimerange": false, + "ignoreValidations": false + }, + "panelsJSON": { + "44b3932b-aab8-4442-863c-67d6f8c77752": { + "explicitInput": { + "dataViewId": "metrics-*", + "exclude": null, + "existsSelected": null, + "fieldName": "gpu.device.uuid", + "hideActionBar": null, + 
"hideExclude": null, + "hideExists": null, + "hideSort": null, + "id": "44b3932b-aab8-4442-863c-67d6f8c77752", + "placeholder": null, + "runPastTimeout": null, + "searchTechnique": "prefix", + "selectedOptions": [], + "singleSelect": true, + "sort": { + "by": "_count", + "direction": "desc" + }, + "title": "GPU UUID" + }, + "grow": true, + "order": 1, + "type": "optionsListControl", + "width": "medium" + }, + "4be0316f-4108-45c4-8ac1-9d9ad45ef032": { + "explicitInput": { + "dataViewId": "metrics-*", + "exclude": false, + "existsSelected": false, + "fieldName": "prometheus.node.hostname", + "id": "4be0316f-4108-45c4-8ac1-9d9ad45ef032", + "searchTechnique": "prefix", + "selectedOptions": [], + "singleSelect": true, + "sort": { + "by": "_count", + "direction": "desc" + }, + "title": "Node Name" + }, + "grow": true, + "order": 0, + "type": "optionsListControl", + "width": "medium" + }, + "908bb943-35f9-4cd9-a47f-74ff40843cad": { + "explicitInput": { + "dataViewId": "metrics-*", + "fieldName": "gpu.device.name", + "id": "908bb943-35f9-4cd9-a47f-74ff40843cad", + "searchTechnique": "prefix", + "selectedOptions": [], + "singleSelect": true, + "sort": { + "by": "_count", + "direction": "desc" + }, + "title": "GPU Name" + }, + "grow": true, + "order": 2, + "type": "optionsListControl", + "width": "medium" + } + }, + "showApplySelections": false + }, + "description": "", + "kibanaSavedObjectMeta": { + "searchSourceJSON": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "optionsJSON": { + "hidePanelTitles": false, + "syncColors": false, + "syncCursor": true, + "syncTooltips": false, + "useMargins": true + }, + "panelsJSON": [ + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "The 
following dashboard is meant to display data for a single GPU. Use the selectors above to pick a specific GPU by UUID or Name", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 3, + "i": "05a3c90a-2a22-4239-b000-b296acf809f5", + "w": 48, + "x": 0, + "y": 3 + }, + "panelIndex": "05a3c90a-2a22-4239-b000-b296acf809f5", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "layout": "horizontal", + "links": [ + { + "destinationRefName": "link_41926dec-72cc-42a8-af6a-c51b41d2cf1f_dashboard", + "id": "41926dec-72cc-42a8-af6a-c51b41d2cf1f", + "label": "Overview", + "order": 0, + "type": "dashboardLink" + }, + { + "destinationRefName": "link_38a810ff-17a4-428a-93d2-795f81a727e3_dashboard", + "id": "38a810ff-17a4-428a-93d2-795f81a727e3", + "label": "GPU-Level Metrics", + "order": 1, + "type": "dashboardLink" + } + ] + }, + "enhancements": {} + }, + "gridData": { + "h": 3, + "i": "0167d345-46b0-4a5d-ada9-f7f5ca02d274", + "w": 48, + "x": 0, + "y": 0 + }, + "panelIndex": "0167d345-46b0-4a5d-ada9-f7f5ca02d274", + "type": "links" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPUs Selected", + "operationType": "unique_count", + "params": { + "emptyAsNull": true + }, + "scale": "ratio", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + 
} + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "537a4add-dc09-4235-99ea-a8242e8f0ca8", + "w": 5, + "x": 0, + "y": 6 + }, + "panelIndex": "537a4add-dc09-4235-99ea-a8242e8f0ca8", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "date", + "filter": { + "language": "kuery", + "query": "\"@timestamp\": *" + }, + "isBucketed": false, + "label": "Last Gathered", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and prometheus.up.value : 1" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": 
"537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "11c13eee-8630-4ad0-881a-89e3bce725eb", + "w": 8, + "x": 5, + "y": 6 + }, + "panelIndex": "11c13eee-8630-4ad0-881a-89e3bce725eb", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.device.brand\": *" + }, + "isBucketed": false, + "label": "Brand", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.device.brand" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "18d7ea2c-16dd-43de-a65b-b3f9597ce903", + "w": 8, + "x": 13, + "y": 6 + }, + "panelIndex": "18d7ea2c-16dd-43de-a65b-b3f9597ce903", + "title": "", + "type": "lens" + 
}, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.device.model\": *" + }, + "isBucketed": false, + "label": "Model", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.device.model" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "cdd39585-6b6e-4a00-a71b-a532fda43859", + "w": 9, + "x": 21, + "y": 6 + }, + "panelIndex": "cdd39585-6b6e-4a00-a71b-a532fda43859", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + 
"3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.device.vbios.version\": *" + }, + "isBucketed": false, + "label": "vBIOS Version", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.device.vbios.version" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "de6bb254-3119-49e7-a73d-c89a0c0319cd", + "w": 9, + "x": 30, + "y": 6 + }, + "panelIndex": "de6bb254-3119-49e7-a73d-c89a0c0319cd", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.device.info_rom.version\": *" + }, + "isBucketed": 
false, + "label": "InfoROM Version", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.device.info_rom.version" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "c09896fc-8faa-445d-a1c5-e3b038538f3e", + "w": 9, + "x": 39, + "y": 6 + }, + "panelIndex": "c09896fc-8faa-445d-a1c5-e3b038538f3e", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Utilization", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "a04a5db8-dc22-4a3d-a0cd-e58e68cb3ce2", + "w": 48, + "x": 0, + "y": 10 + }, + "panelIndex": "a04a5db8-dc22-4a3d-a0cd-e58e68cb3ce2", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + 
"3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.utilization\": *" + }, + "isBucketed": false, + "label": "GPU Utilization", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": 
"515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "45e3c02a-0fed-47eb-b7e1-4c99a6594346", + "w": 10, + "x": 0, + "y": 14 + }, + "panelIndex": "45e3c02a-0fed-47eb-b7e1-4c99a6594346", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Framebuffer Utilization (Memory)", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + 
"decimals": 1, + "suffix": "%" + } + }, + "formula": "last_value(gpu.framebuffer.size.used)/(last_value(gpu.framebuffer.size.free)+last_value(gpu.framebuffer.size.used))\n", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.framebuffer.size.used\": *" + }, + "isBucketed": false, + "label": "Part of Framebuffer Utilization (Memory)", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.framebuffer.size.used" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.framebuffer.size.free\": *" + }, + "isBucketed": false, + "label": "Part of Framebuffer Utilization (Memory)", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.framebuffer.size.free" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.framebuffer.size.used\": *" + }, + "isBucketed": false, + "label": "Part of Framebuffer Utilization (Memory)", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.framebuffer.size.used" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Framebuffer Utilization (Memory)", + "operationType": "math", + "params": { + "tinymathAst": { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "location": { + "max": 114, + "min": 39 + }, + "name": "add", + "text": 
"last_value(gpu.framebuffer.size.free)+last_value(gpu.framebuffer.size.used)", + "type": "function" + } + ], + "location": { + "max": 116, + "min": 0 + }, + "name": "divide", + "text": "last_value(gpu.framebuffer.size.used)/(last_value(gpu.framebuffer.size.free)+last_value(gpu.framebuffer.size.used))\n", + "type": "function" + } + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "scale": "ratio" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "59aa9a74-ed2d-4a77-b0b0-3dfe7a8aee9f", + "w": 10, + "x": 10, + "y": 14 + }, + "panelIndex": "59aa9a74-ed2d-4a77-b0b0-3dfe7a8aee9f", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { 
+ "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.encoder.utilization\": *" + }, + "isBucketed": false, + "label": "Encoder Utilization", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.encoder.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + 
"layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "2630dd5a-5a7c-41d9-9b8c-f89670bc4ea3", + "w": 9, + "x": 20, + "y": 14 + }, + "panelIndex": "2630dd5a-5a7c-41d9-9b8c-f89670bc4ea3", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.decoder.utilization\": *" + }, + "isBucketed": false, + "label": "Decoder Utilization", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + 
"decimals": 1, + "suffix": "%" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.decoder.utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "3da7d141-47f5-4fdc-a8e0-3aef09cca549", + "w": 9, + "x": 29, + "y": 14 + }, + "panelIndex": "3da7d141-47f5-4fdc-a8e0-3aef09cca549", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "51a20240-0f2c-4a18-b7a9-b891cff1591c", + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4" + ], + "columns": { + "515ab9a4-7abf-480e-9ce4-cf775cfa95c4": { 
+ "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 100", + "operationType": "static_value", + "params": { + "value": "100" + }, + "references": [], + "scale": "ratio" + }, + "51a20240-0f2c-4a18-b7a9-b891cff1591c": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.memory.copy_utilization\": *" + }, + "isBucketed": false, + "label": "Memory Copy Engine Utilization", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.memory.copy_utilization" + }, + "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8": { + "dataType": "number", + "isBucketed": false, + "isStaticValue": true, + "label": "Static value: 0", + "operationType": "static_value", + "params": { + "value": "0" + }, + "references": [], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.utilization:*" + }, + "visualization": { + "labelMajorMode": "auto", + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "maxAccessor": "515ab9a4-7abf-480e-9ce4-cf775cfa95c4", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "minAccessor": "caa2c077-8d21-49c3-aa21-e2c51f8fdfe8", + "shape": "semiCircle", + "ticksPosition": "auto" + } + }, + "title": "", + "type": "lens", + 
"visualizationType": "lnsGauge" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "f33ab5fc-6c0d-4cc9-afeb-ef6a815a76b1", + "w": 10, + "x": 38, + "y": 14 + }, + "panelIndex": "f33ab5fc-6c0d-4cc9-afeb-ef6a815a76b1", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPU Utilization", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": "%" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.utilization" + }, + "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + 
"yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "d1bbfa53-3b7f-4b6b-91d2-5821e9ca304f" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "f42dd770-5684-46ee-8ca6-5888947d9ffe", + "w": 48, + "x": 0, + "y": 24 + }, + "panelIndex": "f42dd770-5684-46ee-8ca6-5888947d9ffe", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Energy and Power Consumption", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "2880af70-98c6-4aa3-b790-8b57ca31b6c6", + "w": 48, + "x": 0, + "y": 34 + }, + "panelIndex": "2880af70-98c6-4aa3-b790-8b57ca31b6c6", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + 
"datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.energy.total\": *" + }, + "isBucketed": false, + "label": "Total Energy Usage since Boot (Joules)", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 2, + "suffix": " Joules" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.energy.total" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 5, + "i": "b8cc1323-2996-4167-a311-f819a90dfd4f", + "w": 23, + "x": 0, + "y": 38 + }, + "panelIndex": "b8cc1323-2996-4167-a311-f819a90dfd4f", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + 
], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "filter": { + "language": "kuery", + "query": "\"gpu.power.usage\": *" + }, + "isBucketed": false, + "label": "Current Power Consumption", + "operationType": "last_value", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 2, + "suffix": " Watts" + } + }, + "sortField": "@timestamp" + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 5, + "i": "32858f76-8501-4681-823f-4441b34d565b", + "w": 25, + "x": 23, + "y": 38 + }, + "panelIndex": "32858f76-8501-4681-823f-4441b34d565b", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "0054d1c5-3702-4509-aaf2-21527d53da7d", + "9a3609cd-2ed9-46d8-9e00-5d026efed08c", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "0054d1c5-3702-4509-aaf2-21527d53da7d": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "Nvidia GPU UUID", + 
"operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": true, + "parentFormat": { + "id": "terms" + }, + "size": 10 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "GPU Power Usage (Watts)", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 1, + "suffix": " W" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.power.usage" + }, + "9a3609cd-2ed9-46d8-9e00-5d026efed08c": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" 
+ }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "0054d1c5-3702-4509-aaf2-21527d53da7d", + "xAccessor": "9a3609cd-2ed9-46d8-9e00-5d026efed08c" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 13, + "i": "4317faf8-ca0e-498a-9b6c-99d3099f3fa9", + "w": 48, + "x": 0, + "y": 43 + }, + "panelIndex": "4317faf8-ca0e-498a-9b6c-99d3099f3fa9", + "title": "Power Consumption", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Errors", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "27c5e85e-f60b-4d79-ba85-30ef0e2eb36f", + "w": 48, + "x": 0, + "y": 56 + }, + "panelIndex": "27c5e85e-f60b-4d79-ba85-30ef0e2eb36f", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": 
"kuery", + "query": "\"gpu.error.xid\": *" + }, + "isBucketed": false, + "label": "Error ID", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.xid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.error.xid:* and not gpu.error.xid : 0" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "969a7903-e17c-418b-adc2-e0dd53117b28", + "w": 9, + "x": 0, + "y": 60 + }, + "panelIndex": "969a7903-e17c-418b-adc2-e0dd53117b28", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "3f2489ca-d389-45c2-bba1-a07e55433958", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "f50f8276-a1fa-49dd-a38e-8799bc809a0c" + ], + "columns": { + "3f2489ca-d389-45c2-bba1-a07e55433958": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": 
false, + "label": "PCIe Replay Rate", + "operationType": "counter_rate", + "references": [ + "f50f8276-a1fa-49dd-a38e-8799bc809a0c" + ], + "scale": "ratio", + "timeScale": "s" + }, + "f50f8276-a1fa-49dd-a38e-8799bc809a0c": { + "dataType": "number", + "isBucketed": false, + "label": "Maximum of gpu.pcie.replay", + "operationType": "max", + "params": { + "emptyAsNull": true + }, + "scale": "ratio", + "sourceField": "gpu.pcie.replay" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.pcie.replay :*" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "3f2489ca-d389-45c2-bba1-a07e55433958" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 12, + "i": "708b82ce-d8c9-4f4f-94c3-5de5a5a0ba68", + "w": 
39, + "x": 9, + "y": 60 + }, + "panelIndex": "708b82ce-d8c9-4f4f-94c3-5de5a5a0ba68", + "title": "PCIe Packet Replays per Second", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.error.message\": *" + }, + "isBucketed": false, + "label": "Error Message", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.message" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.error.xid:*" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "be5f296b-1b90-44b3-9886-d4d439aee9af", + "w": 9, + "x": 0, + "y": 64 + }, + "panelIndex": "be5f296b-1b90-44b3-9886-d4d439aee9af", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + 
"adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "string", + "filter": { + "language": "kuery", + "query": "\"gpu.error.code\": *" + }, + "isBucketed": false, + "label": "Error Code", + "operationType": "last_value", + "params": { + "sortField": "@timestamp" + }, + "scale": "ordinal", + "sourceField": "gpu.error.code" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats and gpu.error.xid:* and not gpu.error.xid : 0" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 4, + "i": "bcfaa8ee-21af-4563-bac1-68e1d9ae3b60", + "w": 9, + "x": 0, + "y": 68 + }, + "panelIndex": "bcfaa8ee-21af-4563-bac1-68e1d9ae3b60", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Throttle Duration", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "8eeaa9cc-f47b-4254-b5b4-1d3812c1cb04", + "w": 48, + "x": 0, + "y": 72 + }, + "panelIndex": 
"8eeaa9cc-f47b-4254-b5b4-1d3812c1cb04", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Total time spent throttled", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "compact": true, + "decimals": 0, + "suffix": " μs" + } + }, + "formula": "max(gpu.throttling.board_limit)+max(gpu.throttling.low_utilization) + max(gpu.throttling.power) + max(gpu.throttling.reliability) + max(gpu.throttling.sync_boost) + max(gpu.throttling.thermal)", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": 
{ + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.sync_boost" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Total time spent throttled", + "operationType": "math", + "params": { + "tinymathAst": { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4" + ], + "name": "add", + 
"type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "location": { + "max": 192, + "min": 0 + }, + "name": "add", + "text": "max(gpu.throttling.board_limit)+max(gpu.throttling.low_utilization) + max(gpu.throttling.power) + max(gpu.throttling.reliability) + max(gpu.throttling.sync_boost) + max(gpu.throttling.thermal)", + "type": "function" + } + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "scale": "ratio" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "metricAccessor": "537d8f07-0a3c-440b-81f7-c357e525c6c5" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsMetric" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "313e1d01-32b5-447f-a62f-29133f3ef378", + "w": 8, + "x": 0, + "y": 76 + }, + "panelIndex": "313e1d01-32b5-447f-a62f-29133f3ef378", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "6904586d-05b8-4b63-8966-208279e18963", + 
"537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X7", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X8", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X9", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X10", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X11", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X12" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Time spent throttled", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "counter_rate(max(gpu.throttling.board_limit))+counter_rate(max(gpu.throttling.low_utilization)) + counter_rate(max(gpu.throttling.power)) + counter_rate(max(gpu.throttling.reliability)) + counter_rate(max(gpu.throttling.sync_boost)) + counter_rate(max(gpu.throttling.thermal))", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X12" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "scale": "ratio", + "timeScale": "s" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X10": { + 
"customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X11": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X10" + ], + "scale": "ratio", + "timeScale": "s" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X12": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "math", + "params": { + "tinymathAst": { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + { + "args": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X7" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X9" + ], + "name": "add", + "type": "function" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X11" + ], + "location": { + "max": 276, + "min": 0 + }, + "name": "add", + "text": "counter_rate(max(gpu.throttling.board_limit))+counter_rate(max(gpu.throttling.low_utilization)) + counter_rate(max(gpu.throttling.power)) + counter_rate(max(gpu.throttling.reliability)) + counter_rate(max(gpu.throttling.sync_boost)) + counter_rate(max(gpu.throttling.thermal))", + "type": "function" + } + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X1", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X7", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X9", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X11" + ], + 
"scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X3": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X2" + ], + "scale": "ratio", + "timeScale": "s" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X4" + ], + "scale": "ratio", + "timeScale": "s" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X7": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X6" + ], + "scale": "ratio", + "timeScale": "s" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X8": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of 
Throttling Duration in μs", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.sync_boost" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X9": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Throttling Duration in μs", + "operationType": "counter_rate", + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X8" + ], + "scale": "ratio", + "timeScale": "s" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": 
"right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "0b03013d-c45b-41c6-81d3-b57428a7a8a6", + "w": 40, + "x": 8, + "y": 76 + }, + "panelIndex": "0b03013d-c45b-41c6-81d3-b57428a7a8a6", + "title": "", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "currentIndexPatternId": "metrics-*", + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "6904586d-05b8-4b63-8966-208279e18963", + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0", + "4da7efe0-7bbc-4b9e-930e-957708588099", + "4da7efe0-7bbc-4b9e-930e-957708588099X0", + "0fdf8523-bae1-4e22-bf82-11c7a39f415b", + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0", + "77b81114-280e-46d2-a299-a436f8007f95", + "77b81114-280e-46d2-a299-a436f8007f95X0", + "c436daa7-959c-4195-9f64-a813e322407f", + "c436daa7-959c-4195-9f64-a813e322407fX0" + ], + "columns": { + "0fdf8523-bae1-4e22-bf82-11c7a39f415b": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Reliability Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.reliability)) ", + "isFormulaBroken": false + }, + "references": [ + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0" + ], + "scale": "ratio" + }, + "0fdf8523-bae1-4e22-bf82-11c7a39f415bX0": { + "customLabel": true, + "dataType": "number", + 
"isBucketed": false, + "label": "Part of Reliability Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.reliability" + }, + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Low Utilization", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.low_utilization))", + "isFormulaBroken": false + }, + "references": [ + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0" + ], + "scale": "ratio", + "timeScale": "s" + }, + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Low Utilization", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.low_utilization" + }, + "4da7efe0-7bbc-4b9e-930e-957708588099": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Power Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.power))", + "isFormulaBroken": false + }, + "references": [ + "4da7efe0-7bbc-4b9e-930e-957708588099X0" + ], + "scale": "ratio" + }, + "4da7efe0-7bbc-4b9e-930e-957708588099X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Power Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.power" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Board Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": 
"(max(gpu.throttling.board_limit))", + "isFormulaBroken": false + }, + "references": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0" + ], + "scale": "ratio" + }, + "537d8f07-0a3c-440b-81f7-c357e525c6c5X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Board Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.board_limit" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + }, + "77b81114-280e-46d2-a299-a436f8007f95": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Sync Boost Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.sync_boost))", + "isFormulaBroken": false + }, + "references": [ + "77b81114-280e-46d2-a299-a436f8007f95X0" + ], + "scale": "ratio" + }, + "77b81114-280e-46d2-a299-a436f8007f95X0": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Sync Boost Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.sync_boost" + }, + "c436daa7-959c-4195-9f64-a813e322407f": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Thermally Limited", + "operationType": "formula", + "params": { + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "μs" + } + }, + "formula": "(max(gpu.throttling.thermal))", + "isFormulaBroken": false + }, + "references": [ + "c436daa7-959c-4195-9f64-a813e322407fX0" + ], + "scale": "ratio" + }, + "c436daa7-959c-4195-9f64-a813e322407fX0": { + 
"customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Part of Thermally Limited", + "operationType": "max", + "params": { + "emptyAsNull": false + }, + "scale": "ratio", + "sourceField": "gpu.throttling.thermal" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "indexPatternId": "metrics-*", + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "1b3be1f1-3c90-4989-9c95-a9a01c50fd45", + "4da7efe0-7bbc-4b9e-930e-957708588099", + "0fdf8523-bae1-4e22-bf82-11c7a39f415b", + "77b81114-280e-46d2-a299-a436f8007f95", + "c436daa7-959c-4195-9f64-a813e322407f" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide", + "yTitle": "Time spent throttled in μs" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + 
"h": 13, + "i": "042690b1-aff1-4cfb-878e-cb8dd4eebd1c", + "w": 48, + "x": 0, + "y": 86 + }, + "panelIndex": "042690b1-aff1-4cfb-878e-cb8dd4eebd1c", + "title": "Throttle duration by reason", + "type": "lens" + }, + { + "embeddableConfig": { + "enhancements": { + "dynamicActions": { + "events": [] + } + }, + "savedVis": { + "data": { + "aggs": [], + "searchSource": { + "filter": [], + "query": { + "language": "kuery", + "query": "" + } + } + }, + "description": "", + "id": "", + "params": { + "fontSize": 12, + "markdown": "## Clock Frequency", + "openLinksInNewTab": false + }, + "title": "", + "type": "markdown", + "uiState": {} + } + }, + "gridData": { + "h": 4, + "i": "e30e1939-d469-48eb-869d-e52802636bd4", + "w": 48, + "x": 0, + "y": 99 + }, + "panelIndex": "e30e1939-d469-48eb-869d-e52802636bd4", + "title": "", + "type": "visualization" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + "formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "6904586d-05b8-4b63-8966-208279e18963", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Streaming Multiprocessor Clock", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "decimals": 0, + "suffix": "MHz" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.streaming_multiprocessor.frequency" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + 
"sourceField": "@timestamp" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "f6421632-ebfe-4e44-b356-d670a0712644", + "w": 23, + "x": 0, + "y": 103 + }, + "panelIndex": "f6421632-ebfe-4e44-b356-d670a0712644", + "title": "Streaming Multiprocessor Frequency", + "type": "lens" + }, + { + "embeddableConfig": { + "attributes": { + "references": [ + { + "id": "metrics-*", + "name": "indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + } + ], + "state": { + "adHocDataViews": {}, + "datasourceStates": { + 
"formBased": { + "layers": { + "3a6cbb5a-de88-4c00-92b1-7082af725a29": { + "columnOrder": [ + "74698ed6-b80e-4d75-b7ff-65e7cc2269ab", + "6904586d-05b8-4b63-8966-208279e18963", + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "columns": { + "537d8f07-0a3c-440b-81f7-c357e525c6c5": { + "customLabel": true, + "dataType": "number", + "isBucketed": false, + "label": "Memory Clock", + "operationType": "average", + "params": { + "emptyAsNull": true, + "format": { + "id": "number", + "params": { + "compact": false, + "decimals": 0, + "suffix": "MHz" + } + } + }, + "scale": "ratio", + "sourceField": "gpu.memory.frequency" + }, + "6904586d-05b8-4b63-8966-208279e18963": { + "dataType": "date", + "isBucketed": true, + "label": "@timestamp", + "operationType": "date_histogram", + "params": { + "dropPartials": false, + "includeEmptyRows": true, + "interval": "auto" + }, + "scale": "interval", + "sourceField": "@timestamp" + }, + "74698ed6-b80e-4d75-b7ff-65e7cc2269ab": { + "customLabel": true, + "dataType": "string", + "isBucketed": true, + "label": "GPU UUID", + "operationType": "terms", + "params": { + "exclude": [], + "excludeIsRegex": false, + "include": [], + "includeIsRegex": false, + "missingBucket": false, + "orderBy": { + "columnId": "537d8f07-0a3c-440b-81f7-c357e525c6c5", + "type": "column" + }, + "orderDirection": "desc", + "otherBucket": true, + "parentFormat": { + "id": "terms" + }, + "size": 10 + }, + "scale": "ordinal", + "sourceField": "gpu.device.uuid" + } + }, + "ignoreGlobalFilters": false, + "incompleteColumns": {}, + "sampling": 1 + } + } + }, + "indexpattern": { + "layers": {} + }, + "textBased": { + "layers": {} + } + }, + "filters": [], + "internalReferences": [], + "query": { + "language": "kuery", + "query": "data_stream.type : metrics and data_stream.dataset : nvidia_gpu.stats" + }, + "visualization": { + "axisTitlesVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "fittingFunction": "None", + "gridlinesVisibilitySettings": { + 
"x": true, + "yLeft": true, + "yRight": true + }, + "labelsOrientation": { + "x": 0, + "yLeft": 0, + "yRight": 0 + }, + "layers": [ + { + "accessors": [ + "537d8f07-0a3c-440b-81f7-c357e525c6c5" + ], + "colorMapping": { + "assignments": [], + "colorMode": { + "type": "categorical" + }, + "paletteId": "eui_amsterdam_color_blind", + "specialAssignments": [ + { + "color": { + "type": "loop" + }, + "rule": { + "type": "other" + }, + "touched": false + } + ] + }, + "layerId": "3a6cbb5a-de88-4c00-92b1-7082af725a29", + "layerType": "data", + "seriesType": "line", + "splitAccessor": "74698ed6-b80e-4d75-b7ff-65e7cc2269ab", + "xAccessor": "6904586d-05b8-4b63-8966-208279e18963" + } + ], + "legend": { + "isVisible": true, + "position": "right" + }, + "preferredSeriesType": "line", + "tickLabelsVisibilitySettings": { + "x": true, + "yLeft": true, + "yRight": true + }, + "valueLabels": "hide" + } + }, + "title": "", + "type": "lens", + "visualizationType": "lnsXY" + }, + "enhancements": {} + }, + "gridData": { + "h": 10, + "i": "979c8732-bc79-4702-855e-3a608f5462a2", + "w": 25, + "x": 23, + "y": 103 + }, + "panelIndex": "979c8732-bc79-4702-855e-3a608f5462a2", + "title": "Memory Frequency", + "type": "lens" + } + ], + "timeRestore": false, + "title": "[Metrics Nvidia GPU] GPU-Specific Overview", + "version": 2 + }, + "coreMigrationVersion": "8.8.0", + "created_at": "2025-02-04T03:04:20.742Z", + "id": "nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b", + "managed": false, + "references": [ + { + "id": "metrics-*", + "name": "537a4add-dc09-4235-99ea-a8242e8f0ca8:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "11c13eee-8630-4ad0-881a-89e3bce725eb:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "18d7ea2c-16dd-43de-a65b-b3f9597ce903:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": 
"index-pattern" + }, + { + "id": "metrics-*", + "name": "cdd39585-6b6e-4a00-a71b-a532fda43859:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "de6bb254-3119-49e7-a73d-c89a0c0319cd:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "c09896fc-8faa-445d-a1c5-e3b038538f3e:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "45e3c02a-0fed-47eb-b7e1-4c99a6594346:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "59aa9a74-ed2d-4a77-b0b0-3dfe7a8aee9f:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "2630dd5a-5a7c-41d9-9b8c-f89670bc4ea3:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "3da7d141-47f5-4fdc-a8e0-3aef09cca549:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "f33ab5fc-6c0d-4cc9-afeb-ef6a815a76b1:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "f42dd770-5684-46ee-8ca6-5888947d9ffe:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "b8cc1323-2996-4167-a311-f819a90dfd4f:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "32858f76-8501-4681-823f-4441b34d565b:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": 
"4317faf8-ca0e-498a-9b6c-99d3099f3fa9:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "969a7903-e17c-418b-adc2-e0dd53117b28:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "708b82ce-d8c9-4f4f-94c3-5de5a5a0ba68:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "be5f296b-1b90-44b3-9886-d4d439aee9af:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "bcfaa8ee-21af-4563-bac1-68e1d9ae3b60:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "313e1d01-32b5-447f-a62f-29133f3ef378:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "0b03013d-c45b-41c6-81d3-b57428a7a8a6:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "042690b1-aff1-4cfb-878e-cb8dd4eebd1c:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "f6421632-ebfe-4e44-b356-d670a0712644:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "979c8732-bc79-4702-855e-3a608f5462a2:indexpattern-datasource-layer-3a6cbb5a-de88-4c00-92b1-7082af725a29", + "type": "index-pattern" + }, + { + "id": "nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c", + "name": "tag-ref-nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c", + "type": "tag" + }, + { + "id": "nvidia_gpu-a3a5759a-1b3d-456d-8ab8-83e97f774030", + "name": 
"0167d345-46b0-4a5d-ada9-f7f5ca02d274:link_41926dec-72cc-42a8-af6a-c51b41d2cf1f_dashboard", + "type": "dashboard" + }, + { + "id": "nvidia_gpu-bac121fe-6cd9-48a7-a349-f52ffa42d56b", + "name": "0167d345-46b0-4a5d-ada9-f7f5ca02d274:link_38a810ff-17a4-428a-93d2-795f81a727e3_dashboard", + "type": "dashboard" + }, + { + "id": "metrics-*", + "name": "controlGroup_4be0316f-4108-45c4-8ac1-9d9ad45ef032:optionsListDataView", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "controlGroup_44b3932b-aab8-4442-863c-67d6f8c77752:optionsListDataView", + "type": "index-pattern" + }, + { + "id": "metrics-*", + "name": "controlGroup_908bb943-35f9-4cd9-a47f-74ff40843cad:optionsListDataView", + "type": "index-pattern" + } + ], + "type": "dashboard", + "typeMigrationVersion": "10.2.0", + "updated_by": "u_24320541_cloud" +} \ No newline at end of file diff --git a/packages/nvidia_gpu/kibana/tag/nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c.json b/packages/nvidia_gpu/kibana/tag/nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c.json new file mode 100644 index 00000000000..3322d128b31 --- /dev/null +++ b/packages/nvidia_gpu/kibana/tag/nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c.json @@ -0,0 +1,14 @@ +{ + "attributes": { + "color": "#00c9cb", + "description": "", + "name": "NVIDIA GPU" + }, + "coreMigrationVersion": "8.8.0", + "created_at": "2025-02-04T03:01:07.591Z", + "id": "nvidia_gpu-bc9ea2cd-816a-40a9-912e-a97c5781ba0c", + "managed": true, + "references": [], + "type": "tag", + "typeMigrationVersion": "8.0.0" +} \ No newline at end of file diff --git a/packages/nvidia_gpu/manifest.yml b/packages/nvidia_gpu/manifest.yml new file mode 100644 index 00000000000..d476eaad2e7 --- /dev/null +++ b/packages/nvidia_gpu/manifest.yml @@ -0,0 +1,34 @@ +format_version: 3.2.0 +name: nvidia_gpu +title: "NVIDIA GPU Monitoring" +version: 0.1.0 +source: + license: "Elastic-2.0" +description: "Monitor NVIDIA GPUs via NVIDIA Data Center GPU Manager" +type: integration +categories: + - 
cloud + - custom + - kubernetes + - os_system +conditions: + kibana: + version: "^8.13.0" + elastic: + subscription: "basic" +icons: + - src: /img/nvidia_logo.svg + title: NVIDIA Logo + size: 351x259 + type: image/svg+xml +policy_templates: + - name: nvidia_gpu + title: NVIDIA GPU Metrics + description: Monitor NVIDIA GPUs + inputs: + - type: prometheus/metrics + title: Collect NVIDIA GPU Metrics via Prometheus + description: Collects Prometheus Metrics from NVIDIA Datacenter GPU Manager for NVIDIA GPUs +owner: + github: elastic/integrations + type: elastic From ec8bb4ad6d3218923a75b73ec0b2eafbd1a4f767 Mon Sep 17 00:00:00 2001 From: William Easton Date: Thu, 13 Feb 2025 17:27:58 +0100 Subject: [PATCH 2/5] add codeowners --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 75a251d64e7..8f4765ca2d0 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -305,6 +305,7 @@ /packages/nginx @elastic/obs-infraobs-integrations /packages/nginx_ingress_controller @elastic/obs-cloudnative-monitoring /packages/nginx_ingress_controller_otel @elastic/obs-infraobs-integrations +/packages/nvidia_gpu @elastic/obs-infraobs-integrations /packages/o365 @elastic/security-service-integrations /packages/okta @elastic/security-service-integrations /packages/openai @elastic/obs-infraobs-integrations From 4aaef179b4824a6cfcaa051756959f80ec9bae26 Mon Sep 17 00:00:00 2001 From: William Easton Date: Tue, 18 Feb 2025 20:11:55 -0600 Subject: [PATCH 3/5] Set owner and add support for k8s-related labels --- .../elasticsearch/ingest_pipeline/default.yml | 12 ++++++++++++ .../data_stream/stats/fields/kubernetes.yml | 16 ++++++++++++++++ packages/nvidia_gpu/manifest.yml | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml diff --git a/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml
b/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml index 158d45cb96c..5a9e072b920 100644 --- a/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml +++ b/packages/nvidia_gpu/data_stream/stats/elasticsearch/ingest_pipeline/default.yml @@ -109,6 +109,18 @@ processors: field: prometheus.labels.err_msg target_field: gpu.error.message ignore_missing: true +- rename: + field: prometheus.labels.container + target_field: kubernetes.container.name + ignore_missing: true +- rename: + field: prometheus.labels.namespace + target_field: kubernetes.namespace + ignore_missing: true +- rename: + field: prometheus.labels.pod + target_field: kubernetes.pod.name + ignore_missing: true - rename: field: prometheus.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL.rate target_field: gpu.memory.errors.double_bit_persistent diff --git a/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml b/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml new file mode 100644 index 00000000000..f7ec4a92e0b --- /dev/null +++ b/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml @@ -0,0 +1,16 @@ +--- +- name: kubernetes + type: group + fields: + - name: pod.name + type: keyword + description: > + Kubernetes pod name + - name: container.name + type: keyword + description: > + Kubernetes container name + - name: namespace + type: keyword + description: > + Kubernetes namespace \ No newline at end of file diff --git a/packages/nvidia_gpu/manifest.yml b/packages/nvidia_gpu/manifest.yml index d476eaad2e7..56609a4331f 100644 --- a/packages/nvidia_gpu/manifest.yml +++ b/packages/nvidia_gpu/manifest.yml @@ -30,5 +30,5 @@ policy_templates: title: Collect NVIDIA GPU Metrics via Prometheus description: Collects Prometheus Metrics from NVIDIA Datacenter GPU Manager for NVIDIA GPUs owner: - github: elastic/integrations + github: elastic/obs-infraobs-integrations type: elastic From 58dae3b0e0dad1b60ac874018716cc31f4f76c04 Mon Sep 17 00:00:00 2001 From: 
William Easton Date: Wed, 19 Feb 2025 09:39:45 -0600 Subject: [PATCH 4/5] Updates from PR Feedback --- packages/nvidia_gpu/_dev/build/docs/README.md | 7 +- packages/nvidia_gpu/changelog.yml | 2 +- .../stats/agent/stream/stream.yml.hbs | 6 +- .../data_stream/stats/fields/fields.yml | 100 +++++++++++------- .../data_stream/stats/fields/kubernetes.yml | 6 +- .../nvidia_gpu/data_stream/stats/manifest.yml | 8 +- .../data_stream/stats/sample_event.json | 63 ++++++++++- packages/nvidia_gpu/docs/README.md | 98 ++++++++++++++--- packages/nvidia_gpu/manifest.yml | 3 +- 9 files changed, 226 insertions(+), 67 deletions(-) diff --git a/packages/nvidia_gpu/_dev/build/docs/README.md b/packages/nvidia_gpu/_dev/build/docs/README.md index e9abc7bd300..cf1610c0e20 100644 --- a/packages/nvidia_gpu/_dev/build/docs/README.md +++ b/packages/nvidia_gpu/_dev/build/docs/README.md @@ -12,7 +12,11 @@ Metric data streams collected by the Nvidia GPU Monitoring integration include ` You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. You can use our hosted Elasticsearch Service on Elastic Cloud, which is recommended, or self-manage the Elastic Stack on your own hardware. -You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default the DCGM exporter does not expose all available metrics. +You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default, the DCGM exporter does not expose all available metrics. To customize the list of exposed metrics, provide a CSV file listing the desired metrics.
For instructions on how to do this, review the dcgm-exporter documentation. + +If DCGM Exporter is configured to provide enrichment of Kubernetes data, the pod, namespace, and container information will be attached to the corresponding metrics. This is useful for monitoring and attributing GPU usage in Kubernetes environments. + +This integration has been tested with version 3.3.9 of the DCGM exporter. ## Setup @@ -21,6 +25,5 @@ For step-by-step instructions on how to set up an integration, see the When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. - {{event "stats"}} {{fields "stats"}} \ No newline at end of file diff --git a/packages/nvidia_gpu/changelog.yml b/packages/nvidia_gpu/changelog.yml index 0e2bf4ddc56..368e12d660c 100644 --- a/packages/nvidia_gpu/changelog.yml +++ b/packages/nvidia_gpu/changelog.yml @@ -3,4 +3,4 @@ changes: - description: Initial introduction of Nvidia GPU Monitoring type: enhancement - link: https://github.com/elastic/integrations/pull/11931 + link: https://github.com/elastic/integrations/pull/12768 diff --git a/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs b/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs index ea3337eace3..fd1a0e6721d 100644 --- a/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs +++ b/packages/nvidia_gpu/data_stream/stats/agent/stream/stream.yml.hbs @@ -16,10 +16,8 @@ metrics_filters.include: - {{this}} {{/each}} {{#if ssl.certificate_authorities}} -ssl.certificate_authorities: -{{#each ssl.certificate_authorities}} - - {{this}} -{{/each}} +ssl: + {{ssl}} {{/if}} {{#if processors}} processors: diff --git a/packages/nvidia_gpu/data_stream/stats/fields/fields.yml b/packages/nvidia_gpu/data_stream/stats/fields/fields.yml index 9999108ee60..665571acea7 100644 --- a/packages/nvidia_gpu/data_stream/stats/fields/fields.yml +++ 
b/packages/nvidia_gpu/data_stream/stats/fields/fields.yml @@ -7,6 +7,7 @@ fields: - name: decoder.utilization type: float + metric_type: gauge description: > Utilization of the decoder engine in the GPU. @@ -102,11 +103,13 @@ fields: - name: size.free type: long + metric_type: gauge description: > Free size of the framebuffer. - name: size.used type: long + metric_type: gauge description: > Used size of the framebuffer. @@ -120,56 +123,67 @@ fields: - name: size type: long + metric_type: gauge description: > Size of the GPU memory in MB. - name: used type: long + metric_type: gauge description: > Used size of the GPU memory in MB. - name: copy_utilization type: float - description: > - Utilization of the GPU memory copy engine. - - - name: errors.double_bit_persistent - type: long - metric_type: gauge - description: > - Double-bit persistent errors count for GPU memory. - - - name: errors.double_bit_volatile - type: long - metric_type: gauge - description: > - Double-bit volatile errors count for GPU memory. - - - name: errors.single_bit_persistent - type: long metric_type: gauge description: > - Single-bit persistent errors count for GPU memory. + Utilization of the GPU memory copy engine. - - name: errors.single_bit_volatile - type: long - metric_type: gauge + - name: errors + type: group description: > - Single-bit volatile errors count for GPU memory. + Fields related to error metrics. + + fields: + - name: double_bit_persistent + type: long + metric_type: gauge + description: > + Double-bit persistent errors count for GPU memory. + + - name: double_bit_volatile + type: long + metric_type: gauge + description: > + Double-bit volatile errors count for GPU memory. + + - name: single_bit_persistent + type: long + metric_type: gauge + description: > + Single-bit persistent errors count for GPU memory. + + - name: single_bit_volatile + type: long + metric_type: gauge + description: > + Single-bit volatile errors count for GPU memory. 
- name: frequency type: float + metric_type: gauge description: > Clock frequency of the GPU memory. - name: temperature type: float + metric_type: gauge description: > Temperature of the GPU memory. - name: nvlink.bandwidth.total type: long - metric_type: counter + metric_type: gauge description: > Total bandwidth of NVLink. @@ -186,15 +200,18 @@ - name: power.usage type: float + metric_type: gauge description: > Current power usage of the GPU in Watts. - name: streaming_multiprocessor.frequency type: float + metric_type: gauge description: > Frequency of the streaming multiprocessor. - name: temperature + metric_type: gauge type: float description: > Temperature of the GPU. @@ -240,30 +257,37 @@ - name: utilization type: float + metric_type: gauge description: > Overall utilization of the GPU. - name: prometheus type: group description: > - Fields related to Prometheus node metadata. + Fields related to Prometheus metadata. fields: - - name: node.hostname - type: keyword + - name: node + type: group description: > - Hostname of the Prometheus node. + Fields related to Prometheus node metadata. - dimension: true - - name: node.job - type: keyword - description: > - Job of the Prometheus node. + fields: + - name: hostname + type: keyword + description: > + Hostname of the Prometheus node. - dimension: true - - name: node.id - type: integer - description: > - ID of the Prometheus node. + dimension: true + - name: job + type: keyword + description: > + Job of the Prometheus node. - dimension: true + dimension: true + - name: id + type: integer + description: > + ID of the Prometheus node. 
+ + dimension: true diff --git a/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml b/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml index f7ec4a92e0b..5330dfb48f3 100644 --- a/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml +++ b/packages/nvidia_gpu/data_stream/stats/fields/kubernetes.yml @@ -6,11 +6,13 @@ type: keyword description: > Kubernetes pod name + - name: container.name type: keyword description: > Kubernetes container name + - name: namespace type: keyword - description: > - Kubernetes namespace \ No newline at end of file + description: >- + Kubernetes namespace diff --git a/packages/nvidia_gpu/data_stream/stats/manifest.yml b/packages/nvidia_gpu/data_stream/stats/manifest.yml index eeb1a3f507c..6be7293f5e7 100644 --- a/packages/nvidia_gpu/data_stream/stats/manifest.yml +++ b/packages/nvidia_gpu/data_stream/stats/manifest.yml @@ -35,9 +35,11 @@ streams: required: false show_user: false default: [] - - name: ssl.certificate_authorities - type: text - title: SSL Certificate Authorities + - name: ssl + type: yaml + title: SSL Configuration + description: > + Configure SSL for the Prometheus endpoint in YAML format. Use with caution as incorrect settings may cause issues with your configuration. 
multi: true required: false show_user: false diff --git a/packages/nvidia_gpu/data_stream/stats/sample_event.json b/packages/nvidia_gpu/data_stream/stats/sample_event.json index ebdf09a000f..9cd378c2e65 100644 --- a/packages/nvidia_gpu/data_stream/stats/sample_event.json +++ b/packages/nvidia_gpu/data_stream/stats/sample_event.json @@ -27,6 +27,67 @@ "ingested": "2025-02-04T03:58:16Z", "module": "prometheus" }, + "gpu": { + "decoder": { + "utilization": 0 + }, + "device": { + "brand": "GeForce", + "id": "0", + "info_rom": { + "oem_version": "1.1", + "version": "G001.0000.02.04" + }, + "model": "NVIDIA GeForce RTX 2060 SUPER", + "name": "nvidia0", + "uuid": "GPU-72ca939a-a640-eb0b-df2b-4ac1d7081736", + "vbios": { + "version": "90.06.44.00.2f" + } + }, + "driver": { + "nvml_version": "12.560.35.02", + "version": "560.94" + }, + "encoder": { + "utilization": 0 + }, + "energy": { + "total": 9333403 + }, + "framebuffer": { + "size": { + "free": 6990, + "used": 1015 + } + }, + "license": { + "vgpu": "0" + }, + "memory": { + "copy_utilization": 10, + "frequency": 405, + "temperature": 0 + }, + "nvlink": { + "bandwidth": { + "total": 0 + } + }, + "pci": { + "bus": { + "id": "00000000:01:00.0" + } + }, + "power": { + "usage": 19.131 + }, + "streaming_multiprocessor": { + "frequency": 375 + }, + "temperature": 43, + "utilization": 18 + }, "host": { "architecture": "aarch64", "containerized": false, @@ -61,4 +122,4 @@ "address": "http://192.168.0.238:9400/metrics", "type": "prometheus" } -} \ No newline at end of file +} diff --git a/packages/nvidia_gpu/docs/README.md b/packages/nvidia_gpu/docs/README.md index 323d61d4daa..537532d59c3 100644 --- a/packages/nvidia_gpu/docs/README.md +++ b/packages/nvidia_gpu/docs/README.md @@ -12,7 +12,11 @@ Metric data streams collected by the Nvidia GPU Monitoring integration include ` You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. 
You can use our hosted Elasticsearch Service on Elastic Cloud, which is recommended, or self-manage the Elastic Stack on your own hardware. -You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default the DCGM exporter does not expose all available metrics. +You need the NVIDIA Datacenter GPU Manager (DCGM) installed on your system (or exposed via a docker container with the GPU device mounted) to collect metrics from the NVIDIA GPUs. You can download the DCGM from the [NVIDIA website](https://developer.nvidia.com/dcgm). By default, the DCGM exporter does not expose all available metrics. To customize the list of exposed metrics, provide a CSV file listing the desired metrics. For instructions on how to do this, review the dcgm-exporter documentation. + +If DCGM Exporter is configured to provide enrichment of Kubernetes data, the pod, namespace, and container information will be attached to the corresponding metrics. This is useful for monitoring and attributing GPU usage in Kubernetes environments. + +This integration has been tested with version 3.3.9 of the DCGM exporter. ## Setup @@ -21,7 +25,6 @@ For step-by-step instructions on how to set up an integration, see the When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. 
- An example event for `stats` looks as following: ```json @@ -54,6 +57,67 @@ An example event for `stats` looks as following: "ingested": "2025-02-04T03:58:16Z", "module": "prometheus" }, + "gpu": { + "decoder": { + "utilization": 0 + }, + "device": { + "brand": "GeForce", + "id": "0", + "info_rom": { + "oem_version": "1.1", + "version": "G001.0000.02.04" + }, + "model": "NVIDIA GeForce RTX 2060 SUPER", + "name": "nvidia0", + "uuid": "GPU-72ca939a-a640-eb0b-df2b-4ac1d7081736", + "vbios": { + "version": "90.06.44.00.2f" + } + }, + "driver": { + "nvml_version": "12.560.35.02", + "version": "560.94" + }, + "encoder": { + "utilization": 0 + }, + "energy": { + "total": 9333403 + }, + "framebuffer": { + "size": { + "free": 6990, + "used": 1015 + } + }, + "license": { + "vgpu": "0" + }, + "memory": { + "copy_utilization": 10, + "frequency": 405, + "temperature": 0 + }, + "nvlink": { + "bandwidth": { + "total": 0 + } + }, + "pci": { + "bus": { + "id": "00000000:01:00.0" + } + }, + "power": { + "usage": 19.131 + }, + "streaming_multiprocessor": { + "frequency": 375 + }, + "temperature": 43, + "utilization": 18 + }, "host": { "architecture": "aarch64", "containerized": false, @@ -89,6 +153,7 @@ An example event for `stats` looks as following: "type": "prometheus" } } + ``` **Exported fields** @@ -98,7 +163,7 @@ An example event for `stats` looks as following: | data_stream.dataset | Data stream dataset. | constant_keyword | | | data_stream.namespace | Data stream namespace. | constant_keyword | | | data_stream.type | Data stream type. | constant_keyword | | -| gpu.decoder.utilization | Utilization of the decoder engine in the GPU. | float | | +| gpu.decoder.utilization | Utilization of the decoder engine in the GPU. | float | gauge | | gpu.device.brand | Brand of the GPU device. | keyword | | | gpu.device.id | ID of the GPU device. | keyword | | | gpu.device.info_rom.oem_version | OEM version of the info ROM. 
| keyword | | @@ -114,31 +179,34 @@ An example event for `stats` looks as following: | gpu.error.code | Specific Error code for the XID error on the GPU. | keyword | | | gpu.error.message | Specific Error message for the XID error on the. | keyword | | | gpu.error.xid | The eXerience ID of the error being reported by the GPU. | keyword | | -| gpu.framebuffer.size.free | Free size of the framebuffer. | long | | -| gpu.framebuffer.size.used | Used size of the framebuffer. | long | | +| gpu.framebuffer.size.free | Free size of the framebuffer. | long | gauge | +| gpu.framebuffer.size.used | Used size of the framebuffer. | long | gauge | | gpu.license.vgpu | License status related to vGPU. | keyword | | -| gpu.memory.copy_utilization | Utilization of the GPU memory copy engine. | float | | +| gpu.memory.copy_utilization | Utilization of the GPU memory copy engine. | float | gauge | | gpu.memory.errors.double_bit_persistent | Double-bit persistent errors count for GPU memory. | long | gauge | | gpu.memory.errors.double_bit_volatile | Double-bit volatile errors count for GPU memory. | long | gauge | | gpu.memory.errors.single_bit_persistent | Single-bit persistent errors count for GPU memory. | long | gauge | | gpu.memory.errors.single_bit_volatile | Single-bit volatile errors count for GPU memory. | long | gauge | -| gpu.memory.frequency | Clock frequency of the GPU memory. | float | | -| gpu.memory.size | Size of the GPU memory in MB. | long | | -| gpu.memory.temperature | Temperature of the GPU memory. | float | | -| gpu.memory.used | Used size of the GPU memory in MB. | long | | -| gpu.nvlink.bandwidth.total | Total bandwidth of NVLink. | long | counter | +| gpu.memory.frequency | Clock frequency of the GPU memory. | float | gauge | +| gpu.memory.size | Size of the GPU memory in MB. | long | gauge | +| gpu.memory.temperature | Temperature of the GPU memory. | float | gauge | +| gpu.memory.used | Used size of the GPU memory in MB. 
| long | gauge | +| gpu.nvlink.bandwidth.total | Total bandwidth of NVLink. | long | gauge | | gpu.pci.bus.id | Bus ID of the PCI device. | keyword | | | gpu.pcie.replay | Replay counter for the PCIe connection. | long | gauge | -| gpu.power.usage | Current power usage of the GPU in Watts. | float | | -| gpu.streaming_multiprocessor.frequency | Frequency of the streaming multiprocessor. | float | | -| gpu.temperature | Temperature of the GPU. | float | | +| gpu.power.usage | Current power usage of the GPU in Watts. | float | gauge | +| gpu.streaming_multiprocessor.frequency | Frequency of the streaming multiprocessor. | float | gauge | +| gpu.temperature | Temperature of the GPU. | float | gauge | | gpu.throttling.board_limit | Number of microseconds throttled due to Board limit. | float | gauge | | gpu.throttling.low_utilization | Number of microseconds throttled due to low utilization. | float | gauge | | gpu.throttling.power | Number of microseconds throttled due to power. | float | gauge | | gpu.throttling.reliability | Number of microseconds throttled due to reliability. | float | gauge | | gpu.throttling.sync_boost | Number of microseconds throttled due to Sync Boost. | float | gauge | | gpu.throttling.thermal | Number of microseconds throttled due to thermals. | float | gauge | -| gpu.utilization | Overall utilization of the GPU. | float | | +| gpu.utilization | Overall utilization of the GPU. | float | gauge | +| kubernetes.container.name | Kubernetes container name | keyword | | +| kubernetes.namespace | Kubernetes namespace | keyword | | +| kubernetes.pod.name | Kubernetes pod name | keyword | | | prometheus.node.hostname | Hostname of the Prometheus node. | keyword | | | prometheus.node.id | ID of the Prometheus node. | integer | | | prometheus.node.job | Job of the Prometheus node. 
| keyword | | diff --git a/packages/nvidia_gpu/manifest.yml b/packages/nvidia_gpu/manifest.yml index 56609a4331f..ccd0f6dce4f 100644 --- a/packages/nvidia_gpu/manifest.yml +++ b/packages/nvidia_gpu/manifest.yml @@ -8,12 +8,13 @@ description: "Monitor NVIDIA GPUs via NVIDIA Data Center GPU Manager" type: integration categories: - cloud + - observability - custom - kubernetes - os_system conditions: kibana: - version: "^8.13.0" + version: "^8.13.0 || ^9.0.0" elastic: subscription: "basic" icons: From 11d52631af95e22a957c102940d1bc3a2b69d8c4 Mon Sep 17 00:00:00 2001 From: William Easton Date: Wed, 19 Feb 2025 10:28:26 -0600 Subject: [PATCH 5/5] Small formatting updates --- packages/nvidia_gpu/_dev/build/docs/README.md | 11 +- .../nvidia_gpu/data_stream/stats/manifest.yml | 1 + .../data_stream/stats/sample_event.json | 228 +++++++++--------- packages/nvidia_gpu/docs/README.md | 12 +- 4 files changed, 127 insertions(+), 125 deletions(-) diff --git a/packages/nvidia_gpu/_dev/build/docs/README.md b/packages/nvidia_gpu/_dev/build/docs/README.md index cf1610c0e20..792360b95eb 100644 --- a/packages/nvidia_gpu/_dev/build/docs/README.md +++ b/packages/nvidia_gpu/_dev/build/docs/README.md @@ -2,11 +2,6 @@ Use the NVIDIA GPU Monitoring integration to monitor the health and performance of your NVIDIA GPUs. The integration collects metrics from the NVIDIA Datacenter GPU Manager and sends them to Elasticsearch. -## Data streams - -**stats** give you insight into the state of the NVIDIA GPUs. -Metric data streams collected by the Nvidia GPU Monitoring integration include `stats`. See more details in the [Metrics](#metrics-reference). - ## Requirements You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. @@ -25,5 +20,11 @@ For step-by-step instructions on how to set up an integration, see the When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. 
For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. +## Data streams + +**stats** give you insight into the state of the NVIDIA GPUs. +Metric data streams collected by the NVIDIA GPU Monitoring integration include `stats`. See more details in the [Metrics](#metrics-reference). + {{event "stats"}} + {{fields "stats"}} \ No newline at end of file diff --git a/packages/nvidia_gpu/data_stream/stats/manifest.yml b/packages/nvidia_gpu/data_stream/stats/manifest.yml index 6be7293f5e7..52e23ad9386 100644 --- a/packages/nvidia_gpu/data_stream/stats/manifest.yml +++ b/packages/nvidia_gpu/data_stream/stats/manifest.yml @@ -40,6 +40,7 @@ streams: title: SSL Configuration description: > Configure SSL for the Prometheus endpoint in YAML format. Use with caution as incorrect settings may cause issues with your configuration. + multi: true required: false show_user: false diff --git a/packages/nvidia_gpu/data_stream/stats/sample_event.json b/packages/nvidia_gpu/data_stream/stats/sample_event.json index 9cd378c2e65..779365d014c 100644 --- a/packages/nvidia_gpu/data_stream/stats/sample_event.json +++ b/packages/nvidia_gpu/data_stream/stats/sample_event.json @@ -1,125 +1,125 @@ { - "@timestamp": "2025-02-04T03:58:06.137Z", - "agent": { - "ephemeral_id": "33183a42-1f03-4d37-bf77-2a683c47eec1", - "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", - "name": "4b8c5ec8e940", - "type": "metricbeat", - "version": "8.16.1" + "@timestamp": "2025-02-04T03:58:06.137Z", + "agent": { + "ephemeral_id": "33183a42-1f03-4d37-bf77-2a683c47eec1", + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "name": "4b8c5ec8e940", + "type": "metricbeat", + "version": "8.16.1" + }, + "data_stream": { + "dataset": "nvidia_gpu.stats", + "namespace": "default", + "type": "metrics" + }, + "ecs": { + "version": "8.0.0" + }, + "elastic_agent": { + "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", + "snapshot": false, + "version": "8.16.1" + }, + "event": { + "agent_id_status": "verified", + "dataset": "nvidia_gpu.stats", 
+ "duration": 3729334, + "ingested": "2025-02-04T03:58:16Z", + "module": "prometheus" + }, + "gpu": { + "decoder": { + "utilization": 0 }, - "data_stream": { - "dataset": "nvidia_gpu.stats", - "namespace": "default", - "type": "metrics" + "device": { + "brand": "GeForce", + "id": "0", + "info_rom": { + "oem_version": "1.1", + "version": "G001.0000.02.04" + }, + "model": "NVIDIA GeForce RTX 2060 SUPER", + "name": "nvidia0", + "uuid": "GPU-72ca939a-a640-eb0b-df2b-4ac1d7081736", + "vbios": { + "version": "90.06.44.00.2f" + } }, - "ecs": { - "version": "8.0.0" + "driver": { + "nvml_version": "12.560.35.02", + "version": "560.94" }, - "elastic_agent": { - "id": "b6f2a8e1-c701-4a92-a1a2-3a9362ad4af7", - "snapshot": false, - "version": "8.16.1" + "encoder": { + "utilization": 0 }, - "event": { - "agent_id_status": "verified", - "dataset": "nvidia_gpu.stats", - "duration": 3729334, - "ingested": "2025-02-04T03:58:16Z", - "module": "prometheus" + "energy": { + "total": 9333403 }, - "gpu": { - "decoder": { - "utilization": 0 - }, - "device": { - "brand": "GeForce", - "id": "0", - "info_rom": { - "oem_version": "1.1", - "version": "G001.0000.02.04" - }, - "model": "NVIDIA GeForce RTX 2060 SUPER", - "name": "nvidia0", - "uuid": "GPU-72ca939a-a640-eb0b-df2b-4ac1d7081736", - "vbios": { - "version": "90.06.44.00.2f" - } - }, - "driver": { - "nvml_version": "12.560.35.02", - "version": "560.94" - }, - "encoder": { - "utilization": 0 - }, - "energy": { - "total": 9333403 - }, - "framebuffer": { - "size": { - "free": 6990, - "used": 1015 - } - }, - "license": { - "vgpu": "0" - }, - "memory": { - "copy_utilization": 10, - "frequency": 405, - "temperature": 0 - }, - "nvlink": { - "bandwidth": { - "total": 0 - } - }, - "pci": { - "bus": { - "id": "00000000:01:00.0" - } - }, - "power": { - "usage": 19.131 - }, - "streaming_multiprocessor": { - "frequency": 375 - }, - "temperature": 43, - "utilization": 18 + "framebuffer": { + "size": { + "free": 6990, + "used": 1015 + } }, - "host": { - 
"architecture": "aarch64", - "containerized": false, - "hostname": "4b8c5ec8e940", - "ip": "172.17.0.3", - "mac": "02-42-AC-11-00-03", - "name": "4b8c5ec8e940", - "os": { - "codename": "noble", - "family": "debian", - "kernel": "6.10.14-linuxkit", - "name": "Ubuntu", - "platform": "ubuntu", - "type": "linux", - "version": "24.04.1 LTS (Noble Numbat)" - } + "license": { + "vgpu": "0" }, - "metricset": { - "name": "collector", - "period": 10000 + "memory": { + "copy_utilization": 10, + "frequency": 405, + "temperature": 0 }, - "prometheus": { - "node": { - "job": "prometheus", - "name": "192.168.0.238:9400" - }, - "up": { - "value": 0 - } + "nvlink": { + "bandwidth": { + "total": 0 + } }, - "service": { - "address": "http://192.168.0.238:9400/metrics", - "type": "prometheus" + "pci": { + "bus": { + "id": "00000000:01:00.0" + } + }, + "power": { + "usage": 19.131 + }, + "streaming_multiprocessor": { + "frequency": 375 + }, + "temperature": 43, + "utilization": 18 + }, + "host": { + "architecture": "aarch64", + "containerized": false, + "hostname": "4b8c5ec8e940", + "ip": "172.17.0.3", + "mac": "02-42-AC-11-00-03", + "name": "4b8c5ec8e940", + "os": { + "codename": "noble", + "family": "debian", + "kernel": "6.10.14-linuxkit", + "name": "Ubuntu", + "platform": "ubuntu", + "type": "linux", + "version": "24.04.1 LTS (Noble Numbat)" + } + }, + "metricset": { + "name": "collector", + "period": 10000 + }, + "prometheus": { + "node": { + "job": "prometheus", + "name": "192.168.0.238:9400" + }, + "up": { + "value": 0 } -} + }, + "service": { + "address": "http://192.168.0.238:9400/metrics", + "type": "prometheus" + } +} \ No newline at end of file diff --git a/packages/nvidia_gpu/docs/README.md b/packages/nvidia_gpu/docs/README.md index 537532d59c3..44727328df6 100644 --- a/packages/nvidia_gpu/docs/README.md +++ b/packages/nvidia_gpu/docs/README.md @@ -2,11 +2,6 @@ Use the NVIDIA GPU Monitoring integration to monitor the health and performance of your NVIDIA GPUs. 
The integration collects metrics from the NVIDIA Datacenter GPU Manager and sends them to Elasticsearch. -## Data streams - -**stats** give you insight into the state of the NVIDIA GPUs. -Metric data streams collected by the Nvidia GPU Monitoring integration include `stats`. See more details in the [Metrics](#metrics-reference). - ## Requirements You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it. @@ -25,6 +20,11 @@ For step-by-step instructions on how to set up an integration, see the When running on Kubernetes, you can use ${env.NODE_NAME} to get the node name for use in the hosts field. For example: `hosts: http://${env.NODE_NAME}:9400/metrics`. +## Data streams + +**stats** give you insight into the state of the NVIDIA GPUs. +Metric data streams collected by the NVIDIA GPU Monitoring integration include `stats`. See more details in the [Metrics](#metrics-reference). + An example event for `stats` looks as following: ```json @@ -153,8 +153,8 @@ An example event for `stats` looks as following: "type": "prometheus" } } - ``` + **Exported fields** | Field | Description | Type | Metric Type |