diff --git a/ai-ml/nvidia-triton-server/addons.tf b/ai-ml/nvidia-triton-server/addons.tf
index dd2b5167c..e76e7206b 100644
--- a/ai-ml/nvidia-triton-server/addons.tf
+++ b/ai-ml/nvidia-triton-server/addons.tf
@@ -138,6 +138,7 @@ module "eks_blueprints_addons" {
       })
     ]
     chart_version = "48.1.1"
+    namespace     = "monitoring"
     set_sensitive = [
       {
         name  = "grafana.adminPassword"
diff --git a/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml b/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml
index b3f88b348..73f61ab93 100644
--- a/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml
+++ b/ai-ml/nvidia-triton-server/helm-values/prometheus-adapter.yaml
@@ -12,3 +12,14 @@ rules:
       matches: "num_requests_running"
       as: ""
     metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)
+  - seriesQuery: 'nv_inference_queue_duration_us{namespace!="", pod!=""}'
+    resources:
+      overrides:
+        namespace:
+          resource: "namespace"
+        pod:
+          resource: "pod"
+    name:
+      matches: "nv_inference_queue_duration_us"
+      as: "nv_inference_queue_duration_ms"
+    metricsQuery: 'avg(rate(nv_inference_queue_duration_us{<<.LabelMatchers>>}[1m])/1000) by (<<.GroupBy>>)'
diff --git a/ai-ml/nvidia-triton-server/nvidia-triton-server.tf b/ai-ml/nvidia-triton-server/nvidia-triton-server.tf
index e99c3bf39..e2a76eccf 100644
--- a/ai-ml/nvidia-triton-server/nvidia-triton-server.tf
+++ b/ai-ml/nvidia-triton-server/nvidia-triton-server.tf
@@ -61,7 +61,17 @@ module "triton_server_vllm" {
       nodeSelector:
         NodeGroupType: g5-gpu-karpenter
         type: karpenter
-
+      hpa:
+        minReplicas: 1
+        maxReplicas: 5
+        metrics:
+          - type: Pods
+            pods:
+              metric:
+                name: nv_inference_queue_duration_ms
+              target:
+                type: AverageValue
+                averageValue: 10
       tolerations:
         - key: "nvidia.com/gpu"
           operator: "Exists"