Switch agent to use vllm inference engine by default
Signed-off-by: Dolpher Du <dolpher.du@intel.com>
yongfengdu committed Jan 22, 2025
1 parent 9c808f8 · commit 6f81f39
Showing 8 changed files with 104 additions and 79 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/_helm-e2e.yaml
@@ -68,8 +68,8 @@ jobs:
       run: |
         CHART_NAME="${workload_path##*/}"
         echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV
-        echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
-        echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
+        echo "RELEASE_NAME=${CHART_NAME}$(date +%d%H%M%S)" >> $GITHUB_ENV
+        echo "NAMESPACE=Infra-${CHART_NAME}-$(date +%d%H%M%S)" >> $GITHUB_ENV
         echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
         echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
         echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV
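Presumably the shorter %d%H%M%S stamp keeps the generated names inside the tight length limits on Helm release names and Kubernetes namespaces (53 and 63 characters), while the Infra- prefix makes CI-created namespaces easy to spot; the workflow itself does not state the motivation. A minimal bash sketch of the resulting names, using an example workload_path that is not taken from this diff:

# Reproduce the new naming scheme locally; the path and timestamps are examples.
workload_path="helm-charts/common/agent"
CHART_NAME="${workload_path##*/}"                       # -> agent
echo "RELEASE_NAME=${CHART_NAME}$(date +%d%H%M%S)"      # e.g. RELEASE_NAME=agent22143015
echo "NAMESPACE=Infra-${CHART_NAME}-$(date +%d%H%M%S)"  # e.g. NAMESPACE=Infra-agent-22143015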
30 changes: 30 additions & 0 deletions helm-charts/agentqna/gaudi-tgi-values.yaml
@@ -0,0 +1,30 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
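With periodSeconds: 5 and failureThreshold: 120, the startup probe above allows the TGI server up to 120 x 5 s = 600 s to download and load the model before Kubernetes restarts it, which matches the 600 s rollout timeout used in the CI workflow. A sketch for watching that window after a deploy; the deployment name agentqna-tgi is an assumption derived from the {{ .Release.Name }}-tgi endpoints in this file:

# Wait up to the full startup-probe window for the TGI deployment to become ready.
kubectl rollout status deployment/agentqna-tgi --timeout=600s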
26 changes: 6 additions & 20 deletions helm-charts/agentqna/gaudi-values.yaml
@@ -4,27 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  image:
+    repository: opea/vllm-gaudi
 supervisor:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 ragagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 sqlagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
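Together with the gaudi-tgi-values.yaml file above, this flips the Gaudi profile of AgentQnA to vLLM by default while keeping TGI one values file away. A hedged usage sketch, assuming the chart is installed from a repository checkout with a release name of agentqna (matching the {{ .Release.Name }} references):

# Default Gaudi profile now serves all three agents from opea/vllm-gaudi:
helm install agentqna ./helm-charts/agentqna -f ./helm-charts/agentqna/gaudi-values.yaml

# Opt back into TGI with the override file added by this commit:
helm install agentqna ./helm-charts/agentqna -f ./helm-charts/agentqna/gaudi-tgi-values.yaml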
16 changes: 0 additions & 16 deletions helm-charts/agentqna/vllm-gaudi-values.yaml

This file was deleted.

24 changes: 24 additions & 0 deletions helm-charts/common/agent/gaudi-tgi-values.yaml
@@ -0,0 +1,24 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+llm_endpoint_url: http://{{ .Release.Name }}-tgi
+llm_engine: tgi
33 changes: 3 additions & 30 deletions helm-charts/common/agent/gaudi-values.yaml
@@ -4,35 +4,8 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    repository: opea/vllm-gaudi
+llm_endpoint_url: http://{{ .Release.Name }}-vllm
2 changes: 1 addition & 1 deletion helm-charts/common/agent/templates/configmap.yaml
@@ -20,7 +20,7 @@ data:
   {{- if .Values.llm_endpoint_url }}
   llm_endpoint_url: {{ tpl .Values.llm_endpoint_url . | quote }}
   {{- else }}
-  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
+  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
   {{- end }}
   # {{- if .Values.port }}
   # port: {{ .Values.port | quote }}
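Values such as llm_endpoint_url embed {{ .Release.Name }}, so the template pipes them through tpl before quoting; only the fallback changes here, pointing at the -vllm service instead of -tgi. A sketch for checking the rendered ConfigMap offline, assuming a release named agent and a chart path from a repository checkout:

# Render the agent chart locally and inspect the resolved endpoint.
helm template agent ./helm-charts/common/agent | grep llm_endpoint_url
# Expected when no llm_endpoint_url override is set:
#   llm_endpoint_url: "http://agent-vllm"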
48 changes: 38 additions & 10 deletions helm-charts/common/agent/values.yaml
@@ -7,27 +7,55 @@

 tgi:
   enabled: false
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 4
   LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  MAX_INPUT_LENGTH: 4096
+  MAX_TOTAL_TOKENS: 8192
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  ENABLE_HPU_GRAPH: true
+  LIMIT_HPU_GRAPH: true
+  USE_FLASH_ATTENTION: true
+  FLASH_ATTENTION_RECOMPUTE: true
+  extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
 
 vllm:
   enabled: false
-  LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 4
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  VLLM_SKIP_WARMUP: true
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]
 
 replicaCount: 1
 
 toolPath: "/mnt/tools"
+strategy: react_llama
+recursion_limit: 10
+llm_engine: vllm
+llm_endpoint_url: ""
 model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
 temperature: "0.01"
 max_new_tokens: "4096"
-llm_engine: "tgi"
-strategy: "react_langchain"
-recursion_limit: "15"
-require_human_feedback: "false"
 
 stream: "false"
 tools: /home/user/tools/supervisor_agent_tools.yaml
+require_human_feedback: false
+# Set it as a non-null string, such as true, if you want to enable logging facility,
+# otherwise, keep it as "" to disable it.
-LOGFLAG: "True"
+LOGFLAG: ""
 
 image:
   repository: opea/agent
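The new defaults pair llm_engine: vllm with an empty llm_endpoint_url, so the ConfigMap fallback above resolves to the release's -vllm service; both engines now carry their full Gaudi settings in values.yaml but stay disabled until a profile enables one. A sketch of enabling vLLM on Gaudi and turning the LOGFLAG debug switch back on, with the chart path and release name assumed:

# Enable the vLLM engine on Gaudi; add LOGFLAG=True only when debugging.
helm install agent ./helm-charts/common/agent \
  -f ./helm-charts/common/agent/gaudi-values.yaml \
  --set LOGFLAG=True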
