From 6f81f39232e60d129e98241771396a101a41b65c Mon Sep 17 00:00:00 2001
From: Dolpher Du
Date: Wed, 22 Jan 2025 02:03:34 +0000
Subject: [PATCH] Switch agent to use vllm inference engine by default

Signed-off-by: Dolpher Du
---
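Deployment notes: with this change the agent components default to vLLM, and
TGI stays available through the new gaudi-tgi-values.yaml override files. A
minimal sketch of the two install modes (release name and the
global.HUGGINGFACEHUB_API_TOKEN value are illustrative placeholders, not part
of this patch):

    # default engine: vLLM on Gaudi
    helm install agentqna helm-charts/agentqna \
        -f helm-charts/agentqna/gaudi-values.yaml \
        --set global.HUGGINGFACEHUB_API_TOKEN=<your-hf-token>

    # opt back into TGI on Gaudi
    helm install agentqna helm-charts/agentqna \
        -f helm-charts/agentqna/gaudi-tgi-values.yaml \
        --set global.HUGGINGFACEHUB_API_TOKEN=<your-hf-token>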
 .github/workflows/_helm-e2e.yaml                   |  4 +-
 helm-charts/agentqna/gaudi-tgi-values.yaml         | 30 ++++++++++++
 helm-charts/agentqna/gaudi-values.yaml             | 26 +++-------
 helm-charts/agentqna/vllm-gaudi-values.yaml        | 16 -------
 .../common/agent/gaudi-tgi-values.yaml             | 24 ++++++++++
 helm-charts/common/agent/gaudi-values.yaml         | 33 ++-----
 .../common/agent/templates/configmap.yaml          |  2 +-
 helm-charts/common/agent/values.yaml               | 48 +++++++++++++++----
 8 files changed, 104 insertions(+), 79 deletions(-)
 create mode 100644 helm-charts/agentqna/gaudi-tgi-values.yaml
 delete mode 100644 helm-charts/agentqna/vllm-gaudi-values.yaml
 create mode 100644 helm-charts/common/agent/gaudi-tgi-values.yaml

diff --git a/.github/workflows/_helm-e2e.yaml b/.github/workflows/_helm-e2e.yaml
index 989018d3..a5a8b135 100644
--- a/.github/workflows/_helm-e2e.yaml
+++ b/.github/workflows/_helm-e2e.yaml
@@ -68,8 +68,8 @@ jobs:
         run: |
           CHART_NAME="${workload_path##*/}"
           echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV
-          echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
-          echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
+          echo "RELEASE_NAME=${CHART_NAME}$(date +%d%H%M%S)" >> $GITHUB_ENV
+          echo "NAMESPACE=${CHART_NAME}-$(date +%d%H%M%S)" >> $GITHUB_ENV
           echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
           echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
           echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV
diff --git a/helm-charts/agentqna/gaudi-tgi-values.yaml b/helm-charts/agentqna/gaudi-tgi-values.yaml
new file mode 100644
index 00000000..5aad1a44
--- /dev/null
+++ b/helm-charts/agentqna/gaudi-tgi-values.yaml
@@ -0,0 +1,30 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
diff --git a/helm-charts/agentqna/gaudi-values.yaml b/helm-charts/agentqna/gaudi-values.yaml
index 5aad1a44..2d171ea2 100644
--- a/helm-charts/agentqna/gaudi-values.yaml
+++ b/helm-charts/agentqna/gaudi-values.yaml
@@ -4,27 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  image:
+    repository: opea/vllm-gaudi
 supervisor:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 ragagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 sqlagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/helm-charts/agentqna/vllm-gaudi-values.yaml b/helm-charts/agentqna/vllm-gaudi-values.yaml
deleted file mode 100644
index 2d171ea2..00000000
--- a/helm-charts/agentqna/vllm-gaudi-values.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-# Accelerate inferencing in heaviest components to improve performance
-# by overriding their subchart values
-
-vllm:
-  enabled: true
-  image:
-    repository: opea/vllm-gaudi
-supervisor:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-ragagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
-sqlagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/helm-charts/common/agent/gaudi-tgi-values.yaml b/helm-charts/common/agent/gaudi-tgi-values.yaml
new file mode 100644
index 00000000..bd00f4b0
--- /dev/null
+++ b/helm-charts/common/agent/gaudi-tgi-values.yaml
@@ -0,0 +1,24 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+llm_endpoint_url: http://{{ .Release.Name }}-tgi
+llm_engine: tgi
diff --git a/helm-charts/common/agent/gaudi-values.yaml b/helm-charts/common/agent/gaudi-values.yaml
index babbfaa7..f5e8313b 100644
--- a/helm-charts/common/agent/gaudi-values.yaml
+++ b/helm-charts/common/agent/gaudi-values.yaml
@@ -4,35 +4,8 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    repository: opea/vllm-gaudi
+llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/helm-charts/common/agent/templates/configmap.yaml b/helm-charts/common/agent/templates/configmap.yaml
index aa3d872d..aaaa45c2 100644
--- a/helm-charts/common/agent/templates/configmap.yaml
+++ b/helm-charts/common/agent/templates/configmap.yaml
@@ -20,7 +20,7 @@ data:
   {{- if .Values.llm_endpoint_url }}
   llm_endpoint_url: {{ tpl .Values.llm_endpoint_url . | quote }}
   {{- else }}
-  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
+  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
   {{- end }}
   # {{- if .Values.port }}
   # port: {{ .Values.port | quote }}
diff --git a/helm-charts/common/agent/values.yaml b/helm-charts/common/agent/values.yaml
index 0668bb5e..a09a4142 100644
--- a/helm-charts/common/agent/values.yaml
+++ b/helm-charts/common/agent/values.yaml
@@ -7,27 +7,55 @@
 
 tgi:
   enabled: false
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 4
   LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  MAX_INPUT_LENGTH: 4096
+  MAX_TOTAL_TOKENS: 8192
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  ENABLE_HPU_GRAPH: true
+  LIMIT_HPU_GRAPH: true
+  USE_FLASH_ATTENTION: true
+  FLASH_ATTENTION_RECOMPUTE: true
+  extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
 
 vllm:
   enabled: false
-  LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 4
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  VLLM_SKIP_WARMUP: true
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]
 
 replicaCount: 1
+
+toolPath: "/mnt/tools"
+strategy: react_llama
+recursion_limit: 10
+llm_engine: vllm
 llm_endpoint_url: ""
 model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+temperature: "0.01"
 max_new_tokens: "4096"
-llm_engine: "tgi"
-strategy: "react_langchain"
-recursion_limit: "15"
-require_human_feedback: "false"
-
+stream: "false"
+tools: /home/user/tools/supervisor_agent_tools.yaml
+require_human_feedback: false
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
-LOGFLAG: "True"
+LOGFLAG: ""
 image:
   repository: opea/agent
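
A quick way to smoke-test the new configmap default without a cluster is to
render the agent chart with llm_endpoint_url left empty; the else branch
should now point at the vLLM service. A sketch, assuming helm 3 and a local
checkout (release name "agent" is illustrative, and the dependency update is
only needed if subcharts are not vendored):

    helm dependency update helm-charts/common/agent
    helm template agent helm-charts/common/agent | grep llm_endpoint_url
    # expected: llm_endpoint_url: "http://agent-vllm"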