Switch agent to use vllm inference engine by default
Signed-off-by: Dolpher Du <dolpher.du@intel.com>
yongfengdu committed Jan 22, 2025
1 parent 9c808f8 · commit 6f81f39
Showing 8 changed files with 104 additions and 79 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/_helm-e2e.yaml
@@ -68,8 +68,8 @@ jobs:
       run: |
         CHART_NAME="${workload_path##*/}"
         echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV
-        echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
-        echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
+        echo "RELEASE_NAME=${CHART_NAME}$(date +%d%H%M%S)" >> $GITHUB_ENV
+        echo "NAMESPACE=Infra-${CHART_NAME}-$(date +%d%H%M%S)" >> $GITHUB_ENV
         echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
         echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
         echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV
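Presumably the shorter %d%H%M%S stamp keeps the generated names inside the tight length limits on Helm release names and Kubernetes namespaces (53 and 63 characters), while the Infra- prefix makes CI-created namespaces easy to spot; the workflow itself does not state the motivation. A minimal bash sketch of the resulting names, using an example workload_path that is not taken from this diff:

# Reproduce the new naming scheme locally; the path and timestamps are examples.
workload_path="helm-charts/common/agent"
CHART_NAME="${workload_path##*/}"                       # -> agent
echo "RELEASE_NAME=${CHART_NAME}$(date +%d%H%M%S)"      # e.g. RELEASE_NAME=agent22143015
echo "NAMESPACE=Infra-${CHART_NAME}-$(date +%d%H%M%S)"  # e.g. NAMESPACE=Infra-agent-22143015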
30 changes: 30 additions & 0 deletions helm-charts/agentqna/gaudi-tgi-values.yaml
@@ -0,0 +1,30 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
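With periodSeconds: 5 and failureThreshold: 120, the startup probe above allows the TGI server up to 120 x 5 s = 600 s to download and load the model before Kubernetes restarts it, which matches the 600 s rollout timeout used in the CI workflow. A sketch for watching that window after a deploy; the deployment name agentqna-tgi is an assumption derived from the {{ .Release.Name }}-tgi endpoints in this file:

# Wait up to the full startup-probe window for the TGI deployment to become ready.
kubectl rollout status deployment/agentqna-tgi --timeout=600s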
26 changes: 6 additions & 20 deletions helm-charts/agentqna/gaudi-values.yaml
@@ -4,27 +4,13 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+  image:
+    repository: opea/vllm-gaudi
 supervisor:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 ragagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
 sqlagent:
-  llm_endpoint_url: http://{{ .Release.Name }}-tgi
-  llm_engine: tgi
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
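Together with the gaudi-tgi-values.yaml file above, this flips the Gaudi profile of AgentQnA to vLLM by default while keeping TGI one values file away. A hedged usage sketch, assuming the chart is installed from a repository checkout with a release name of agentqna (matching the {{ .Release.Name }} references):

# Default Gaudi profile now serves all three agents from opea/vllm-gaudi:
helm install agentqna ./helm-charts/agentqna -f ./helm-charts/agentqna/gaudi-values.yaml

# Opt back into TGI with the override file added by this commit:
helm install agentqna ./helm-charts/agentqna -f ./helm-charts/agentqna/gaudi-tgi-values.yaml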
16 changes: 0 additions & 16 deletions helm-charts/agentqna/vllm-gaudi-values.yaml

This file was deleted.

24 changes: 24 additions & 0 deletions helm-charts/common/agent/gaudi-tgi-values.yaml
@@ -0,0 +1,24 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: true
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+
+llm_endpoint_url: http://{{ .Release.Name }}-tgi
+llm_engine: tgi
33 changes: 3 additions & 30 deletions helm-charts/common/agent/gaudi-values.yaml
@@ -4,35 +4,8 @@
 # Accelerate inferencing in heaviest components to improve performance
 # by overriding their subchart values
 
-tgi:
+vllm:
   enabled: true
-  accelDevice: "gaudi"
   image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.3.1"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
-  livenessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  readinessProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-  startupProbe:
-    initialDelaySeconds: 5
-    periodSeconds: 5
-    timeoutSeconds: 1
-    failureThreshold: 120
+    repository: opea/vllm-gaudi
+llm_endpoint_url: http://{{ .Release.Name }}-vllm
2 changes: 1 addition & 1 deletion helm-charts/common/agent/templates/configmap.yaml
@@ -20,7 +20,7 @@ data:
   {{- if .Values.llm_endpoint_url }}
   llm_endpoint_url: {{ tpl .Values.llm_endpoint_url . | quote }}
   {{- else }}
-  llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
+  llm_endpoint_url: "http://{{ .Release.Name }}-vllm"
   {{- end }}
   # {{- if .Values.port }}
   # port: {{ .Values.port | quote }}
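Values such as llm_endpoint_url embed {{ .Release.Name }}, so the template pipes them through tpl before quoting; only the fallback changes here, pointing at the -vllm service instead of -tgi. A sketch for checking the rendered ConfigMap offline, assuming a release named agent and a chart path from a repository checkout:

# Render the agent chart locally and inspect the resolved endpoint.
helm template agent ./helm-charts/common/agent | grep llm_endpoint_url
# Expected when no llm_endpoint_url override is set:
#   llm_endpoint_url: "http://agent-vllm"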
48 changes: 38 additions & 10 deletions helm-charts/common/agent/values.yaml
@@ -7,27 +7,55 @@

 tgi:
   enabled: false
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.3.1"
+  resources:
+    limits:
+      habana.ai/gaudi: 4
   LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  MAX_INPUT_LENGTH: 4096
+  MAX_TOTAL_TOKENS: 8192
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  ENABLE_HPU_GRAPH: true
+  LIMIT_HPU_GRAPH: true
+  USE_FLASH_ATTENTION: true
+  FLASH_ATTENTION_RECOMPUTE: true
+  extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
 
 vllm:
   enabled: false
-  LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+  resources:
+    limits:
+      habana.ai/gaudi: 4
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  VLLM_SKIP_WARMUP: true
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]
 
 replicaCount: 1
 
 toolPath: "/mnt/tools"
+strategy: react_llama
+recursion_limit: 10
+llm_engine: vllm
+llm_endpoint_url: ""
 model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
 temperature: "0.01"
 max_new_tokens: "4096"
-llm_engine: "tgi"
-strategy: "react_langchain"
-recursion_limit: "15"
-require_human_feedback: "false"
 
 stream: "false"
 tools: /home/user/tools/supervisor_agent_tools.yaml
+require_human_feedback: false
+# Set it as a non-null string, such as true, if you want to enable logging facility,
+# otherwise, keep it as "" to disable it.
-LOGFLAG: "True"
+LOGFLAG: ""
 
 image:
   repository: opea/agent
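The new defaults pair llm_engine: vllm with an empty llm_endpoint_url, so the ConfigMap fallback above resolves to the release's -vllm service; both engines now carry their full Gaudi settings in values.yaml but stay disabled until a profile enables one. A sketch of enabling vLLM on Gaudi and turning the LOGFLAG debug switch back on, with the chart path and release name assumed:

# Enable the vLLM engine on Gaudi; add LOGFLAG=True only when debugging.
helm install agent ./helm-charts/common/agent \
  -f ./helm-charts/common/agent/gaudi-values.yaml \
  --set LOGFLAG=True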
