diff --git a/helm-charts/agentqna/Chart.yaml b/helm-charts/agentqna/Chart.yaml
index ba92205e8..81aa6b5af 100644
--- a/helm-charts/agentqna/Chart.yaml
+++ b/helm-charts/agentqna/Chart.yaml
@@ -8,7 +8,11 @@ type: application
 dependencies:
   - name: agent
     version: 0-latest
-    alias: worker
+    alias: ragagent
+    repository: "file://../common/agent"
+  - name: agent
+    version: 0-latest
+    alias: sqlagent
     repository: "file://../common/agent"
   - name: agent
     version: 0-latest
@@ -18,6 +22,10 @@ dependencies:
     version: 0-latest
     repository: "file://../common/tgi"
     condition: tgi.enabled
+  - name: vllm
+    version: 0-latest
+    repository: "file://../common/vllm"
+    condition: vllm.enabled
   - name: tei
     version: 0-latest
     repository: "file://../common/tei"
diff --git a/helm-charts/agentqna/gaudi-values.yaml b/helm-charts/agentqna/gaudi-values.yaml
index 91ef5d102..5aad1a44d 100644
--- a/helm-charts/agentqna/gaudi-values.yaml
+++ b/helm-charts/agentqna/gaudi-values.yaml
@@ -6,23 +6,6 @@

 tgi:
   enabled: true
-  accelDevice: "gaudi"
-  image:
-    repository: ghcr.io/huggingface/tgi-gaudi
-    tag: "2.0.6"
-  resources:
-    limits:
-      habana.ai/gaudi: 4
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
-  CUDA_GRAPHS: ""
-  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
-  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
-  ENABLE_HPU_GRAPH: "true"
-  LIMIT_HPU_GRAPH: "true"
-  USE_FLASH_ATTENTION: "true"
-  FLASH_ATTENTION_RECOMPUTE: "true"
-  extraCmdArgs: ["--sharded","true","--num-shard","4"]
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
@@ -36,3 +19,12 @@ tgi:
     periodSeconds: 5
     timeoutSeconds: 1
     failureThreshold: 120
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-tgi
+  llm_engine: tgi
diff --git a/helm-charts/agentqna/templates/tests/test-pod.yaml b/helm-charts/agentqna/templates/tests/test-pod.yaml
index 48f6c09aa..5ed8d9fdd 100644
--- a/helm-charts/agentqna/templates/tests/test-pod.yaml
+++ b/helm-charts/agentqna/templates/tests/test-pod.yaml
@@ -17,14 +17,54 @@ spec:
       command: ['bash', '-c']
       args:
         - |
+          # Ingest data
+          cd /mnt/tools
+          pip install requests tqdm
+          ./ingest_data.sh
+          # Test ragagent
+          max_retry=10;
+          for ((i=1; i<=max_retry; i++)); do
+            curl http://{{ include "agentqna.fullname" (index .Subcharts "ragagent") }}:{{ .Values.ragagent.service.port }}/v1/chat/completions -sS --fail-with-body \
+              -X POST \
+              -d '{"messages": "Tell me about Michael Jackson song Thriller"}' \
+              -H 'Content-Type: application/json' && break;
+            curlcode=$?
+            if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
+          done;
+          if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
+          # Test sqlagent
+          max_retry=10;
+          for ((i=1; i<=max_retry; i++)); do
+            curl http://{{ include "agentqna.fullname" (index .Subcharts "sqlagent") }}:{{ .Values.sqlagent.service.port }}/v1/chat/completions -sS --fail-with-body \
+              -X POST \
+              -d '{"messages": "How many employees are there in the company?"}' \
+              -H 'Content-Type: application/json' && break;
+            curlcode=$?
+            if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
+          done;
+          if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
+          # Test supervisor
           max_retry=20;
           for ((i=1; i<=max_retry; i++)); do
             curl http://{{ include "agentqna.fullname" (index .Subcharts "supervisor") }}:{{ .Values.supervisor.service.port }}/v1/chat/completions -sS --fail-with-body \
               -X POST \
-              -d '{"query": "Tell me three songs of Michael Jackson."}' \
+              -d '{"messages": "How many albums does Iron Maiden have?"}' \
               -H 'Content-Type: application/json' && break;
             curlcode=$?
             if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
           done;
           if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
+      volumeMounts:
+        - mountPath: /mnt/tools
+          name: test
+  volumes:
+    - name: test
+      hostPath:
+        path: /mnt/tools
+        type: Directory
+  {{- with .Values.nodeSelector }}
+  nodeSelector:
+    {{- toYaml . | nindent 8 }}
+  {{- end }}
+  restartPolicy: Never
diff --git a/helm-charts/agentqna/values.yaml b/helm-charts/agentqna/values.yaml
index cba1128dc..d9e63d06d 100644
--- a/helm-charts/agentqna/values.yaml
+++ b/helm-charts/agentqna/values.yaml
@@ -57,13 +57,30 @@ docretriever:
     # Overrides the image tag whose default is the chart appVersion.
     tag: "latest"

-worker:
+sqlagent:
+  DBPath: "/mnt/tools"
+  db_name: "Chinook"
+  db_path: "sqlite:////home/user/chinook-db/Chinook_Sqlite.sqlite"
+  service:
+    port: 9096
+  strategy: sql_agent_llama
+  use_hints: "false"
+  recursion_limit: "6"
+  llm_engine: vllm
+  llm_endpoint_url: ""
+  model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  temperature: "0.01"
+  max_new_tokens: "4096"
+  stream: "false"
+  require_human_feedback: "false"
+
+ragagent:
   toolPath: "/mnt/tools"
   service:
     port: 9095
   strategy: rag_agent_llama
-  recursion_limit: "12"
-  llm_engine: tgi
+  recursion_limit: "6"
+  llm_engine: vllm
   llm_endpoint_url: ""
   model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
   temperature: "0.01"
@@ -79,7 +96,7 @@ supervisor:
     port: 9090
   strategy: react_llama
   recursion_limit: 10
-  llm_engine: tgi
+  llm_engine: vllm
   llm_endpoint_url: ""
   model: "meta-llama/Meta-Llama-3.1-70B-Instruct"
   temperature: "0.01"
@@ -89,6 +106,7 @@ supervisor:
   require_human_feedback: false
   CRAG_SERVER: ""
   WORKER_AGENT_URL: ""
+  SQL_AGENT_URL: ""

 crag:
   image:
@@ -101,9 +119,39 @@ crag:
 # Override values in specific subcharts
 tgi:
   enabled: false
+  accelDevice: "gaudi"
+  image:
+    repository: ghcr.io/huggingface/tgi-gaudi
+    tag: "2.0.6"
+  resources:
+    limits:
+      habana.ai/gaudi: 4
+  LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
+  MAX_INPUT_LENGTH: 4096
+  MAX_TOTAL_TOKENS: 8192
+  CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  ENABLE_HPU_GRAPH: true
+  LIMIT_HPU_GRAPH: true
+  USE_FLASH_ATTENTION: true
+  FLASH_ATTENTION_RECOMPUTE: true
+  extraCmdArgs: ["--sharded", "true", "--num-shard", "4"]
+
+vllm:
+  enabled: false
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  resources:
+    limits:
+      habana.ai/gaudi: 4
   LLM_MODEL_ID: "meta-llama/Meta-Llama-3.1-70B-Instruct"
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: true
+  VLLM_SKIP_WARMUP: true
+  extraCmdArgs: ["--tensor-parallel-size", "4", "--max-seq_len-to-capture", "16384"]

 global:
   http_proxy: ""
diff --git a/helm-charts/agentqna/vllm-gaudi-values.yaml b/helm-charts/agentqna/vllm-gaudi-values.yaml
new file mode 100644
index 000000000..2d171ea22
--- /dev/null
+++ b/helm-charts/agentqna/vllm-gaudi-values.yaml
@@ -0,0 +1,16 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+vllm:
+  enabled: true
+  image:
+    repository: opea/vllm-gaudi
+supervisor:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+ragagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
+sqlagent:
+  llm_endpoint_url: http://{{ .Release.Name }}-vllm
diff --git a/helm-charts/common/agent/templates/configmap.yaml b/helm-charts/common/agent/templates/configmap.yaml
index 0fa0bb437..6d41750dd 100644
--- a/helm-charts/common/agent/templates/configmap.yaml
+++ b/helm-charts/common/agent/templates/configmap.yaml
@@ -8,11 +8,17 @@ metadata:
   labels:
     {{- include "agent.labels" . | nindent 4 }}
 data:
+  {{- if .Values.db_name }}
+  db_name: {{ .Values.db_name | quote }}
+  {{- end }}
+  {{- if .Values.db_path }}
+  db_path: {{ .Values.db_path | quote }}
+  {{- end }}
   {{- if .Values.tools }}
   tools: {{ .Values.tools | quote }}
   {{- end }}
   {{- if .Values.llm_endpoint_url }}
-  llm_endpoint_url: {{ .Values.llm_endpoint_url | quote }}
+  llm_endpoint_url: {{ tpl .Values.llm_endpoint_url . | quote }}
   {{- else }}
   llm_endpoint_url: "http://{{ .Release.Name }}-tgi"
   {{- end }}
@@ -41,12 +47,18 @@ data:
   {{- if .Values.WORKER_AGENT_URL }}
   WORKER_AGENT_URL: {{ .Values.WORKER_AGENT_URL | quote }}
   {{- else }}
-  WORKER_AGENT_URL: "http://{{ .Release.Name }}-worker:9095/v1/chat/completions"
+  WORKER_AGENT_URL: "http://{{ .Release.Name }}-rag-agent:9095/v1/chat/completions"
+  {{- end }}
+  {{- if .Values.SQL_AGENT_URL }}
+  SQL_AGENT_URL: {{ .Values.SQL_AGENT_URL | quote }}
+  {{- else }}
+  SQL_AGENT_URL: "http://{{ .Release.Name }}-sql-agent:9096/v1/chat/completions"
   {{- end }}
   require_human_feedback: {{ .Values.require_human_feedback | quote }}
   recursion_limit: {{ .Values.recursion_limit | quote }}
   llm_engine: {{ .Values.llm_engine | quote }}
   strategy: {{ .Values.strategy | quote }}
+  use_hints: {{ .Values.use_hints | quote }}
   max_new_tokens: {{ .Values.max_new_tokens | quote }}
   {{- if .Values.OPENAI_API_KEY }}
   OPENAI_API_KEY: {{ .Values.OPENAI_API_KEY | quote }}
diff --git a/helm-charts/common/agent/templates/deployment.yaml b/helm-charts/common/agent/templates/deployment.yaml
index dffe5ceca..c4b213270 100644
--- a/helm-charts/common/agent/templates/deployment.yaml
+++ b/helm-charts/common/agent/templates/deployment.yaml
@@ -49,6 +49,10 @@ spec:
               containerPort: 9090
               protocol: TCP
           volumeMounts:
+            {{- if .Values.DBPath }}
+            - mountPath: /home/user/chinook-db
+              name: dbpath
+            {{- end }}
             {{- if .Values.toolPath }}
             - mountPath: /home/user/tools
               name: tool
@@ -70,6 +74,12 @@ spec:
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
       volumes:
+        {{- if .Values.DBPath }}
+        - name: dbpath
+          hostPath:
+            path: {{ .Values.DBPath }}
+            type: Directory
+        {{- end }}
         {{- if .Values.toolPath }}
        - name: tool
          hostPath:
diff --git a/helm-charts/common/agent/templates/tests/test-pod.yaml b/helm-charts/common/agent/templates/tests/test-pod.yaml
index 3dc4a2353..9c65831bb 100644
--- a/helm-charts/common/agent/templates/tests/test-pod.yaml
+++ b/helm-charts/common/agent/templates/tests/test-pod.yaml
@@ -17,6 +17,10 @@ spec:
       command: ['bash', '-c']
       args:
         - |
+          if [ {{ include "agent.fullname" . }} != "agent" ]; then
+            echo "Skip test."
+            exit 0
+          fi
           max_retry=20;
           for ((i=1; i<=max_retry; i++)); do
             curl http://{{ include "agent.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml
index 5fbbf6b79..1ca8268ba 100644
--- a/helm-charts/common/vllm/templates/configmap.yaml
+++ b/helm-charts/common/vllm/templates/configmap.yaml
@@ -25,6 +25,9 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.VLLM_SKIP_WARMUP }}
+  VLLM_SKIP_WARMUP: {{ .Values.VLLM_SKIP_WARMUP | quote }}
+  {{- end }}
   {{- if .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES }}
   PT_HPU_ENABLE_LAZY_COLLECTIVES: {{ .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES | quote }}
   {{- end }}
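For reference (not part of the diff): a minimal sketch of how the new vLLM path could be exercised on a Gaudi node. The release name and HF_TOKEN variable are illustrative, and global.HUGGINGFACEHUB_API_TOKEN is assumed to be the token setting the common subcharts already expose; the sqlagent and the chart test pod also expect the agent tools and the Chinook SQLite file to be present under /mnt/tools on the node, per the hostPath values above.

    # update the file:// subchart dependencies, then install with the vLLM override file
    helm dependency update helm-charts/agentqna
    helm install agentqna helm-charts/agentqna \
      --set global.HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} \
      -f helm-charts/agentqna/vllm-gaudi-values.yaml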