diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index eaf17cdc7..06557bf8f 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -23,18 +23,17 @@ cd GenAIInfra/helm-charts/ helm dependency update chatqna export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" -export MODELNAME="Intel/neural-chat-7b-v3-3" -# If you would like to use the traditional UI, please change the image as well as the containerport within the values -# append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173" -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} +export MODELNAME="meta-llama/Meta-Llama-3-8B-Instruct" +# To use CPU with vLLM +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} +# To use Gaudi device with vLLM +#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml +# To use CPU with TGI +#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/cpu-tgi-values.yaml # To use Gaudi device with TGI #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml -# To use Gaudi device with vLLM -#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml -# To use Nvidia GPU +# To use Nvidia GPU with TGI #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml -# To include guardrail component in chatqna on Xeon with TGI -#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml # To include guardrail component in chatqna on Gaudi with TGI #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml ``` @@ -74,12 +73,13 @@ Open a browser to access `http://:${port}` to play with the ## Values -| Key | Type | Default | Description | -| ----------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------- | -| image.repository | string | `"opea/chatqna"` | | -| service.port | string | `"8888"` | | -| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.monitoring | bool | `false` | Enable usage metrics for the service components. See ../monitoring.md before enabling! 
| +| Key | Type | Default | Description | +| ----------------- | ------ | --------------------------------------- | -------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/chatqna"` | | +| service.port | string | `"8888"` | | +| tgi.LLM_MODEL_ID | string | `"meta-llama/Meta-Llama-3-8B-Instruct"` | Inference models for TGI | +| vllm.LLM_MODEL_ID | string | `"meta-llama/Meta-Llama-3-8B-Instruct"` | Inference models for vLLM | +| global.monitoring | bool | `false` | Enable usage metrics for the service components. See ../monitoring.md before enabling! | ## Troubleshooting diff --git a/helm-charts/chatqna/cpu-tgi-values.yaml b/helm-charts/chatqna/cpu-tgi-values.yaml new file mode 100644 index 000000000..f552e1d5b --- /dev/null +++ b/helm-charts/chatqna/cpu-tgi-values.yaml @@ -0,0 +1,112 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Override CPU resource request and probe timing values in specific subcharts +# +# RESOURCES +# +# Resource request matching actual resource usage (with enough slack) +# is important when service is scaled up, so that right amount of pods +# get scheduled to right nodes. +# +# Because resource usage depends on the used devices, model, data type +# and SW versions, and this top-level chart has overrides for them, +# resource requests need to be specified here too. +# +# To test service without resource request, use "resources: {}". +# +# PROBES +# +# Inferencing pods startup / warmup takes *much* longer on CPUs than +# with acceleration devices, and their responses are also slower, +# especially when node is running several instances of these services. +# +# Kubernetes restarting pod before its startup finishes, or not +# sending it queries because it's not in ready state due to slow +# readiness responses, does really NOT help in getting faster responses. +# +# => probe timings need to be increased when running on CPU. 
+ +vllm: + enabled: false +tgi: + enabled: true + # TODO: add Helm value also for TGI data type option: + # https://github.com/opea-project/GenAIExamples/issues/330 + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + + # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit: + #resources: + # limits: + # cpu: 8 + # memory: 70Gi + # requests: + # cpu: 6 + # memory: 65Gi + + livenessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + failureThreshold: 24 + timeoutSeconds: 4 + readinessProbe: + initialDelaySeconds: 16 + periodSeconds: 8 + timeoutSeconds: 4 + startupProbe: + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 180 + timeoutSeconds: 2 + +teirerank: + RERANK_MODEL_ID: "BAAI/bge-reranker-base" + + # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: + resources: + limits: + cpu: 4 + memory: 30Gi + requests: + cpu: 2 + memory: 25Gi + + livenessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + failureThreshold: 24 + timeoutSeconds: 4 + readinessProbe: + initialDelaySeconds: 8 + periodSeconds: 8 + timeoutSeconds: 4 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + +tei: + EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" + + # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: + resources: + limits: + cpu: 4 + memory: 4Gi + requests: + cpu: 2 + memory: 3Gi + + livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 + timeoutSeconds: 2 + readinessProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + startupProbe: + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 diff --git a/helm-charts/chatqna/cpu-values.yaml b/helm-charts/chatqna/cpu-values.yaml index b4c5ee5dd..86b68a921 100644 --- a/helm-charts/chatqna/cpu-values.yaml +++ b/helm-charts/chatqna/cpu-values.yaml @@ -1,109 +1,5 @@ -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Override CPU resource request and probe timing values in specific subcharts -# -# RESOURCES -# -# Resource request matching actual resource usage (with enough slack) -# is important when service is scaled up, so that right amount of pods -# get scheduled to right nodes. -# -# Because resource usage depends on the used devices, model, data type -# and SW versions, and this top-level chart has overrides for them, -# resource requests need to be specified here too. -# -# To test service without resource request, use "resources: {}". -# -# PROBES -# -# Inferencing pods startup / warmup takes *much* longer on CPUs than -# with acceleration devices, and their responses are also slower, -# especially when node is running several instances of these services. -# -# Kubernetes restarting pod before its startup finishes, or not -# sending it queries because it's not in ready state due to slow -# readiness responses, does really NOT help in getting faster responses. -# -# => probe timings need to be increased when running on CPU. 
- -tgi: - # TODO: add Helm value also for TGI data type option: - # https://github.com/opea-project/GenAIExamples/issues/330 - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 - - # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit: - resources: - limits: - cpu: 8 - memory: 70Gi - requests: - cpu: 6 - memory: 65Gi - - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 - readinessProbe: - initialDelaySeconds: 16 - periodSeconds: 8 - timeoutSeconds: 4 - startupProbe: - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 180 - timeoutSeconds: 2 - -teirerank: - RERANK_MODEL_ID: "BAAI/bge-reranker-base" - - # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model: - resources: - limits: - cpu: 4 - memory: 30Gi - requests: - cpu: 2 - memory: 25Gi - - livenessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - failureThreshold: 24 - timeoutSeconds: 4 - readinessProbe: - initialDelaySeconds: 8 - periodSeconds: 8 - timeoutSeconds: 4 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 120 - -tei: - EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" - - # Potentially suitable values for scaling CPU TEI 1.5 with BAAI/bge-base-en-v1.5 model: - resources: - limits: - cpu: 4 - memory: 4Gi - requests: - cpu: 2 - memory: 3Gi - - livenessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 24 - timeoutSeconds: 2 - readinessProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 2 - startupProbe: - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 120 +image: + repository: opea/chatqna diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-tgi-values.yaml similarity index 97% rename from helm-charts/chatqna/gaudi-values.yaml rename to helm-charts/chatqna/gaudi-tgi-values.yaml index 47df99fc4..d4da00c97 100644 --- a/helm-charts/chatqna/gaudi-values.yaml +++ b/helm-charts/chatqna/gaudi-tgi-values.yaml @@ -4,12 +4,15 @@ # Accelerate inferencing in heaviest components to improve performance # by overriding their subchart values +vllm: + enabled: false # TGI: largest bottleneck for ChatQnA tgi: + enabled: true accelDevice: "gaudi" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 diff --git a/helm-charts/chatqna/gaudi-vllm-values.yaml b/helm-charts/chatqna/gaudi-vllm-values.yaml index 6c1a44ebf..76eafae02 100644 --- a/helm-charts/chatqna/gaudi-vllm-values.yaml +++ b/helm-charts/chatqna/gaudi-vllm-values.yaml @@ -6,9 +6,9 @@ tgi: enabled: false - vllm: enabled: true + shmSize: 1Gi accelDevice: "gaudi" image: repository: opea/vllm-gaudi @@ -19,7 +19,7 @@ vllm: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - failureThreshold: 120 + failureThreshold: 180 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 @@ -39,7 +39,6 @@ vllm: "--max-seq_len-to-capture", "2048" ] - # Reranking: second largest bottleneck when reranking is in use # (i.e. 
query context docs have been uploaded with data-prep) # diff --git a/helm-charts/chatqna/guardrails-gaudi-values.yaml b/helm-charts/chatqna/guardrails-gaudi-values.yaml index aad83623d..8e8a491a0 100644 --- a/helm-charts/chatqna/guardrails-gaudi-values.yaml +++ b/helm-charts/chatqna/guardrails-gaudi-values.yaml @@ -44,17 +44,18 @@ teirerank: readinessProbe: timeoutSeconds: 1 -tgi: +tgi-guardrails: + enabled: true accelDevice: "gaudi" + LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" image: repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + tag: "2.3.1" resources: limits: habana.ai/gaudi: 1 - # higher limits are needed with extra input tokens added by rerank - MAX_INPUT_LENGTH: "2048" - MAX_TOTAL_TOKENS: "4096" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" OMPI_MCA_btl_vader_single_copy_mechanism: "none" ENABLE_HPU_GRAPH: "true" @@ -75,34 +76,37 @@ tgi: timeoutSeconds: 1 failureThreshold: 120 -tgi-guardrails: +tgi: + enabled: false +vllm: enabled: true + shmSize: 1Gi accelDevice: "gaudi" - LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" image: - repository: ghcr.io/huggingface/tgi-gaudi - tag: "2.0.6" + repository: opea/vllm-gaudi resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "1024" - MAX_TOTAL_TOKENS: "2048" - CUDA_GRAPHS: "" - OMPI_MCA_btl_vader_single_copy_mechanism: "none" - ENABLE_HPU_GRAPH: "true" - LIMIT_HPU_GRAPH: "true" - USE_FLASH_ATTENTION: "true" - FLASH_ATTENTION_RECOMPUTE: "true" - livenessProbe: + startupProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 + failureThreshold: 180 readinessProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - startupProbe: + livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 timeoutSeconds: 1 - failureThreshold: 120 + + PT_HPU_ENABLE_LAZY_COLLECTIVES: "true" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + + extraCmdArgs: [ + "--tensor-parallel-size", "1", + "--block-size", "128", + "--max-num-seqs", "256", + "--max-seq_len-to-capture", "2048" + ] diff --git a/helm-charts/chatqna/guardrails-values.yaml b/helm-charts/chatqna/guardrails-values.yaml deleted file mode 100644 index d37a41060..000000000 --- a/helm-charts/chatqna/guardrails-values.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -image: - repository: opea/chatqna-guardrails - -# guardrails related config -guardrails-usvc: - enabled: true - # SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi-guardrails" - SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" -tgi-guardrails: - enabled: true - LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" diff --git a/helm-charts/chatqna/nv-values.yaml b/helm-charts/chatqna/nv-values.yaml index 67c4e3ac1..93abd8952 100644 --- a/helm-charts/chatqna/nv-values.yaml +++ b/helm-charts/chatqna/nv-values.yaml @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 # To override values in subchart tgi +vllm: + enabled: false tgi: + enabled: true accelDevice: "nvidia" image: repository: ghcr.io/huggingface/text-generation-inference diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 35a243c81..a23ca2de6 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -64,11 +64,20 @@ autoscaling: # Optional subcharts enablement and subcharts settings overwritten # LLM choice, tgi by default. 
tgi: - enabled: true - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 -vllm: enabled: false - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct +vllm: + enabled: true + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + shmSize: 128Gi + VLLM_TORCH_PROFILER_DIR: "/tmp/vllm_profile" +data-prep: + DATAPREP_BACKEND: "REDIS" + INDEX_NAME: "rag-redis" +retriever-usvc: + RETRIEVER_BACKEND: "REDIS" + INDEX_NAME: "rag-redis" + # disable guardrails by default # See guardrails-values.yaml for guardrail related options @@ -86,22 +95,25 @@ redis-vector-db: # Microservice layer, disabled by default llm-uservice: enabled: false + TEXTGEN_BACKEND: "vLLM" + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct embedding-usvc: enabled: false + EMBEDDING_BACKEND: "TEI" reranking-usvc: enabled: false + RERANK_BACKEND: "TEI" nginx: service: type: NodePort -# If you would like to switch to traditional UI image # Uncomment the following lines -# chatqna-ui: -# image: -# repository: opea/chatqna-ui -# tag: "latest" -# containerPort: "5173" +chatqna-ui: + image: + repository: opea/chatqna-ui + tag: "latest" + containerPort: "5173" global: http_proxy: "" diff --git a/helm-charts/chatqna/withwrapper-values.yaml b/helm-charts/chatqna/withwrapper-values.yaml deleted file mode 100644 index 2874787be..000000000 --- a/helm-charts/chatqna/withwrapper-values.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -image: - repository: opea/chatqna-wrapper - -llm-uservice: - enabled: true -embedding-usvc: - enabled: true -reranking-usvc: - enabled: true diff --git a/helm-charts/common/ui/templates/tests/test-pod.yaml b/helm-charts/common/ui/templates/tests/test-pod.yaml index 5c320d599..192e09cbe 100644 --- a/helm-charts/common/ui/templates/tests/test-pod.yaml +++ b/helm-charts/common/ui/templates/tests/test-pod.yaml @@ -1,6 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +{{- if .Values.untilGenAIExampleIssue1441Fixed }} apiVersion: v1 kind: Pod metadata: @@ -25,3 +26,4 @@ spec: done; if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi restartPolicy: Never +{{- end }} diff --git a/helm-charts/common/ui/values.yaml b/helm-charts/common/ui/values.yaml index 168e14d37..dd4094761 100644 --- a/helm-charts/common/ui/values.yaml +++ b/helm-charts/common/ui/values.yaml @@ -99,13 +99,13 @@ affinity: {} BACKEND_SERVICE_ENDPOINT: "/v1/chatqna" # data preparation service URL, default to Mega data preparation service -DATAPREP_SERVICE_ENDPOINT: "/v1/dataprep" +DATAPREP_SERVICE_ENDPOINT: "/v1/dataprep/ingest" # data preparation get file service URL, default to Mega data preparation service -DATAPREP_GET_FILE_ENDPOINT: "/v1/dataprep/get_file" +DATAPREP_GET_FILE_ENDPOINT: "/v1/dataprep/get" # data preparation delete file service URL, default to Mega data preparation service -DATAPREP_DELETE_FILE_ENDPOINT: "/v1/dataprep/delete_file" +DATAPREP_DELETE_FILE_ENDPOINT: "/v1/dataprep/delete" global: # service account name to be shared with all parent/child charts. 
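
Note on the chatqna values.yaml change above: vLLM replaces TGI as the default LLM serving backend (tgi.enabled: false, vllm.enabled: true), the data-prep and retriever microservices are pinned to the REDIS backend, and the chatqna-ui image settings are now set by default rather than left commented out. A deployment that still wants TGI can use the new cpu-tgi-values.yaml (CPU) or gaudi-tgi-values.yaml (Gaudi) files, or pass a small override of its own. A minimal sketch, assuming a hypothetical local file my-tgi-override.yaml that is not part of this change:

    # my-tgi-override.yaml (hypothetical): switch the LLM backend back to TGI
    # now that vLLM is the chart default.
    vllm:
      enabled: false
    tgi:
      enabled: true
      LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct

This mirrors what cpu-tgi-values.yaml does; on CPU that file is the better starting point because it also relaxes the probe timings.
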
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml index 1ca8268ba..173363a36 100644 --- a/helm-charts/common/vllm/templates/configmap.yaml +++ b/helm-charts/common/vllm/templates/configmap.yaml @@ -34,3 +34,6 @@ data: {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }} OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}} {{- end }} + {{- if .Values.VLLM_TORCH_PROFILER_DIR }} + VLLM_TORCH_PROFILER_DIR: {{ .Values.VLLM_TORCH_PROFILER_DIR | quote }} + {{- end }} diff --git a/helm-charts/valuefiles.yaml b/helm-charts/valuefiles.yaml index d3aaa5322..d5c9cc37c 100644 --- a/helm-charts/valuefiles.yaml +++ b/helm-charts/valuefiles.yaml @@ -13,12 +13,11 @@ chatqna: dest_dir: ChatQnA/kubernetes/helm values: - cpu-values.yaml - - gaudi-values.yaml + - cpu-tgi-values.yaml - gaudi-vllm-values.yaml - - guardrails-values.yaml + - gaudi-tgi-values.yaml - guardrails-gaudi-values.yaml - norerank-values.yaml - - withwrapper-values.yaml agentqna: src_repo: GenAIInfra src_dir: helm-charts/agentqna
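
Usage note on the vllm subchart and values changes above: the vllm configmap now forwards VLLM_TORCH_PROFILER_DIR when it is set (the top-level chart defaults it to /tmp/vllm_profile), and the comments carried into cpu-tgi-values.yaml point out that CPU inferencing pods start slowly and that resource requests can be dropped with "resources: {}" while sizing a deployment. Further overrides can be layered on top with an extra -f file, where the last file passed wins on conflicts. A minimal sketch, assuming a hypothetical my-cpu-overrides.yaml with illustrative, untested numbers:

    # my-cpu-overrides.yaml (hypothetical): example overrides for the default
    # vLLM-on-CPU deployment; values are illustrative, not recommendations.
    vllm:
      VLLM_TORCH_PROFILER_DIR: ""   # empty => the configmap omits the profiler variable
      resources: {}                 # test without resource requests while measuring usage
      startupProbe:
        periodSeconds: 10           # poll less often and allow a longer CPU warmup
        failureThreshold: 360

Something like helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -f my-cpu-overrides.yaml would then apply these settings on top of the chart defaults.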