diff --git a/helm-charts/codegen/gaudi-values.yaml b/helm-charts/codegen/gaudi-values.yaml
index e26bb4a5..ce4dad64 100644
--- a/helm-charts/codegen/gaudi-values.yaml
+++ b/helm-charts/codegen/gaudi-values.yaml
@@ -13,6 +13,11 @@ tgi:
   MAX_INPUT_LENGTH: "1024"
   MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
diff --git a/helm-charts/codegen/values.yaml b/helm-charts/codegen/values.yaml
index de9e2a4b..36bf1917 100644
--- a/helm-charts/codegen/values.yaml
+++ b/helm-charts/codegen/values.yaml
@@ -60,6 +60,9 @@ affinity: {}
 tgi:
   LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
 
+llm-uservice:
+  LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
+
 nginx:
   service:
     type: NodePort
diff --git a/helm-charts/codetrans/gaudi-values.yaml b/helm-charts/codetrans/gaudi-values.yaml
index e5367383..2739ad08 100644
--- a/helm-charts/codetrans/gaudi-values.yaml
+++ b/helm-charts/codetrans/gaudi-values.yaml
@@ -12,6 +12,11 @@ tgi:
   MAX_INPUT_LENGTH: "1024"
   MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
diff --git a/helm-charts/codetrans/values.yaml b/helm-charts/codetrans/values.yaml
index ba39428f..77142681 100644
--- a/helm-charts/codetrans/values.yaml
+++ b/helm-charts/codetrans/values.yaml
@@ -61,6 +61,9 @@ affinity: {}
 tgi:
   LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
 
+llm-uservice:
+  LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
+
 nginx:
   service:
     type: NodePort
diff --git a/helm-charts/common/tgi/templates/configmap.yaml b/helm-charts/common/tgi/templates/configmap.yaml
index 0b738587..3e6f46ea 100644
--- a/helm-charts/common/tgi/templates/configmap.yaml
+++ b/helm-charts/common/tgi/templates/configmap.yaml
@@ -57,3 +57,9 @@ data:
   {{- if .Values.FLASH_ATTENTION_RECOMPUTE }}
   FLASH_ATTENTION_RECOMPUTE: {{ .Values.FLASH_ATTENTION_RECOMPUTE | quote }}
   {{- end }}
+  {{- if .Values.PREFILL_BATCH_BUCKET_SIZE }}
+  PREFILL_BATCH_BUCKET_SIZE: {{ .Values.PREFILL_BATCH_BUCKET_SIZE | quote }}
+  {{- end }}
+  {{- if .Values.BATCH_BUCKET_SIZE }}
+  BATCH_BUCKET_SIZE: {{ .Values.BATCH_BUCKET_SIZE | quote }}
+  {{- end }}
diff --git a/helm-charts/faqgen/gaudi-values.yaml b/helm-charts/faqgen/gaudi-values.yaml
index d14729c4..e681c5f3 100644
--- a/helm-charts/faqgen/gaudi-values.yaml
+++ b/helm-charts/faqgen/gaudi-values.yaml
@@ -9,9 +9,21 @@ tgi:
   resources:
     limits:
       habana.ai/gaudi: 1
-  MAX_INPUT_LENGTH: "4096"
-  MAX_TOTAL_TOKENS: "8192"
+  MAX_INPUT_LENGTH: "1024"
+  MAX_TOTAL_TOKENS: "2048"
   CUDA_GRAPHS: "0"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
+  PREFILL_BATCH_BUCKET_SIZE: 1
+  BATCH_BUCKET_SIZE: 8
+  extraCmdArgs:
+    - "--max-batch-total-tokens"
+    - "65536"
+    - "--max-batch-prefill-tokens"
+    - "4096"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
diff --git a/helm-charts/faqgen/values.yaml b/helm-charts/faqgen/values.yaml
index 02a81b3c..7580a11e 100644
--- a/helm-charts/faqgen/values.yaml
+++ b/helm-charts/faqgen/values.yaml
@@ -59,7 +59,8 @@ affinity: {}
 # To override values in subchart llm-uservice
 llm-uservice:
   image:
-    repository: opea/llm-faqgen-tgi
+    repository: opea/llm-faqgen
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
 
 # To override values in subchart tgi
 tgi:
diff --git a/helm-charts/visualqna/gaudi-values.yaml b/helm-charts/visualqna/gaudi-values.yaml
index 5a0e95c3..0fab1331 100644
--- a/helm-charts/visualqna/gaudi-values.yaml
+++ b/helm-charts/visualqna/gaudi-values.yaml
@@ -16,6 +16,11 @@ tgi:
   MAX_INPUT_LENGTH: "4096"
   MAX_TOTAL_TOKENS: "8192"
   CUDA_GRAPHS: ""
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  ENABLE_HPU_GRAPH: "true"
+  LIMIT_HPU_GRAPH: "true"
+  USE_FLASH_ATTENTION: "true"
+  FLASH_ATTENTION_RECOMPUTE: "true"
   livenessProbe:
     initialDelaySeconds: 5
     periodSeconds: 5
diff --git a/helm-charts/visualqna/values.yaml b/helm-charts/visualqna/values.yaml
index aa6e377a..dc6b3401 100644
--- a/helm-charts/visualqna/values.yaml
+++ b/helm-charts/visualqna/values.yaml
@@ -67,6 +67,9 @@ tgi:
   MAX_TOTAL_TOKENS: "8192"
   LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
 
+lvm-uservice:
+  LVM_BACKEND: "TGI"
+
 nginx:
   service:
     type: NodePort
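For reference, the sketch below shows how the new optional TGI knobs exposed by this change (PREFILL_BATCH_BUCKET_SIZE, BATCH_BUCKET_SIZE, extraCmdArgs) and the new llm-uservice model override could be consumed from a user-supplied values file instead of the shipped gaudi-values.yaml. It is a minimal, illustrative sketch only: the file name my-values.yaml and the release name are hypothetical, and the values simply mirror the faqgen Gaudi defaults above; the file would be applied with helm install -f (or helm upgrade -f) against the chart, for example helm install faqgen ./helm-charts/faqgen -f my-values.yaml.

# my-values.yaml (hypothetical override file; release name and path in the helm command are illustrative)
tgi:
  # Optional Gaudi batching knobs; rendered into the TGI configmap only when set
  PREFILL_BATCH_BUCKET_SIZE: 1
  BATCH_BUCKET_SIZE: 8
  # Extra TGI server arguments passed through the common tgi subchart
  extraCmdArgs:
    - "--max-batch-total-tokens"
    - "65536"
    - "--max-batch-prefill-tokens"
    - "4096"

# The model id can now also be set on the llm-uservice subchart
llm-uservice:
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct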