From e9bb334652c70af56758578be6e4575b3abb803d Mon Sep 17 00:00:00 2001 From: Lianhao Lu Date: Thu, 16 Jan 2025 08:41:35 +0000 Subject: [PATCH] Refactor e2e chart: codegen codetrans faqgen visualqna Signed-off-by: Lianhao Lu --- helm-charts/codegen/gaudi-values.yaml | 5 +++++ helm-charts/codegen/values.yaml | 3 +++ helm-charts/codetrans/gaudi-values.yaml | 5 +++++ helm-charts/codetrans/values.yaml | 3 +++ helm-charts/common/tgi/templates/configmap.yaml | 6 ++++++ helm-charts/faqgen/gaudi-values.yaml | 16 ++++++++++++++-- helm-charts/faqgen/values.yaml | 3 ++- helm-charts/visualqna/gaudi-values.yaml | 5 +++++ helm-charts/visualqna/values.yaml | 3 +++ 9 files changed, 46 insertions(+), 3 deletions(-) diff --git a/helm-charts/codegen/gaudi-values.yaml b/helm-charts/codegen/gaudi-values.yaml index e26bb4a5e..ce4dad64f 100644 --- a/helm-charts/codegen/gaudi-values.yaml +++ b/helm-charts/codegen/gaudi-values.yaml @@ -13,6 +13,11 @@ tgi: MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/helm-charts/codegen/values.yaml b/helm-charts/codegen/values.yaml index de9e2a4b9..36bf19176 100644 --- a/helm-charts/codegen/values.yaml +++ b/helm-charts/codegen/values.yaml @@ -60,6 +60,9 @@ affinity: {} tgi: LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct +llm-uservice: + LLM_MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct + nginx: service: type: NodePort diff --git a/helm-charts/codetrans/gaudi-values.yaml b/helm-charts/codetrans/gaudi-values.yaml index e5367383a..2739ad087 100644 --- a/helm-charts/codetrans/gaudi-values.yaml +++ b/helm-charts/codetrans/gaudi-values.yaml @@ -12,6 +12,11 @@ tgi: MAX_INPUT_LENGTH: "1024" MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/helm-charts/codetrans/values.yaml b/helm-charts/codetrans/values.yaml index ba39428fa..77142681b 100644 --- a/helm-charts/codetrans/values.yaml +++ b/helm-charts/codetrans/values.yaml @@ -61,6 +61,9 @@ affinity: {} tgi: LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 +llm-uservice: + LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 + nginx: service: type: NodePort diff --git a/helm-charts/common/tgi/templates/configmap.yaml b/helm-charts/common/tgi/templates/configmap.yaml index 0b7385870..3e6f46ea9 100644 --- a/helm-charts/common/tgi/templates/configmap.yaml +++ b/helm-charts/common/tgi/templates/configmap.yaml @@ -57,3 +57,9 @@ data: {{- if .Values.FLASH_ATTENTION_RECOMPUTE }} FLASH_ATTENTION_RECOMPUTE: {{ .Values.FLASH_ATTENTION_RECOMPUTE | quote }} {{- end }} + {{- if .Values.PREFILL_BATCH_BUCKET_SIZE }} + PREFILL_BATCH_BUCKET_SIZE: {{ .Values.PREFILL_BATCH_BUCKET_SIZE | quote }} + {{- end }} + {{- if .Values.BATCH_BUCKET_SIZE }} + BATCH_BUCKET_SIZE: {{ .Values.BATCH_BUCKET_SIZE | quote }} + {{- end }} diff --git a/helm-charts/faqgen/gaudi-values.yaml b/helm-charts/faqgen/gaudi-values.yaml index d14729c4a..e681c5f32 100644 --- a/helm-charts/faqgen/gaudi-values.yaml +++ b/helm-charts/faqgen/gaudi-values.yaml @@ -9,9 +9,21 @@ tgi: resources: limits: habana.ai/gaudi: 1 - MAX_INPUT_LENGTH: "4096" - MAX_TOTAL_TOKENS: "8192" + MAX_INPUT_LENGTH: "1024" + MAX_TOTAL_TOKENS: "2048" CUDA_GRAPHS: "0" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" + PREFILL_BATCH_BUCKET_SIZE: 1 + BATCH_BUCKET_SIZE: 8 + extraCmdArgs: + - "--max-batch-total-tokens" + - "65536" + - "--max-batch-prefill-tokens" + - "4096" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/helm-charts/faqgen/values.yaml b/helm-charts/faqgen/values.yaml index 02a81b3c3..7580a11e2 100644 --- a/helm-charts/faqgen/values.yaml +++ b/helm-charts/faqgen/values.yaml @@ -59,7 +59,8 @@ affinity: {} # To override values in subchart llm-uservice llm-uservice: image: - repository: opea/llm-faqgen-tgi + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct # To override values in subchart tgi tgi: diff --git a/helm-charts/visualqna/gaudi-values.yaml b/helm-charts/visualqna/gaudi-values.yaml index 5a0e95c3a..0fab1331c 100644 --- a/helm-charts/visualqna/gaudi-values.yaml +++ b/helm-charts/visualqna/gaudi-values.yaml @@ -16,6 +16,11 @@ tgi: MAX_INPUT_LENGTH: "4096" MAX_TOTAL_TOKENS: "8192" CUDA_GRAPHS: "" + OMPI_MCA_btl_vader_single_copy_mechanism: "none" + ENABLE_HPU_GRAPH: "true" + LIMIT_HPU_GRAPH: "true" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "true" livenessProbe: initialDelaySeconds: 5 periodSeconds: 5 diff --git a/helm-charts/visualqna/values.yaml b/helm-charts/visualqna/values.yaml index aa6e377a8..dc6b34014 100644 --- a/helm-charts/visualqna/values.yaml +++ b/helm-charts/visualqna/values.yaml @@ -67,6 +67,9 @@ tgi: MAX_TOTAL_TOKENS: "8192" LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf +lvm-uservice: + LVM_BACKEND: "TGI" + nginx: service: type: NodePort