From 764a8967b0d2fc182400056f51a8ef9fb7bf009d Mon Sep 17 00:00:00 2001
From: Dolpher Du
Date: Tue, 12 Nov 2024 15:50:19 +0000
Subject: [PATCH] Align vllm settings with docker compose version

Align settings with PR opea-project/GenAIExamples#1061.
Make llm-uservice support both tgi and vllm backends.

Signed-off-by: Dolpher Du
---
 helm-charts/common/llm-uservice/Chart.yaml    |  4 ++++
 .../llm-uservice/templates/configmap.yaml     | 12 ++++++++++--
 helm-charts/common/llm-uservice/values.yaml   |  6 ++++++
 .../common/llm-uservice/vllm-values.yaml      | 17 +++++++++++++++++
 helm-charts/common/vllm/gaudi-values.yaml     | 10 +++++-----
 .../common/vllm/templates/configmap.yaml      |  3 +++
 helm-charts/common/vllm/values.yaml           |  2 +-
 7 files changed, 46 insertions(+), 8 deletions(-)
 create mode 100644 helm-charts/common/llm-uservice/vllm-values.yaml

diff --git a/helm-charts/common/llm-uservice/Chart.yaml b/helm-charts/common/llm-uservice/Chart.yaml
index 8039c98de..8f45dbaa4 100644
--- a/helm-charts/common/llm-uservice/Chart.yaml
+++ b/helm-charts/common/llm-uservice/Chart.yaml
@@ -13,3 +13,7 @@ dependencies:
     version: 1.0.0
     repository: file://../tgi
     condition: tgi.enabled
+  - name: vllm
+    version: 1.0.0
+    repository: file://../vllm
+    condition: vllm.enabled
diff --git a/helm-charts/common/llm-uservice/templates/configmap.yaml b/helm-charts/common/llm-uservice/templates/configmap.yaml
index bb1c39434..bd49777dc 100644
--- a/helm-charts/common/llm-uservice/templates/configmap.yaml
+++ b/helm-charts/common/llm-uservice/templates/configmap.yaml
@@ -13,6 +13,14 @@ data:
   {{- else }}
   TGI_LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
   {{- end }}
+  {{- if .Values.vLLM_ENDPOINT }}
+  vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}}
+  {{- else }}
+  vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
+  {{- end }}
+  {{- if .Values.LLM_MODEL_ID }}
+  LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote}}
+  {{- end }}
   HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}}
   HF_HOME: "/tmp/.cache/huggingface"
   {{- if .Values.global.HF_ENDPOINT }}
@@ -20,8 +28,8 @@ data:
   {{- end }}
   http_proxy: {{ .Values.global.http_proxy | quote }}
   https_proxy: {{ .Values.global.https_proxy | quote }}
-  {{- if and (not .Values.TGI_LLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }}
-  no_proxy: "{{ .Release.Name }}-tgi,{{ .Values.global.no_proxy }}"
+  {{- if or .Values.global.http_proxy .Values.global.https_proxy }}
+  no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-vllm,{{ .Values.global.no_proxy }}"
   {{- else }}
   no_proxy: {{ .Values.global.no_proxy | quote }}
   {{- end }}
diff --git a/helm-charts/common/llm-uservice/values.yaml b/helm-charts/common/llm-uservice/values.yaml
index 8eae02042..2b9aa49b6 100644
--- a/helm-charts/common/llm-uservice/values.yaml
+++ b/helm-charts/common/llm-uservice/values.yaml
@@ -7,9 +7,15 @@
 tgi:
   enabled: false
+vllm:
+  enabled: false
 
 replicaCount: 1
 
+# For tgi
 TGI_LLM_ENDPOINT: ""
+# For vllm, set LLM_MODEL_ID to the same value as in the vllm sub-chart
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: ""
 
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
diff --git a/helm-charts/common/llm-uservice/vllm-values.yaml b/helm-charts/common/llm-uservice/vllm-values.yaml
new file mode 100644
index 000000000..059f32821
--- /dev/null
+++ b/helm-charts/common/llm-uservice/vllm-values.yaml
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Default values for llm-uservice.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+image:
+  repository: opea/llm-vllm
+  tag: "latest"
diff --git a/helm-charts/common/vllm/gaudi-values.yaml b/helm-charts/common/vllm/gaudi-values.yaml
index b1a346137..6e80ecab7 100644
--- a/helm-charts/common/vllm/gaudi-values.yaml
+++ b/helm-charts/common/vllm/gaudi-values.yaml
@@ -6,14 +6,14 @@
 # Declare variables to be passed into your templates.
 image:
-  repository: opea/llm-vllm-hpu
+  repository: opea/vllm-hpu
   tag: "latest"
 
-VLLM_CPU_KVCACHE_SPACE: "40"
-
+# VLLM_CPU_KVCACHE_SPACE: "40"
+OMPI_MCA_btl_vader_single_copy_mechanism: none
+extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
 # Workaround for current HPU image with start command /bin/bash
-# extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
 
 resources:
   limits:
     habana.ai/gaudi: 1
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml
index 80b9a97da..c38dbefa5 100644
--- a/helm-charts/common/vllm/templates/configmap.yaml
+++ b/helm-charts/common/vllm/templates/configmap.yaml
@@ -23,3 +23,6 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
+  {{- end }}
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index 3e98a21be..f7333722e 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -50,7 +50,7 @@ resources: {}
   #   cpu: 100m
   #   memory: 128Mi
 
-extraCmdArgs: ["--enforce-eager","--dtype","auto"]
+extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]
 
 livenessProbe:
   httpGet:
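
---
A possible way to exercise the new vllm backend from a local checkout; the chart paths and values file are the ones touched by this patch, while the release name and token value below are placeholders:

    cd helm-charts/common/llm-uservice
    helm dependency update .
    helm install llm-uservice . \
      -f vllm-values.yaml \
      --set global.HUGGINGFACEHUB_API_TOKEN=<your-hf-token>

With vLLM_ENDPOINT left empty, templates/configmap.yaml falls back to "http://<release-name>-vllm", i.e. the vllm sub-chart service, so an explicit endpoint is only needed when pointing at an external vllm server.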