Align vllm settings with docker compose version #554

Merged
1 commit merged on Nov 14, 2024

helm-charts/common/llm-uservice/Chart.yaml (4 additions, 0 deletions)

@@ -13,3 +13,7 @@ dependencies:
     version: 1.0.0
     repository: file://../tgi
     condition: tgi.enabled
+  - name: vllm
+    version: 1.0.0
+    repository: file://../vllm
+    condition: vllm.enabled
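
With the vllm sub-chart declared as an optional dependency, it can be toggled the same way as tgi. As an illustrative sketch (not part of this diff; the release name and the HF_TOKEN variable are placeholders):

    # Pull in the sub-charts and install llm-uservice with vLLM as the backend
    cd helm-charts/common/llm-uservice
    helm dependency update .
    helm install llm-uservice . \
      --set tgi.enabled=false \
      --set vllm.enabled=true \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"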

helm-charts/common/llm-uservice/templates/configmap.yaml (10 additions, 2 deletions)

@@ -13,15 +13,23 @@ data:
   {{- else }}
   TGI_LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
   {{- end }}
+  {{- if .Values.vLLM_ENDPOINT }}
+  vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}}
+  {{- else }}
+  vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
+  {{- end }}
+  {{- if .Values.LLM_MODEL_ID }}
+  LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote}}
+  {{- end }}
   HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}}
   HF_HOME: "/tmp/.cache/huggingface"
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}}
   {{- end }}
   http_proxy: {{ .Values.global.http_proxy | quote }}
   https_proxy: {{ .Values.global.https_proxy | quote }}
-  {{- if and (not .Values.TGI_LLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }}
-  no_proxy: "{{ .Release.Name }}-tgi,{{ .Values.global.no_proxy }}"
+  {{- if or .Values.global.http_proxy .Values.global.https_proxy }}
+  no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-vllm,{{ .Values.global.no_proxy }}"
   {{- else }}
   no_proxy: {{ .Values.global.no_proxy | quote }}
   {{- end }}
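
When neither TGI_LLM_ENDPOINT nor vLLM_ENDPOINT is overridden, the template falls back to the in-cluster services http://<release>-tgi and http://<release>-vllm, and both names are appended to no_proxy whenever a proxy is set. A quick way to sanity-check the rendered ConfigMap, as a sketch ("myrelease" is an arbitrary release name):

    # Render only the ConfigMap and inspect the endpoint and model defaults
    # (run `helm dependency update` on the chart first if sub-charts are not vendored)
    helm template myrelease helm-charts/common/llm-uservice \
      --show-only templates/configmap.yaml \
      --set LLM_MODEL_ID=Intel/neural-chat-7b-v3-3 | grep -E 'ENDPOINT|LLM_MODEL'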

helm-charts/common/llm-uservice/values.yaml (6 additions, 0 deletions)

@@ -7,9 +7,15 @@

 tgi:
   enabled: false
+vllm:
+  enabled: false

 replicaCount: 1
+# For tgi
 TGI_LLM_ENDPOINT: ""
+# For vllm, set the LLM_MODEL_ID the same as vllm sub chart
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: ""

 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
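
The new vLLM_ENDPOINT and LLM_MODEL_ID values are meant to line up with the docker compose setup (per the PR title): leave vLLM_ENDPOINT empty to use the bundled vllm sub-chart, or point it at an existing vLLM service. For example, as a sketch (the URL below is a placeholder, not a default from this PR):

    # Use an already-running vLLM OpenAI-compatible endpoint instead of the sub-chart
    helm install llm-uservice helm-charts/common/llm-uservice \
      --set vLLM_ENDPOINT=http://my-external-vllm:80 \
      --set LLM_MODEL_ID=Intel/neural-chat-7b-v3-3 \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"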

helm-charts/common/llm-uservice/vllm-values.yaml (new file, 17 additions)

@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Default values for llm-uservice.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+image:
+  repository: opea/llm-vllm
+  tag: "latest"
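
This preset enables the vllm backend end to end, so an install can simply reference it. A sketch of the intended usage (release name is arbitrary; HF_TOKEN is a placeholder):

    # Deploy llm-uservice against the bundled vllm sub-chart using the new preset
    helm dependency update helm-charts/common/llm-uservice
    helm install llm-uservice helm-charts/common/llm-uservice \
      -f helm-charts/common/llm-uservice/vllm-values.yaml \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"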

helm-charts/common/vllm/gaudi-values.yaml (5 additions, 5 deletions)

@@ -6,14 +6,14 @@
 # Declare variables to be passed into your templates.

 image:
-  repository: opea/llm-vllm-hpu
+  repository: opea/vllm-hpu
   tag: "latest"

-VLLM_CPU_KVCACHE_SPACE: "40"
-
+# VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+# Workaround for current HPU image with start command /bin/bash
+# extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
 resources:
   limits:
     habana.ai/gaudi: 1
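
Per the comment in the values file, the /bin/bash invocation appears to be needed because the current opea/vllm-hpu image starts with /bin/bash rather than launching the server, so the full python3 -m vllm.entrypoints.openai.api_server command is passed through extraCmdArgs. Using these Gaudi overrides might look like the following sketch (release name is arbitrary):

    # Install the vllm chart on a Gaudi node with the HPU-specific overrides
    helm install vllm helm-charts/common/vllm \
      -f helm-charts/common/vllm/gaudi-values.yaml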

helm-charts/common/vllm/templates/configmap.yaml (3 additions, 0 deletions)

@@ -23,3 +23,6 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
+  {{- end }}
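
This passthrough presumably exists so the OMPI_MCA_btl_vader_single_copy_mechanism setting in gaudi-values.yaml actually reaches the container environment via the ConfigMap. A render check, as a sketch (arbitrary release name):

    # Confirm the new key is emitted into the vllm ConfigMap
    helm template myvllm helm-charts/common/vllm \
      -f helm-charts/common/vllm/gaudi-values.yaml \
      --show-only templates/configmap.yaml | grep OMPI_MCA_btl_vader_single_copy_mechanism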

helm-charts/common/vllm/values.yaml (1 addition, 1 deletion)

@@ -50,7 +50,7 @@ resources: {}
 #   cpu: 100m
 #   memory: 128Mi

-extraCmdArgs: ["--enforce-eager","--dtype","auto"]
+extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]

 livenessProbe:
   httpGet: