Align vllm settings with docker compose version #554

Merged
1 commit merged on Nov 14, 2024

helm-charts/common/llm-uservice/Chart.yaml (4 additions, 0 deletions)

@@ -13,3 +13,7 @@ dependencies:
     version: 1.0.0
     repository: file://../tgi
     condition: tgi.enabled
+  - name: vllm
+    version: 1.0.0
+    repository: file://../vllm
+    condition: vllm.enabled
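
With the vllm sub-chart declared as an optional dependency, it can be toggled the same way as tgi. As an illustrative sketch (not part of this diff; the release name and the HF_TOKEN variable are placeholders):

    # Pull in the sub-charts and install llm-uservice with vLLM as the backend
    cd helm-charts/common/llm-uservice
    helm dependency update .
    helm install llm-uservice . \
      --set tgi.enabled=false \
      --set vllm.enabled=true \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"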

helm-charts/common/llm-uservice/templates/configmap.yaml (10 additions, 2 deletions)

@@ -13,15 +13,23 @@ data:
   {{- else }}
   TGI_LLM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
   {{- end }}
+  {{- if .Values.vLLM_ENDPOINT }}
+  vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}}
+  {{- else }}
+  vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
+  {{- end }}
+  {{- if .Values.LLM_MODEL_ID }}
+  LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote}}
+  {{- end }}
   HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}}
   HF_HOME: "/tmp/.cache/huggingface"
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}}
   {{- end }}
   http_proxy: {{ .Values.global.http_proxy | quote }}
   https_proxy: {{ .Values.global.https_proxy | quote }}
-  {{- if and (not .Values.TGI_LLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }}
-  no_proxy: "{{ .Release.Name }}-tgi,{{ .Values.global.no_proxy }}"
+  {{- if or .Values.global.http_proxy .Values.global.https_proxy }}
+  no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-vllm,{{ .Values.global.no_proxy }}"
   {{- else }}
   no_proxy: {{ .Values.global.no_proxy | quote }}
   {{- end }}
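
When neither TGI_LLM_ENDPOINT nor vLLM_ENDPOINT is overridden, the template falls back to the in-cluster services http://<release>-tgi and http://<release>-vllm, and both names are appended to no_proxy whenever a proxy is set. A quick way to sanity-check the rendered ConfigMap, as a sketch ("myrelease" is an arbitrary release name):

    # Render only the ConfigMap and inspect the endpoint and model defaults
    # (run `helm dependency update` on the chart first if sub-charts are not vendored)
    helm template myrelease helm-charts/common/llm-uservice \
      --show-only templates/configmap.yaml \
      --set LLM_MODEL_ID=Intel/neural-chat-7b-v3-3 | grep -E 'ENDPOINT|LLM_MODEL'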

helm-charts/common/llm-uservice/values.yaml (6 additions, 0 deletions)

@@ -7,9 +7,15 @@

 tgi:
   enabled: false
+vllm:
+  enabled: false

 replicaCount: 1
+# For tgi
 TGI_LLM_ENDPOINT: ""
+# For vllm, set the LLM_MODEL_ID the same as vllm sub chart
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: ""

 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
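
The new vLLM_ENDPOINT and LLM_MODEL_ID values are meant to line up with the docker compose setup (per the PR title): leave vLLM_ENDPOINT empty to use the bundled vllm sub-chart, or point it at an existing vLLM service. For example, as a sketch (the URL below is a placeholder, not a default from this PR):

    # Use an already-running vLLM OpenAI-compatible endpoint instead of the sub-chart
    helm install llm-uservice helm-charts/common/llm-uservice \
      --set vLLM_ENDPOINT=http://my-external-vllm:80 \
      --set LLM_MODEL_ID=Intel/neural-chat-7b-v3-3 \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"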

helm-charts/common/llm-uservice/vllm-values.yaml (new file, 17 additions)

@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Default values for llm-uservice.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+tgi:
+  enabled: false
+vllm:
+  enabled: true
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+vLLM_ENDPOINT: ""
+LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+image:
+  repository: opea/llm-vllm
+  tag: "latest"
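
This preset enables the vllm backend end to end, so an install can simply reference it. A sketch of the intended usage (release name is arbitrary; HF_TOKEN is a placeholder):

    # Deploy llm-uservice against the bundled vllm sub-chart using the new preset
    helm dependency update helm-charts/common/llm-uservice
    helm install llm-uservice helm-charts/common/llm-uservice \
      -f helm-charts/common/llm-uservice/vllm-values.yaml \
      --set global.HUGGINGFACEHUB_API_TOKEN="${HF_TOKEN}"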

helm-charts/common/vllm/gaudi-values.yaml (5 additions, 5 deletions)

@@ -6,14 +6,14 @@
 # Declare variables to be passed into your templates.

 image:
-  repository: opea/llm-vllm-hpu
+  repository: opea/vllm-hpu
   tag: "latest"

-VLLM_CPU_KVCACHE_SPACE: "40"
-
+# VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+# Workaround for current HPU image with start command /bin/bash
+# extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
 resources:
   limits:
     habana.ai/gaudi: 1
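
Per the comment in the values file, the /bin/bash invocation appears to be needed because the current opea/vllm-hpu image starts with /bin/bash rather than launching the server, so the full python3 -m vllm.entrypoints.openai.api_server command is passed through extraCmdArgs. Using these Gaudi overrides might look like the following sketch (release name is arbitrary):

    # Install the vllm chart on a Gaudi node with the HPU-specific overrides
    helm install vllm helm-charts/common/vllm \
      -f helm-charts/common/vllm/gaudi-values.yaml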

helm-charts/common/vllm/templates/configmap.yaml (3 additions, 0 deletions)

@@ -23,3 +23,6 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
+  OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
+  {{- end }}
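
This passthrough presumably exists so the OMPI_MCA_btl_vader_single_copy_mechanism setting in gaudi-values.yaml actually reaches the container environment via the ConfigMap. A render check, as a sketch (arbitrary release name):

    # Confirm the new key is emitted into the vllm ConfigMap
    helm template myvllm helm-charts/common/vllm \
      -f helm-charts/common/vllm/gaudi-values.yaml \
      --show-only templates/configmap.yaml | grep OMPI_MCA_btl_vader_single_copy_mechanism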

helm-charts/common/vllm/values.yaml (1 addition, 1 deletion)

@@ -50,7 +50,7 @@ resources: {}
 #   cpu: 100m
 #   memory: 128Mi

-extraCmdArgs: ["--enforce-eager","--dtype","auto"]
+extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]

 livenessProbe:
   httpGet: