Skip to content

Commit

Permalink
Add HPA support to ChatQnA
Browse files Browse the repository at this point in the history
Signed-off-by: Alexey Fomenko <alexey.fomenko@intel.com>
  • Loading branch information
byako committed Aug 20, 2024
1 parent b1182c4 commit f32d810
Show file tree
Hide file tree
Showing 16 changed files with 267 additions and 0 deletions.
53 changes: 53 additions & 0 deletions helm-charts/chatqna/templates/customMetrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# ConfigMap "adapter-config" in namespace "monitoring"; rendered only when
# horizontalPodAutoscaler.enabled is set.
# Its config.yaml carries three rules that divide the rate of an
# inference-duration "_sum" series by the rate of the matching "_count" series
# (1 min window) and expose the result as tgi_request_latency,
# reranking_request_latency and embedding_request_latency, each associated
# with the namespace and service resources.
# NOTE(review): presumably this is the Prometheus adapter configuration that
# backs custom.metrics.k8s.io -- confirm name/namespace match the deployed adapter.
apiVersion: v1
data:
  config.yaml: |
    rules:
    - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}'
      # Average request latency from TGI histograms, over 1 min
      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "tgi_request_latency"
      resources:
        # HPA needs both namespace + suitable object resource for its query paths:
        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "reranking_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
    - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}'
      # Average request latency from TEI histograms, over 1 min
      metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^te_request_inference_duration_sum
        as: "embedding_request_latency"
      resources:
        overrides:
          namespace:
            resource: namespace
          service:
            resource: service
kind: ConfigMap
metadata:
  name: adapter-config
  namespace: monitoring
{{- end }}
4 changes: 4 additions & 0 deletions helm-charts/chatqna/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@ metadata:
labels:
{{- include "chatqna.labels" . | nindent 4 }}
spec:

# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "chatqna.selectorLabels" . | nindent 6 }}
Expand Down
50 changes: 50 additions & 0 deletions helm-charts/chatqna/templates/horizontalPorAutoscaler.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the ChatQnA Deployment; rendered only when
# horizontalPodAutoscaler.enabled is set. Scales on the tgi_request_latency
# custom metric reported for a Service object.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "chatqna.fullname" . }}
  # Same standard labels as the chart's Deployment and Service.
  labels:
    {{- include "chatqna.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "chatqna.fullname" . }}
  minReplicas: 1
  # Upper bound; override with horizontalPodAutoscaler.maxReplicas
  # (falls back to 6 when that value is unset).
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas | default 6 }}
  metrics:
    - type: Object
      object:
        metric:
          # TGI time metrics are in seconds
          name: tgi_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          # NOTE(review): hard-coded Service name, while this chart's own
          # Service is named via "chatqna.fullname" -- confirm a Service
          # called tgi-svc exists in the release namespace.
          name: tgi-svc
        target:
          # tgi_request_latency is already average for all the TGI pods,
          # so this uses Value instead of averageValue.
          # On ICL Xeon, max TGI queue wait time target = 4s.
          type: Value
          value: 4
  behavior:
    scaleDown:
      # Wait 3 min before acting on a lower recommendation, then remove
      # at most 25% of the replicas per 15 s period.
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # React immediately; of the two policies below the one allowing the
      # larger change is applied (selectPolicy: Max).
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
1 change: 1 addition & 0 deletions helm-charts/chatqna/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kind: Service
metadata:
name: {{ include "chatqna.fullname" . }}
labels:
svc: tgi-svc
{{- include "chatqna.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
Expand Down
15 changes: 15 additions & 0 deletions helm-charts/chatqna/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# ServiceMonitor (Prometheus Operator CRD) that selects Services labelled
# "svc: tgi-svc" and scrapes their port named "service" over HTTP every 4 s.
# Gated on horizontalPodAutoscaler.enabled like the other autoscaling
# resources, so the chart still installs on clusters where the
# monitoring.coreos.com CRDs are not present.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: tgi-svc
spec:
  selector:
    matchLabels:
      svc: tgi-svc
  endpoints:
    - interval: 4s
      port: service
      scheme: http
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/chatqna/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# Declare variables to be passed into your templates.

replicaCount: 1
# Autoscaling switch. The templates read horizontalPodAutoscaler.enabled:
# when true, the HorizontalPodAutoscaler and the custom-metrics ConfigMap are
# rendered and the Deployment omits its explicit "replicas" (replicaCount).
horizontalPodAutoscaler:
  enabled: false

image:
repository: opea/chatqna
Expand Down
3 changes: 3 additions & 0 deletions helm-charts/common/embedding-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "embedding-usvc.selectorLabels" . | nindent 6 }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the embedding-usvc Deployment; rendered only
# when horizontalPodAutoscaler.enabled is set. Scales on the
# embedding_request_latency custom metric reported for a Service object.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "embedding-usvc.fullname" . }}
  # Same standard labels as the chart's Deployment and Service.
  labels:
    {{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "embedding-usvc.fullname" . }}
  minReplicas: 1
  # Upper bound; override with horizontalPodAutoscaler.maxReplicas
  # (falls back to 2 when that value is unset).
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas | default 2 }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-embedding time metrics are in seconds
          name: embedding_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          # NOTE(review): hard-coded Service name, while this chart's own
          # Service is named via "embedding-usvc.fullname" -- confirm a
          # Service called tei-embedding-svc exists in the release namespace.
          name: tei-embedding-svc
        target:
          # embedding_request_latency is already average for all the TEI pods,
          # so this uses Value instead of averageValue.
          # On ICL Xeon, max tei-embedding wait time target = 4s.
          type: Value
          value: 4
  behavior:
    scaleDown:
      # Wait 3 min before acting on a lower recommendation, then remove
      # at most 25% of the replicas per 15 s period.
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # React immediately; of the two policies below the one allowing the
      # larger change is applied (selectPolicy: Max).
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
1 change: 1 addition & 0 deletions helm-charts/common/embedding-usvc/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kind: Service
metadata:
name: {{ include "embedding-usvc.fullname" . }}
labels:
svc: embedding-svc
{{- include "embedding-usvc.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
Expand Down
15 changes: 15 additions & 0 deletions helm-charts/common/embedding-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# ServiceMonitor (Prometheus Operator CRD) that selects Services labelled
# "svc: embedding-svc" and scrapes their port named "service" over HTTP
# every 4 s.
# Gated on horizontalPodAutoscaler.enabled like the HorizontalPodAutoscaler,
# so the chart still installs on clusters where the monitoring.coreos.com
# CRDs are not present.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: embedding-svc
spec:
  selector:
    matchLabels:
      svc: embedding-svc
  endpoints:
    - interval: 4s
      port: service
      scheme: http
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/common/embedding-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ autodependency:
enabled: false

replicaCount: 1
# Autoscaling switch: when true, the HorizontalPodAutoscaler template is
# rendered and the Deployment omits its explicit "replicas" (replicaCount).
horizontalPodAutoscaler:
  enabled: false

TEI_EMBEDDING_ENDPOINT: ""
image:
Expand Down
3 changes: 3 additions & 0 deletions helm-charts/common/reranking-usvc/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ metadata:
labels:
{{- include "reranking-usvc.labels" . | nindent 4 }}
spec:
# use explicit replica counts only if HorizontalPodAutoscaler is disabled
{{- if not .Values.horizontalPodAutoscaler.enabled }}
replicas: {{ .Values.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "reranking-usvc.selectorLabels" . | nindent 6 }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# HorizontalPodAutoscaler for the reranking-usvc Deployment; rendered only
# when horizontalPodAutoscaler.enabled is set. Scales on the
# reranking_request_latency custom metric reported for a Service object.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "reranking-usvc.fullname" . }}
  # Same standard labels as the chart's Deployment and Service.
  labels:
    {{- include "reranking-usvc.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "reranking-usvc.fullname" . }}
  minReplicas: 1
  # Upper bound; override with horizontalPodAutoscaler.maxReplicas
  # (falls back to 3 when that value is unset).
  maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas | default 3 }}
  metrics:
    - type: Object
      object:
        metric:
          # tei-reranking time metrics are in seconds
          name: reranking_request_latency
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          # NOTE(review): hard-coded Service name, while this chart's own
          # Service is named via "reranking-usvc.fullname" -- confirm a
          # Service called tei-reranking-svc exists in the release namespace.
          name: tei-reranking-svc
        target:
          # reranking_request_latency is already average for all the TEI pods,
          # so this uses Value instead of averageValue.
          # On ICL Xeon, max tei-reranking wait time target = 4s.
          type: Value
          value: 4
  behavior:
    scaleDown:
      # Wait 3 min before acting on a lower recommendation, then remove
      # at most 25% of the replicas per 15 s period.
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 15
    scaleUp:
      # React immediately; of the two policies below the one allowing the
      # larger change is applied (selectPolicy: Max).
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 50
          periodSeconds: 15
        - type: Pods
          value: 2
          periodSeconds: 15
{{- end }}
1 change: 1 addition & 0 deletions helm-charts/common/reranking-usvc/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kind: Service
metadata:
name: {{ include "reranking-usvc.fullname" . }}
labels:
svc: reranking-svc
{{- include "reranking-usvc.labels" . | nindent 4 }}
spec:
type: {{ .Values.service.type }}
Expand Down
15 changes: 15 additions & 0 deletions helm-charts/common/reranking-usvc/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.horizontalPodAutoscaler.enabled }}
# ServiceMonitor (Prometheus Operator CRD) that selects Services labelled
# "svc: reranking-svc" and scrapes their port named "service" over HTTP
# every 4 s.
# Gated on horizontalPodAutoscaler.enabled like the HorizontalPodAutoscaler,
# so the chart still installs on clusters where the monitoring.coreos.com
# CRDs are not present.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: reranking-svc
spec:
  selector:
    matchLabels:
      svc: reranking-svc
  endpoints:
    - interval: 4s
      port: service
      scheme: http
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/common/reranking-usvc/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ autodependency:
enabled: false

replicaCount: 1
# Autoscaling switch: when true, the HorizontalPodAutoscaler template is
# rendered and the Deployment omits its explicit "replicas" (replicaCount).
horizontalPodAutoscaler:
  enabled: false

TEI_RERANKING_ENDPOINT: ""
image:
Expand Down

0 comments on commit f32d810

Please sign in to comment.