From 6f7512c676fa12ec79dd6183a220307212ef29bf Mon Sep 17 00:00:00 2001 From: Alexey Fomenko Date: Tue, 20 Aug 2024 19:44:17 +0300 Subject: [PATCH] Add HPA support to embedding, reranking, tgi services Signed-off-by: Alexey Fomenko --- .../chatqna/templates/customMetrics.yaml | 53 ++++++++++++++ helm-charts/chatqna/values.yaml | 8 +++ helm-charts/common/embedding-usvc/README.md | 68 ++++++++++++++++-- .../embedding-usvc/templates/deployment.yaml | 7 ++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++ .../templates/servicemonitor.yaml | 17 +++++ helm-charts/common/embedding-usvc/values.yaml | 8 +++ helm-charts/common/teirerank/README.md | 70 +++++++++++++++++-- .../teirerank/templates/deployment.yaml | 7 ++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++ .../teirerank/templates/servicemonitor.yaml | 17 +++++ helm-charts/common/teirerank/values.yaml | 9 +++ helm-charts/common/tgi/README.md | 58 +++++++++++++++ .../common/tgi/templates/deployment.yaml | 7 ++ .../templates/horizontalPodAutoscaler.yaml | 51 ++++++++++++++ .../common/tgi/templates/servicemonitor.yaml | 22 ++++++ helm-charts/common/tgi/values.yaml | 8 +++ 17 files changed, 501 insertions(+), 11 deletions(-) create mode 100644 helm-charts/chatqna/templates/customMetrics.yaml create mode 100644 helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/embedding-usvc/templates/servicemonitor.yaml create mode 100644 helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/teirerank/templates/servicemonitor.yaml create mode 100644 helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml create mode 100644 helm-charts/common/tgi/templates/servicemonitor.yaml diff --git a/helm-charts/chatqna/templates/customMetrics.yaml b/helm-charts/chatqna/templates/customMetrics.yaml new file mode 100644 index 000000000..e4dacbdf1 --- /dev/null +++ 
b/helm-charts/chatqna/templates/customMetrics.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: v1 +data: + config.yaml: | + rules: + - seriesQuery: '{__name__="tgi_request_inference_duration_sum"}' + # Average request latency from TGI histograms, over 1 min + # (0.001 divider add is to make sure there's always a valid value) + metricsQuery: 'rate(tgi_request_inference_duration_sum{<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{<<.LabelMatchers>>}[1m]))' + name: + matches: ^tgi_request_inference_duration_sum + as: "tgi_request_latency" + resources: + # HPA needs both namespace + suitable object resource for its query paths: + # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency + # (pod is not suitable object type for matching as each instance has different name) + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-reranking-svc"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-reranking-svc",<<.LabelMatchers>>}[1m]))' + name: + matches: ^te_request_inference_duration_sum + as: "reranking_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service + - seriesQuery: '{__name__="te_request_inference_duration_sum",service="tei-embedding-svc"}' + # Average request latency from TEI histograms, over 1 min + metricsQuery: 'rate(te_request_inference_duration_sum{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="tei-embedding-svc",<<.LabelMatchers>>}[1m]))' + name: + matches: 
^te_request_inference_duration_sum + as: "embedding_request_latency" + resources: + overrides: + namespace: + resource: namespace + service: + resource: service +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring +{{- end }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index f848b209e..b062d6c03 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -7,6 +7,14 @@ replicaCount: 1 +# Enabling HPA will: +# - Overwrite existing PrometheusAdapter "adapter-config" configMap with ChatQnA specific custom metric queries +# for embedding, reranking, tgi services +# Upstream default configMap: +# - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml +horizontalPodAutoscaler: + enabled: false + image: repository: opea/chatqna pullPolicy: IfNotPresent diff --git a/helm-charts/common/embedding-usvc/README.md b/helm-charts/common/embedding-usvc/README.md index 88926d37e..1919e35ce 100644 --- a/helm-charts/common/embedding-usvc/README.md +++ b/helm-charts/common/embedding-usvc/README.md @@ -27,6 +27,34 @@ helm dependency update helm install embedding-usvc . --set autodependency.enabled=true ``` +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be installed before enabling HPA, e.g. 
by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +### Gotchas + +Why HPA is opt-in: +* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. + Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +* Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are running. @@ -42,10 +70,40 @@ curl http://localhost:6000/v1/embeddings \ -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. 
last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*embedding +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | tr -d '"' | grep namespaces | sed "s%/%/${ns}/metrics/%"); do + kubectl get --raw $url/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! + ## Values -| Key | Type | Default | Description | -| ---------------------- | ------ | ---------------------- | ----------- | -| image.repository | string | `"opea/embedding-tei"` | | -| service.port | string | `"6000"` | | -| TEI_EMBEDDING_ENDPOINT | string | `""` | | +| Key | Type | Default | Description | +| ------------------------------- | ------ | ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/embedding-tei"` | | +| service.port | string | `"6000"` | | +| TEI_EMBEDDING_ENDPOINT | string | `""` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! 
| diff --git a/helm-charts/common/embedding-usvc/templates/deployment.yaml b/helm-charts/common/embedding-usvc/templates/deployment.yaml index 26f5a76fa..acde12477 100644 --- a/helm-charts/common/embedding-usvc/templates/deployment.yaml +++ b/helm-charts/common/embedding-usvc/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "embedding-usvc.labels" . | nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "embedding-usvc.selectorLabels" . | nindent 6 }} @@ -77,3 +80,7 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} \ No newline at end of file diff --git a/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..62089e190 --- /dev/null +++ b/helm-charts/common/embedding-usvc/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "embedding-usvc.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "embedding-usvc.fullname" . 
}} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-embedding time metrics are in seconds + name: embedding_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tei-embedding-svc + target: + # embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml new file mode 100644 index 000000000..efadcd313 --- /dev/null +++ b/helm-charts/common/embedding-usvc/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "embedding-usvc.fullname" . }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: {{ include "embedding-usvc.fullname" . 
}} + endpoints: + - interval: 4s + port: embedding-usvc + scheme: http +{{- end }} \ No newline at end of file diff --git a/helm-charts/common/embedding-usvc/values.yaml b/helm-charts/common/embedding-usvc/values.yaml index f3b1f9e89..8f766d0c9 100644 --- a/helm-charts/common/embedding-usvc/values.yaml +++ b/helm-charts/common/embedding-usvc/values.yaml @@ -10,6 +10,14 @@ autodependency: replicaCount: 1 +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 2 + TEI_EMBEDDING_ENDPOINT: "" image: repository: opea/embedding-tei diff --git a/helm-charts/common/teirerank/README.md b/helm-charts/common/teirerank/README.md index b3cb2f193..6f42b2e5d 100644 --- a/helm-charts/common/teirerank/README.md +++ b/helm-charts/common/teirerank/README.md @@ -21,6 +21,34 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/BAAI/bge-reranker-base" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +### Gotchas + +Why HPA is opt-in: +* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+ Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +* Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -36,11 +64,41 @@ curl http://localhost:2082/rerank \ -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. 
last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*rerank +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | tr -d '"' | grep namespaces | sed "s%/%/${ns}/metrics/%"); do + kubectl get --raw $url/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! + ## Values -| Key | Type | Default | Description | -| ----------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. 
| -| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | -| image.tag | string | `"cpu-1.5"` | | +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| RERANK_MODEL_ID | string | `"BAAI/bge-reranker-base"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, teirerank will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | +| image.repository | string | `"ghcr.io/huggingface/text-embeddings-inference"` | | +| image.tag | string | `"cpu-1.5"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | \ No newline at end of file diff --git a/helm-charts/common/teirerank/templates/deployment.yaml b/helm-charts/common/teirerank/templates/deployment.yaml index 4a85b7fc6..2889fe9b0 100644 --- a/helm-charts/common/teirerank/templates/deployment.yaml +++ b/helm-charts/common/teirerank/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "teirerank.labels" . | nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "teirerank.selectorLabels" . | nindent 6 }} @@ -102,3 +105,7 @@ spec: tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 60 + {{- end }} \ No newline at end of file diff --git a/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..d89215a46 --- /dev/null +++ b/helm-charts/common/teirerank/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "teirerank.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "teirerank.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # tei-reranking time metrics are in seconds + name: reranking_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tei-reranking-svc + target: + # reranking_request_latency is average for all TEI pods. 
To avoid replica fluctuations when + # TEI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/teirerank/templates/servicemonitor.yaml b/helm-charts/common/teirerank/templates/servicemonitor.yaml new file mode 100644 index 000000000..10713e818 --- /dev/null +++ b/helm-charts/common/teirerank/templates/servicemonitor.yaml @@ -0,0 +1,17 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "teirerank.fullname" . }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: {{ include "teirerank.fullname" . 
}} + endpoints: + - interval: 4s + port: teirerank + scheme: http +{{- end }} \ No newline at end of file diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml index 80a4cf73a..3b6c14f40 100644 --- a/helm-charts/common/teirerank/values.yaml +++ b/helm-charts/common/teirerank/values.yaml @@ -7,6 +7,15 @@ replicaCount: 1 + +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 3 + port: 2082 shmSize: 1Gi RERANK_MODEL_ID: "BAAI/bge-reranker-base" diff --git a/helm-charts/common/tgi/README.md b/helm-charts/common/tgi/README.md index 62e4d70c7..dad8c4b8e 100644 --- a/helm-charts/common/tgi/README.md +++ b/helm-charts/common/tgi/README.md @@ -24,6 +24,34 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/models--bigscience--bloom-560m" +## HorizontalPodAutoscaler (HPA) support + +`horizontalPodAutoscaler` option enables HPA scaling for the deployment: +https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/ + +Autoscaling is based on custom application metrics provided through [Prometheus](https://prometheus.io/). + +### Pre-conditions + +If cluster does not run [Prometheus operator](https://github.com/prometheus-operator/kube-prometheus) +yet, it SHOULD be installed before enabling HPA, e.g. by using: +https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack + +### Gotchas + +Why HPA is opt-in: +* Enabling chart `horizontalPodAutoscaler` option will _overwrite_ cluster's current + `PrometheusAdapter` configuration with its own custom metrics configuration. 
+ Take copy of the existing one before install, if that matters: + `kubectl -n monitoring get cm/adapter-config -o yaml > adapter-config.yaml` +* `PrometheusAdapter` needs to be restarted after install, for it to read the new configuration: + `ns=monitoring; kubectl -n $ns delete $(kubectl -n $ns get pod --selector app.kubernetes.io/name=prometheus-adapter -o name)` +* By default Prometheus adds [k8s RBAC rules](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/prometheus-roleBindingSpecificNamespaces.yaml) + for accessing metrics from `default`, `kube-system` and `monitoring` namespaces. If Helm is + asked to install OPEA services to some other namespace, those rules need to be updated accordingly +* Provided HPA rules are examples for Xeon, for efficient scaling they need to be fine-tuned for given setup + (underlying HW, used models, OPEA version etc) + ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are runinng. @@ -39,6 +67,35 @@ curl http://localhost:2080/generate \ -H 'Content-Type: application/json' ``` +### Verify HPA metrics + +To verify that metrics required by horizontalPodAutoscaler option work, check that: + +Prometheus has found the metric endpoints, i.e. 
last number on the line is non-zero: + +```console +prom_url=http://$(kubectl -n monitoring get -o jsonpath="{.spec.clusterIP}:{.spec.ports[0].port}" svc/prometheus-k8s) +curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*tgi +``` + +Prometheus adapter provides custom metrics for their data: + +```console +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 | jq .resources[].name +``` + +And those custom metrics have valid values for HPA rules: + +```console +ns=default; # OPEA namespace +url=/apis/custom.metrics.k8s.io/v1beta1; +for m in $(kubectl get --raw $url | jq .resources[].name | tr -d '"' | grep namespaces | sed "s%/%/${ns}/metrics/%"); do + kubectl get --raw $url/$m | jq; +done | grep -e metricName -e value +``` + +NOTE: HuggingFace TGI and TEI services provide metrics endpoint only after they've processed their first request! + ## Values | Key | Type | Default | Description | @@ -48,3 +105,4 @@ curl http://localhost:2080/generate \ | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | | image.repository | string | `"ghcr.io/huggingface/text-generation-inference"` | | | image.tag | string | `"1.4"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployments based on metrics it provides. See #pre-conditions and #gotchas before enabling! | \ No newline at end of file diff --git a/helm-charts/common/tgi/templates/deployment.yaml b/helm-charts/common/tgi/templates/deployment.yaml index 2ef224b59..742ec1a1c 100644 --- a/helm-charts/common/tgi/templates/deployment.yaml +++ b/helm-charts/common/tgi/templates/deployment.yaml @@ -8,7 +8,10 @@ metadata: labels: {{- include "tgi.labels" . 
| nindent 4 }} spec: + # use explicit replica counts only if HorizontalPodAutoscaler is disabled + {{- if not .Values.horizontalPodAutoscaler.enabled }} replicas: {{ .Values.replicaCount }} + {{- end }} selector: matchLabels: {{- include "tgi.selectorLabels" . | nindent 6 }} @@ -94,3 +97,7 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + {{- if .Values.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 120 + {{- end }} \ No newline at end of file diff --git a/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml new file mode 100644 index 000000000..bae813e16 --- /dev/null +++ b/helm-charts/common/tgi/templates/horizontalPodAutoscaler.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "tgi.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "tgi.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # TGI time metrics are in seconds + name: tgi_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: tgi-svc + target: + # tgi_request_latency is average for all the TGI pods. 
To avoid replica fluctuations when + # TGI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/tgi/templates/servicemonitor.yaml b/helm-charts/common/tgi/templates/servicemonitor.yaml new file mode 100644 index 000000000..a8a373e9d --- /dev/null +++ b/helm-charts/common/tgi/templates/servicemonitor.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "tgi.fullname" . }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: {{ include "tgi.fullname" . 
}} + endpoints: + - interval: 4s + port: tgi + scheme: http +{{- end }} \ No newline at end of file diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml index dff877f5b..1f2c87a71 100644 --- a/helm-charts/common/tgi/values.yaml +++ b/helm-charts/common/tgi/values.yaml @@ -7,6 +7,14 @@ replicaCount: 1 +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + enabled: false + maxReplicas: 6 + port: 2080 image: