From 955b137be9ad68e0d01e284018dac33dbedc8c60 Mon Sep 17 00:00:00 2001 From: func25 Date: Wed, 8 Jan 2025 13:01:02 +0700 Subject: [PATCH 1/8] vmcluster: enhance vmstorage readiness probe --- charts/victoria-metrics-cluster/CHANGELOG.md | 2 +- charts/victoria-metrics-cluster/values.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index dac1c3414..c951f25e3 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,6 +1,6 @@ ## Next release -- TODO +- vmstorage: reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). ## 0.16.2 diff --git a/charts/victoria-metrics-cluster/values.yaml b/charts/victoria-metrics-cluster/values.yaml index c71f1e12b..0e0013326 100644 --- a/charts/victoria-metrics-cluster/values.yaml +++ b/charts/victoria-metrics-cluster/values.yaml @@ -1027,9 +1027,9 @@ vmstorage: readiness: httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 - failureThreshold: 3 + failureThreshold: 10 # -- VMStorage liveness probe liveness: tcpSocket: {} From 587f0cf5cb577f9e2bd6664f5545263e03d37d18 Mon Sep 17 00:00:00 2001 From: func25 Date: Wed, 8 Jan 2025 13:26:48 +0700 Subject: [PATCH 2/8] vmcluster: update all components' readiness probe --- charts/victoria-metrics-cluster/CHANGELOG.md | 2 +- charts/victoria-metrics-cluster/README.md | 16 ++++++++-------- charts/victoria-metrics-cluster/values.yaml | 16 ++++++++-------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index c951f25e3..6d4b53bf6 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,6 +1,6 @@ ## Next release -- vmstorage: reduce the default readiness probe interval to 5s (was 15s) 
and the failure threshold to 10 (was 3). +- all: reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). ## 0.16.2 diff --git a/charts/victoria-metrics-cluster/README.md b/charts/victoria-metrics-cluster/README.md index cabfcdafb..6750e6d25 100644 --- a/charts/victoria-metrics-cluster/README.md +++ b/charts/victoria-metrics-cluster/README.md @@ -1591,10 +1591,10 @@ labels: {} tcpSocket: {} timeoutSeconds: 5 readiness: - failureThreshold: 3 + failureThreshold: 10 httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 startup: {} @@ -2531,10 +2531,10 @@ labels: {} tcpSocket: {} timeoutSeconds: 5 readiness: - failureThreshold: 3 + failureThreshold: 10 httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 startup: {} @@ -3435,10 +3435,10 @@ labels: {} tcpSocket: {} timeoutSeconds: 5 readiness: - failureThreshold: 3 + failureThreshold: 10 httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 startup: {} @@ -3952,11 +3952,11 @@ loggerFormat: json port: manager-http timeoutSeconds: 5 readiness: - failureThreshold: 3 + failureThreshold: 10 httpGet: port: manager-http initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 startup: {} diff --git a/charts/victoria-metrics-cluster/values.yaml b/charts/victoria-metrics-cluster/values.yaml index 0e0013326..95959b4e4 100644 --- a/charts/victoria-metrics-cluster/values.yaml +++ b/charts/victoria-metrics-cluster/values.yaml @@ -109,9 +109,9 @@ vmselect: readiness: httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 - failureThreshold: 3 + failureThreshold: 10 # -- VMSelect liveness probe liveness: tcpSocket: {} @@ -389,9 +389,9 @@ vminsert: readiness: httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 - failureThreshold: 3 + failureThreshold: 10 # -- VMInsert liveness 
probe liveness: tcpSocket: {} @@ -639,9 +639,9 @@ vmauth: readiness: httpGet: {} initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 - failureThreshold: 3 + failureThreshold: 10 # -- VMAuth liveness probe liveness: tcpSocket: {} @@ -1108,9 +1108,9 @@ vmstorage: httpGet: port: manager-http initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 - failureThreshold: 3 + failureThreshold: 10 # -- VMBackupManager liveness probe liveness: tcpSocket: From 4b4417597c47fd0e824213ba05f0c208db2ebb62 Mon Sep 17 00:00:00 2001 From: func25 Date: Wed, 8 Jan 2025 14:04:47 +0700 Subject: [PATCH 3/8] remove vmstorage readiness probe --- charts/victoria-metrics-cluster/CHANGELOG.md | 3 ++- charts/victoria-metrics-cluster/values.yaml | 7 +------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index 6d4b53bf6..1a1b7fc85 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,6 +1,7 @@ ## Next release -- all: reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). +- vmstorage: remove readiness probe (reason: vminsert already handles routing and retries, and readiness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions) +- all (except vmstorage): reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). 
## 0.16.2 diff --git a/charts/victoria-metrics-cluster/values.yaml b/charts/victoria-metrics-cluster/values.yaml index 95959b4e4..1b6a7e79a 100644 --- a/charts/victoria-metrics-cluster/values.yaml +++ b/charts/victoria-metrics-cluster/values.yaml @@ -1024,12 +1024,7 @@ vmstorage: # -- Readiness & Liveness probes probe: # -- VMStorage readiness probe - readiness: - httpGet: {} - initialDelaySeconds: 5 - periodSeconds: 5 - timeoutSeconds: 5 - failureThreshold: 10 + readiness: {} # -- VMStorage liveness probe liveness: tcpSocket: {} From 2d2a2f0eac46a73c01e7e218107a9d38d2812d53 Mon Sep 17 00:00:00 2001 From: Andrii Chubatiuk Date: Wed, 8 Jan 2025 09:19:59 +0200 Subject: [PATCH 4/8] updated snapshots --- .../tests/__snapshot__/vmauth_test.yaml.snap | 8 ++++---- .../tests/__snapshot__/vminsert_test.yaml.snap | 8 ++++---- .../tests/__snapshot__/vmselect_test.yaml.snap | 16 ++++++++-------- .../__snapshot__/vmstorage_test.yaml.snap | 18 ------------------ 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vmauth_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vmauth_test.yaml.snap index 533cdc19c..027659ccc 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vmauth_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vmauth_test.yaml.snap @@ -49,13 +49,13 @@ deployment should match snapshot: - containerPort: 8427 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 volumeMounts: - mountPath: /config @@ -118,13 +118,13 @@ deployment should match snapshot with fullnameOverride, extraLabels and podLabel - containerPort: 8427 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 
5 volumeMounts: - mountPath: /config diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vminsert_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vminsert_test.yaml.snap index af012c047..9ae5c7a6f 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vminsert_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vminsert_test.yaml.snap @@ -50,13 +50,13 @@ deployment should match snapshot: - containerPort: 8480 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 serviceAccountName: RELEASE-NAME-victoria-metrics-cluster deployment should match snapshot with fullnameOverride, extraLabels and podLabels: @@ -113,12 +113,12 @@ deployment should match snapshot with fullnameOverride, extraLabels and podLabel - containerPort: 8480 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 serviceAccountName: RELEASE-NAME-victoria-metrics-cluster diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vmselect_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vmselect_test.yaml.snap index ce2e0e62e..3caa54048 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vmselect_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vmselect_test.yaml.snap @@ -51,13 +51,13 @@ deployment should match snapshot: - containerPort: 8481 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 securityContext: {} volumeMounts: @@ -123,13 +123,13 @@ deployment should match snapshot with fullnameOverride, extraLabels and podLabel - containerPort: 8481 name: 
http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 securityContext: {} volumeMounts: @@ -197,13 +197,13 @@ statefulset should match snapshot: - containerPort: 8481 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 securityContext: {} volumeMounts: @@ -273,13 +273,13 @@ statefulset should match snapshot with fullnameOverride, extraLabels and podLabe - containerPort: 8481 name: http readinessProbe: - failureThreshold: 3 + failureThreshold: 10 httpGet: path: /health port: http scheme: HTTP initialDelaySeconds: 5 - periodSeconds: 15 + periodSeconds: 5 timeoutSeconds: 5 securityContext: {} volumeMounts: diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap index f8f2a54af..8bbe5a128 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap @@ -55,15 +55,6 @@ statefulset should match snapshot: name: vminsert - containerPort: 8401 name: vmselect - readinessProbe: - failureThreshold: 3 - httpGet: - path: /health - port: http - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 15 - timeoutSeconds: 5 volumeMounts: - mountPath: /storage name: vmstorage-volume @@ -139,15 +130,6 @@ statefulset should match snapshot with fullnameOverride, extraLabels and podLabe name: vminsert - containerPort: 8401 name: vmselect - readinessProbe: - failureThreshold: 3 - httpGet: - path: /health - port: http - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 15 - timeoutSeconds: 5 volumeMounts: - mountPath: /storage name: vmstorage-volume From 
91e88e8f738e9ab62a1e496e3908d013595f20c9 Mon Sep 17 00:00:00 2001 From: Haley Wang Date: Wed, 8 Jan 2025 15:32:37 +0800 Subject: [PATCH 5/8] fix --- charts/victoria-metrics-cluster/CHANGELOG.md | 4 +-- .../__snapshot__/vmstorage_test.yaml.snap | 32 +++++++++++-------- charts/victoria-metrics-cluster/values.yaml | 10 +++--- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index 1a1b7fc85..5158c4c73 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,7 +1,7 @@ ## Next release -- vmstorage: remove readiness probe (reason: vminsert already handles routing and retries, and readiness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions) -- all (except vmstorage): reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). +- Remove vmstorage readiness probe, as vminsert already handles routing and retries, while readiness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions. +- Reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). 
## 0.16.2 diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap index 8bbe5a128..8426e2f7a 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap @@ -40,13 +40,6 @@ statefulset should match snapshot: - --storageDataPath=/storage image: victoriametrics/vmstorage:0.1.0-cluster imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 10 - initialDelaySeconds: 30 - periodSeconds: 30 - tcpSocket: - port: http - timeoutSeconds: 5 name: vmstorage ports: - containerPort: 8482 @@ -55,6 +48,15 @@ statefulset should match snapshot: name: vminsert - containerPort: 8401 name: vmselect + readinessProbe: + failureThreshold: 10 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 volumeMounts: - mountPath: /storage name: vmstorage-volume @@ -115,13 +117,6 @@ statefulset should match snapshot with fullnameOverride, extraLabels and podLabe - --storageDataPath=/storage image: victoriametrics/vmstorage:0.1.0-cluster imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 10 - initialDelaySeconds: 30 - periodSeconds: 30 - tcpSocket: - port: http - timeoutSeconds: 5 name: vmstorage ports: - containerPort: 8482 @@ -130,6 +125,15 @@ statefulset should match snapshot with fullnameOverride, extraLabels and podLabe name: vminsert - containerPort: 8401 name: vmselect + readinessProbe: + failureThreshold: 10 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 volumeMounts: - mountPath: /storage name: vmstorage-volume diff --git a/charts/victoria-metrics-cluster/values.yaml b/charts/victoria-metrics-cluster/values.yaml index 1b6a7e79a..17f606e1f 100644 --- a/charts/victoria-metrics-cluster/values.yaml +++ 
b/charts/victoria-metrics-cluster/values.yaml @@ -1024,12 +1024,10 @@ vmstorage: # -- Readiness & Liveness probes probe: # -- VMStorage readiness probe - readiness: {} - # -- VMStorage liveness probe - liveness: - tcpSocket: {} - initialDelaySeconds: 30 - periodSeconds: 30 + readiness: + httpGet: {} + initialDelaySeconds: 5 + periodSeconds: 5 timeoutSeconds: 5 failureThreshold: 10 # -- VMStorage startup probe From cd409e747812675eaa11adbe82f57b4440e7d349 Mon Sep 17 00:00:00 2001 From: Haley Wang Date: Wed, 8 Jan 2025 15:51:12 +0800 Subject: [PATCH 6/8] vmstorage: add a default minReadySeconds --- charts/victoria-metrics-cluster/CHANGELOG.md | 1 + .../templates/vmstorage-statefulset.yaml | 1 + .../tests/__snapshot__/vmstorage_test.yaml.snap | 2 ++ charts/victoria-metrics-cluster/values.yaml | 3 ++- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index 5158c4c73..71f649531 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -2,6 +2,7 @@ - Remove vmstorage readiness probe, as vminsert already handles routing and retries, while readiness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions. - Reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). +- Add a default minReadySeconds for vmstorage, to help stabilizing service during rollout. ## 0.16.2 diff --git a/charts/victoria-metrics-cluster/templates/vmstorage-statefulset.yaml b/charts/victoria-metrics-cluster/templates/vmstorage-statefulset.yaml index ac02cc21e..07eb190b8 100644 --- a/charts/victoria-metrics-cluster/templates/vmstorage-statefulset.yaml +++ b/charts/victoria-metrics-cluster/templates/vmstorage-statefulset.yaml @@ -250,6 +250,7 @@ spec: {{- end }} {{- include "vm.license.volume" . 
| nindent 8 }} {{- end }} + minReadySeconds: {{ $app.minReadySeconds }} {{- if and $app.persistentVolume.enabled (not $app.persistentVolume.existingClaim) }} volumeClaimTemplates: - apiVersion: v1 diff --git a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap index 8426e2f7a..ea9e0e1f2 100644 --- a/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap +++ b/charts/victoria-metrics-cluster/tests/__snapshot__/vmstorage_test.yaml.snap @@ -13,6 +13,7 @@ statefulset should match snapshot: name: RELEASE-NAME-victoria-metrics-cluster-vmstorage namespace: NAMESPACE spec: + minReadySeconds: 5 podManagementPolicy: OrderedReady replicas: 2 selector: @@ -89,6 +90,7 @@ statefulset should match snapshot with fullnameOverride, extraLabels and podLabe name: vmstorage-node namespace: NAMESPACE spec: + minReadySeconds: 5 podManagementPolicy: OrderedReady replicas: 2 selector: diff --git a/charts/victoria-metrics-cluster/values.yaml b/charts/victoria-metrics-cluster/values.yaml index 17f606e1f..5a5927997 100644 --- a/charts/victoria-metrics-cluster/values.yaml +++ b/charts/victoria-metrics-cluster/values.yaml @@ -1021,7 +1021,8 @@ vmstorage: ipFamilies: [] # -- Pod's termination grace period in seconds terminationGracePeriodSeconds: 60 - # -- Readiness & Liveness probes + minReadySeconds: 5 + # -- Readiness probes probe: # -- VMStorage readiness probe readiness: From 485f6a9e945d897f6f0fee8ec272ceebc9f642f7 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 8 Jan 2025 16:39:12 +0800 Subject: [PATCH 7/8] Update charts/victoria-metrics-cluster/CHANGELOG.md Co-authored-by: Phuong Le <39565248+func25@users.noreply.github.com> --- charts/victoria-metrics-cluster/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index 71f649531..79140faae 100644 
--- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,6 +1,6 @@ ## Next release -- Remove vmstorage readiness probe, as vminsert already handles routing and retries, while readiness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions. +- Remove vmstorage liveness probe, as vminsert already handles routing and retries, while liveness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions. - Reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). - Add a default minReadySeconds for vmstorage, to help stabilizing service during rollout. From 2b11f66df04690bbec2ae424f37663e2ecd42390 Mon Sep 17 00:00:00 2001 From: Haley Wang Date: Thu, 9 Jan 2025 14:27:22 +0800 Subject: [PATCH 8/8] add update note --- charts/victoria-metrics-cluster/CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/victoria-metrics-cluster/CHANGELOG.md b/charts/victoria-metrics-cluster/CHANGELOG.md index 79140faae..1779ccb53 100644 --- a/charts/victoria-metrics-cluster/CHANGELOG.md +++ b/charts/victoria-metrics-cluster/CHANGELOG.md @@ -1,5 +1,8 @@ ## Next release +**Update note**: A default `minReadySeconds` has been added for the vmstorage statefulset, so vmstorage pods will restart after the upgrade. +**Update note**: The default probes of vminsert, vmselect, vmauth, and vmstorage have been changed, so all pods will restart after the upgrade. + - Remove vmstorage liveness probe, as vminsert already handles routing and retries, while liveness probes can inadvertently introduce delays, DNS instability, and unnecessary disruptions. - Reduce the default readiness probe interval to 5s (was 15s) and the failure threshold to 10 (was 3). - Add a default minReadySeconds for vmstorage, to help stabilizing service during rollout.