From 2f642869f353359baaf58b67026a3b81ce728e9c Mon Sep 17 00:00:00 2001 From: tschneid Date: Wed, 8 Jan 2025 15:35:04 -0600 Subject: [PATCH] refactor amw, dce, dcr, and aks-metrics-enable - move metrics modules from ADO -> ARO-HCP - add default node/k8s recording rule group for all clusters with the Azure Monitoring Workspace - use bicep to deploy data collection endpoints and data collection rules - enable metrics within aks-cluster-base.bicep - remove unused make targets and pipeline steps --- dev-infrastructure/Makefile | 16 +- .../mgmt-cluster.tmpl.bicepparam | 3 + .../svc-cluster.tmpl.bicepparam | 3 + dev-infrastructure/mgmt-pipeline.yaml | 16 -- .../modules/aks-cluster-base.bicep | 24 +++ .../modules/metrics/datacollection.bicep | 60 +++++++ .../modules/metrics/metrics.bicep | 2 +- .../modules/metrics/monitor.bicep | 58 ++++++ .../rules/defaultRecordingRuleGroups.bicep | 168 ++++++++++++++++++ .../rules/prometheusAlertingRules.bicep | 25 +++ dev-infrastructure/svc-pipeline.yaml | 16 -- .../templates/mgmt-cluster.bicep | 20 ++- .../templates/svc-cluster.bicep | 19 ++ 13 files changed, 382 insertions(+), 48 deletions(-) create mode 100644 dev-infrastructure/modules/metrics/datacollection.bicep create mode 100644 dev-infrastructure/modules/metrics/monitor.bicep create mode 100644 dev-infrastructure/modules/metrics/rules/defaultRecordingRuleGroups.bicep create mode 100644 dev-infrastructure/modules/metrics/rules/prometheusAlertingRules.bicep diff --git a/dev-infrastructure/Makefile b/dev-infrastructure/Makefile index eb8eb8a0c8..e42d918131 100644 --- a/dev-infrastructure/Makefile +++ b/dev-infrastructure/Makefile @@ -263,7 +263,7 @@ svc.kv.permission: @scripts/kv-permissions.sh $(PRINCIPAL_ID) $(SVC_KV_RESOURCEGROUP) $(SVC_KV_NAME) .PHONY: svc.kv.permission -svc.init: region svc svc.aks.admin-access svc.aks.kubeconfig svc.istio metrics-infra svc.enable-aks-metrics svc.oidc.storage.permissions +svc.init: region metrics-infra svc svc.aks.admin-access 
svc.aks.kubeconfig svc.istio svc.oidc.storage.permissions .PHONY: svc.init svc.what-if: svc.rg @@ -340,18 +340,6 @@ mgmt: mgmt.wait mgmt.rg persist=${PERSIST} .PHONY: mgmt -mgmt.enable-aks-metrics: - @if [ "$$(az aks show --resource-group $(MGMT_RESOURCEGROUP) --name ${AKS_NAME} --query 'azureMonitorProfile.metrics.enabled' -o tsv)" = "true" ]; then \ - echo "Azure Monitor metrics are already enabled."; \ - else \ - az aks update --enable-azure-monitor-metrics \ - --resource-group $(MGMT_RESOURCEGROUP) \ - --name ${AKS_NAME} \ - --azure-monitor-workspace-resource-id $$(az deployment group show --resource-group $(REGIONAL_RESOURCEGROUP) --name metrics-infra --output tsv --query properties.outputs.monitorId.value) \ - --grafana-resource-id $$(az deployment group show --resource-group $(REGIONAL_RESOURCEGROUP) --name metrics-infra --output tsv --query properties.outputs.grafanaId.value); \ - fi -.PHONY: mgmt.enable-aks-metrics - mgmt.aks.admin-access: @scripts/aks-admin-access.sh $(MGMT_RESOURCEGROUP) $(PRINCIPAL_ID) .PHONY: mgmt.aks.admin-access @@ -365,7 +353,7 @@ mgmt.aks.kubeconfigfile: @echo ${MGMT_KUBECONFIG_FILE} .PHONY: mgmt.aks.kubeconfigfile -mgmt.init: region mgmt mgmt.aks.admin-access mgmt.aks.kubeconfig metrics-infra mgmt.enable-aks-metrics +mgmt.init: region metrics-infra mgmt mgmt.aks.admin-access mgmt.aks.kubeconfig .PHONY: mgmt.init mgmt.what-if: mgmt.rg diff --git a/dev-infrastructure/configurations/mgmt-cluster.tmpl.bicepparam b/dev-infrastructure/configurations/mgmt-cluster.tmpl.bicepparam index 610efa3baa..5974a49cb5 100644 --- a/dev-infrastructure/configurations/mgmt-cluster.tmpl.bicepparam +++ b/dev-infrastructure/configurations/mgmt-cluster.tmpl.bicepparam @@ -40,3 +40,6 @@ param mgmtKeyVaultName = '{{ .mgmtKeyVault.name }}' // MI for deployment scripts param aroDevopsMsiId = '{{ .aroDevopsMsiId }}' + +// Azure Monitor Workspace +param azureMonitorWorkspaceName = '{{ .monitoring.workspaceName }}' diff --git 
a/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam b/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam index 882f6777c9..63e9d2fc90 100644 --- a/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam +++ b/dev-infrastructure/configurations/svc-cluster.tmpl.bicepparam @@ -50,3 +50,6 @@ param regionalDNSZoneName = '{{ .regionalDNSSubdomain}}.{{ .baseDnsZoneName }}' param regionalResourceGroup = '{{ .regionRG }}' param frontendIngressCertName = '{{ .frontend.cert.name }}' + +// Azure Monitor Workspace +param azureMonitorWorkspaceName = '{{ .monitoring.workspaceName }}' diff --git a/dev-infrastructure/mgmt-pipeline.yaml b/dev-infrastructure/mgmt-pipeline.yaml index ba1a9959a7..3ff1b5e018 100644 --- a/dev-infrastructure/mgmt-pipeline.yaml +++ b/dev-infrastructure/mgmt-pipeline.yaml @@ -32,19 +32,3 @@ resourceGroups: deploymentLevel: ResourceGroup dependsOn: - mgmt-infra - - name: enable-metrics - action: Shell - command: scripts/enable-aks-metrics.sh - variables: - - name: RESOURCEGROUP - configRef: mgmt.rg - - name: AKS_NAME - configRef: aksName - - name: GRAFANA_RESOURCEGROUP - configRef: regionRG - - name: MONITORING_WORKSPACE_NAME - configRef: monitoring.workspaceName - - name: GRAFANA_NAME - configRef: monitoring.grafanaName - dependsOn: - - mgmt-cluster diff --git a/dev-infrastructure/modules/aks-cluster-base.bicep b/dev-infrastructure/modules/aks-cluster-base.bicep index f949b7af56..869d847189 100644 --- a/dev-infrastructure/modules/aks-cluster-base.bicep +++ b/dev-infrastructure/modules/aks-cluster-base.bicep @@ -3,6 +3,11 @@ param aksClusterName string param aksNodeResourceGroupName string param aksEtcdKVEnableSoftDelete bool +// Metrics +param dcrId string +param metricLabelsAllowlist string = '' +param metricAnnotationsAllowList string = '' + // System agentpool spec(Infra) param systemAgentMinCount int = 2 param systemAgentMaxCount int = 3 @@ -302,6 +307,15 @@ resource aksCluster 
'Microsoft.ContainerService/managedClusters@2024-04-02-previ nodeOSUpgradeChannel: 'NodeImage' upgradeChannel: 'patch' } + azureMonitorProfile: { + metrics: { + enabled: true + kubeStateMetrics: { + metricLabelsAllowlist: metricLabelsAllowlist + metricAnnotationsAllowList: metricAnnotationsAllowList + } + } + } disableLocalAccounts: true dnsPrefix: dnsPrefix enableRBAC: true @@ -517,6 +531,16 @@ resource aroDevopsMSIClusterAdmin 'Microsoft.Authorization/roleAssignments@2022- } } +// metrics dcr association +resource azuremonitormetrics_dcra_clusterResourceId 'Microsoft.Insights/dataCollectionRuleAssociations@2022-06-01' = { + name: '${resourceGroup().name}-${aksCluster.name}-dcra' + scope: aksCluster + properties: { + description: 'Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster.' + dataCollectionRuleId: dcrId + } +} + // Outputs output userAssignedIdentities array = [ for i in range(0, length(workloadIdentities)): { diff --git a/dev-infrastructure/modules/metrics/datacollection.bicep b/dev-infrastructure/modules/metrics/datacollection.bicep new file mode 100644 index 0000000000..922905544e --- /dev/null +++ b/dev-infrastructure/modules/metrics/datacollection.bicep @@ -0,0 +1,60 @@ +param azureMonitorWorkspaceName string +param azureMonitorWorkspaceLocation string +param aksClusterName string +param regionalResourceGroup string + +var dceName = take('MSProm-${azureMonitorWorkspaceLocation}-${aksClusterName}', 44) +var dcrName = take('MSProm-${azureMonitorWorkspaceLocation}-${aksClusterName}', 44) + +resource amw 'microsoft.monitor/accounts@2021-06-03-preview' existing = { + name: azureMonitorWorkspaceName + scope: resourceGroup(regionalResourceGroup) +} + +resource dce 'Microsoft.Insights/dataCollectionEndpoints@2022-06-01' = { + name: dceName + location: azureMonitorWorkspaceLocation + kind: 'Linux' + properties: {} +} + +resource dcr 'Microsoft.Insights/dataCollectionRules@2022-06-01' = { + name: 
dcrName + location: azureMonitorWorkspaceLocation + kind: 'Linux' + properties: { + dataCollectionEndpointId: dce.id + dataFlows: [ + { + destinations: [ + 'MonitoringAccount1' + ] + streams: [ + 'Microsoft-PrometheusMetrics' + ] + } + ] + dataSources: { + prometheusForwarder: [ + { + name: 'PrometheusDataSource' + streams: [ + 'Microsoft-PrometheusMetrics' + ] + labelIncludeFilter: {} + } + ] + } + description: 'DCR for Azure Monitor Metrics Profile (Managed Prometheus)' + destinations: { + monitoringAccounts: [ + { + accountResourceId: amw.id + name: 'MonitoringAccount1' + } + ] + } + } +} + +output dcrId string = dcr.id diff --git a/dev-infrastructure/modules/metrics/metrics.bicep b/dev-infrastructure/modules/metrics/metrics.bicep index a5dd65972c..6011472cf1 100644 --- a/dev-infrastructure/modules/metrics/metrics.bicep +++ b/dev-infrastructure/modules/metrics/metrics.bicep @@ -28,7 +28,7 @@ module grafana 'br:arointacr.azurecr.io/grafana.bicep:metrics.20240814.1' = { } } -module monitor 'br:arointacr.azurecr.io/monitor.bicep:monitor.20241004.1' = { +module monitor 'monitor.bicep' = { name: 'monitor' params: { globalResourceGroup: globalResourceGroup diff --git a/dev-infrastructure/modules/metrics/monitor.bicep b/dev-infrastructure/modules/metrics/monitor.bicep new file mode 100644 index 0000000000..cd4b2a1bb4 --- /dev/null +++ b/dev-infrastructure/modules/metrics/monitor.bicep @@ -0,0 +1,58 @@ +@description('Metrics global resource group name') +param globalResourceGroup string + +@description('Metrics global MSI name') +param msiName string + +@description('Metrics global Grafana name') +param grafanaName string + +@description('Metrics region monitor name') +param monitorName string = 'aro-hcp-monitor' + +resource monitor 'microsoft.monitor/accounts@2021-06-03-preview' = { + name: monitorName + location: resourceGroup().location +} + +module defaultRuleGroups 'rules/defaultRecordingRuleGroups.bicep' ={ + name: 'defaultRecordingRuleGroups' + params: { + 
azureMonitorWorkspaceLocation: resourceGroup().location + azureMonitorWorkspaceName: monitorName + regionalResourceGroup: resourceGroup().name + } +} +// Assign the Monitoring Data Reader role to the Azure Managed Grafana system-assigned managed identity at the workspace scope +var dataReader = 'b0d8363b-8ddd-447d-831f-62ca05bff136' + +resource msi 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' existing = { + name: msiName + scope: resourceGroup(globalResourceGroup) +} + +resource grafana 'Microsoft.Dashboard/grafana@2023-09-01' existing = { + name: grafanaName + scope: resourceGroup(globalResourceGroup) +} + +resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(monitor.id, grafana.id, dataReader) + scope: monitor + properties: { + principalId: grafana.identity.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', dataReader) + } +} + +module prometheus 'rules/prometheusAlertingRules.bicep' = { + name: 'prometheusAlertingRules' + params: { + azureMonitoring: monitor.id + } +} + +output msiId string = msi.id +output grafanaId string = grafana.id +output monitorId string = monitor.id diff --git a/dev-infrastructure/modules/metrics/rules/defaultRecordingRuleGroups.bicep b/dev-infrastructure/modules/metrics/rules/defaultRecordingRuleGroups.bicep new file mode 100644 index 0000000000..d0c200d860 --- /dev/null +++ b/dev-infrastructure/modules/metrics/rules/defaultRecordingRuleGroups.bicep @@ -0,0 +1,168 @@ +param regionalResourceGroup string +param azureMonitorWorkspaceName string +param azureMonitorWorkspaceLocation string + +resource amw 'microsoft.monitor/accounts@2021-06-03-preview' existing = { + name: azureMonitorWorkspaceName + scope: resourceGroup(regionalResourceGroup) +} + +// default recording rules from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep +resource 
kubernetesRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'all-clusters-defaultK8sRecordingRules' + location: azureMonitorWorkspaceLocation + properties: { + description: 'default kubernetes recording rules' + scopes: [amw.id] + enabled: true + interval: 'PT1M' + rules: [ + { + record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate' + expression: 'sum by (cluster, namespace, pod, container) ( irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}))' + } + { + record: 'node_namespace_pod_container:container_memory_working_set_bytes' + expression: 'container_memory_working_set_bytes{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' + } + { + record: 'node_namespace_pod_container:container_memory_rss' + expression: 'container_memory_rss{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' + } + { + record: 'node_namespace_pod_container:container_memory_cache' + expression: 'container_memory_cache{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' + } + { + record: 'node_namespace_pod_container:container_memory_swap' + expression: 'container_memory_swap{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' + } + { + record: 'cluster:namespace:pod_memory:active:kube_pod_container_resource_requests' + expression: 'kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by 
(namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' + } + { + record: 'namespace_memory:kube_pod_container_resource_requests:sum' + expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' + } + { + record: 'cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests' + expression: 'kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' + } + { + record: 'namespace_cpu:kube_pod_container_resource_requests:sum' + expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' + } + { + record: 'cluster:namespace:pod_memory:active:kube_pod_container_resource_limits' + expression: 'kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' + } + { + record: 'namespace_memory:kube_pod_container_resource_limits:sum' + expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' + } + { + 
record: 'cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits' + expression: 'kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) )' + } + { + record: 'namespace_cpu:kube_pod_container_resource_limits:sum' + expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' + } + { + record: 'namespace_workload_pod:kube_pod_owner:relabel' + expression: 'max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( 1, max by (replicaset, namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics"} ) ), "workload", "$1", "owner_name", "(.*)" ))' + labels: { + workload_type: 'deployment' + } + } + { + record: 'namespace_workload_pod:kube_pod_owner:relabel' + expression: 'max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ))' + labels: { + workload_type: 'daemonset' + } + } + { + record: 'namespace_workload_pod:kube_pod_owner:relabel' + expression: 'max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ))' + labels: { + workload_type: 'statefulset' + } + } + { + record: 'namespace_workload_pod:kube_pod_owner:relabel' + expression: 'max by (cluster, namespace, workload, pod) ( label_replace( 
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)" ))' + labels: { + workload_type: 'job' + } + } + { + record: ':node_memory_MemAvailable_bytes:sum' + expression: 'sum( node_memory_MemAvailable_bytes{job="node"} or ( node_memory_Buffers_bytes{job="node"} + node_memory_Cached_bytes{job="node"} + node_memory_MemFree_bytes{job="node"} + node_memory_Slab_bytes{job="node"} )) by (cluster)' + } + { + record: 'cluster:node_cpu:ratio_rate5m' + expression: 'sum(rate(node_cpu_seconds_total{job="node",mode!="idle",mode!="iowait",mode!="steal"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job="node"}) by (cluster, instance, cpu)) by (cluster)' + } + ] + } +} +// default recording rules from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep +resource nodeRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'all-cluster-defaultNodeRecordingRules' + + location: azureMonitorWorkspaceLocation + properties: { + description: 'default node recording rules' + scopes: [amw.id] + enabled: true + interval: 'PT1M' + rules: [ + { + record: 'instance:node_num_cpu:sum' + expression: 'count without (cpu, mode) ( node_cpu_seconds_total{job="node",mode="idle"})' + } + { + record: 'instance:node_cpu_utilisation:rate5m' + expression: '1 - avg without (cpu) ( sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m])))' + } + { + record: 'instance:node_load1_per_cpu:ratio' + expression: '( node_load1{job="node"}/ instance:node_num_cpu:sum{job="node"})' + } + { + record: 'instance:node_memory_utilisation:ratio' + expression: '1 - ( ( node_memory_MemAvailable_bytes{job="node"} or ( node_memory_Buffers_bytes{job="node"} + node_memory_Cached_bytes{job="node"} + node_memory_MemFree_bytes{job="node"} + node_memory_Slab_bytes{job="node"} ) )/ node_memory_MemTotal_bytes{job="node"})' + } + { + record: 
'instance:node_vmstat_pgmajfault:rate5m' + expression: 'rate(node_vmstat_pgmajfault{job="node"}[5m])' + } + { + record: 'instance_device:node_disk_io_time_seconds:rate5m' + expression: 'rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m])' + } + { + record: 'instance_device:node_disk_io_time_weighted_seconds:rate5m' + expression: 'rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m])' + } + { + record: 'instance:node_network_receive_bytes_excluding_lo:rate5m' + expression: 'sum without (device) ( rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m]))' + } + { + record: 'instance:node_network_transmit_bytes_excluding_lo:rate5m' + expression: 'sum without (device) ( rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m]))' + } + { + record: 'instance:node_network_receive_drop_excluding_lo:rate5m' + expression: 'sum without (device) ( rate(node_network_receive_drop_total{job="node", device!="lo"}[5m]))' + } + { + record: 'instance:node_network_transmit_drop_excluding_lo:rate5m' + expression: 'sum without (device) ( rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m]))' + } + ] + } +} diff --git a/dev-infrastructure/modules/metrics/rules/prometheusAlertingRules.bicep b/dev-infrastructure/modules/metrics/rules/prometheusAlertingRules.bicep new file mode 100644 index 0000000000..8f63e54c8d --- /dev/null +++ b/dev-infrastructure/modules/metrics/rules/prometheusAlertingRules.bicep @@ -0,0 +1,25 @@ +param azureMonitoring string + +resource prometheusRuleGroups 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { + name: 'hcp-prometheus-rules' + location: resourceGroup().location + properties: { + rules: [ + { + // Copy from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/recommendedMetricAlerts.bicep + alert: 'KubePodImagePull' + expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="ImagePullBackOff", job="kube-state-metrics"}[5m]) >= 
1' + for: 'PT15M' + enabled: true + severity: 4 + resolveConfiguration: { + autoResolved: true + timeToResolve: 'PT10M' + } + } + ] + scopes: [ + azureMonitoring + ] + } +} diff --git a/dev-infrastructure/svc-pipeline.yaml b/dev-infrastructure/svc-pipeline.yaml index 6f842de1a6..041a0f7e84 100644 --- a/dev-infrastructure/svc-pipeline.yaml +++ b/dev-infrastructure/svc-pipeline.yaml @@ -18,22 +18,6 @@ resourceGroups: deploymentLevel: ResourceGroup dependsOn: - svc-infra - - name: enable-metrics - action: Shell - command: scripts/enable-aks-metrics.sh - variables: - - name: RESOURCEGROUP - configRef: svc.rg - - name: AKS_NAME - configRef: aksName - - name: GRAFANA_RESOURCEGROUP - configRef: regionRG - - name: MONITORING_WORKSPACE_NAME - configRef: monitoring.workspaceName - - name: GRAFANA_NAME - configRef: monitoring.grafanaName - dependsOn: - - svc - name: istio action: Shell command: scripts/istio.sh diff --git a/dev-infrastructure/templates/mgmt-cluster.bicep b/dev-infrastructure/templates/mgmt-cluster.bicep index af6138f73e..3b0ee43e98 100644 --- a/dev-infrastructure/templates/mgmt-cluster.bicep +++ b/dev-infrastructure/templates/mgmt-cluster.bicep @@ -83,6 +83,9 @@ param mgmtKeyVaultName string @description('MSI that will be used to run deploymentScripts') param aroDevopsMsiId string +@description('The name of the Azure Monitor Workspace (stores prometheus metrics)') +param azureMonitorWorkspaceName string + module mgmtCluster '../modules/aks-cluster-base.bicep' = { name: 'cluster' scope: resourceGroup() @@ -117,13 +120,28 @@ module mgmtCluster '../modules/aks-cluster-base.bicep' = { systemOsDiskSizeGB: aksSystemOsDiskSizeGB userOsDiskSizeGB: aksUserOsDiskSizeGB aroDevopsMsiId: aroDevopsMsiId + dcrId: dataCollection.outputs.dcrId } } output aksClusterName string = mgmtCluster.outputs.aksClusterName // -// K E Y V A U L T S +// M E T R I C S +// + +module dataCollection '../modules/metrics/datacollection.bicep' = { + name:
'${resourceGroup().name}-${aksClusterName}' + params: { + azureMonitorWorkspaceLocation: location + azureMonitorWorkspaceName: azureMonitorWorkspaceName + regionalResourceGroup: regionalResourceGroup + aksClusterName: aksClusterName + } +} + +// +// K E Y V A U L T S // module cxCSIKeyVaultAccess '../modules/keyvault/keyvault-secret-access.bicep' = [ diff --git a/dev-infrastructure/templates/svc-cluster.bicep b/dev-infrastructure/templates/svc-cluster.bicep index 8d2b527dcf..83923ccfa0 100644 --- a/dev-infrastructure/templates/svc-cluster.bicep +++ b/dev-infrastructure/templates/svc-cluster.bicep @@ -140,6 +140,9 @@ param regionalDNSZoneName string @description('Frontend Ingress Certificate Name') param frontendIngressCertName string +@description('The name of the Azure Monitor Workspace (stores prometheus metrics)') +param azureMonitorWorkspaceName string + var clusterServiceMIName = 'clusters-service' resource serviceKeyVault 'Microsoft.KeyVault/vaults@2024-04-01-preview' existing = { @@ -199,10 +202,26 @@ module svcCluster '../modules/aks-cluster-base.bicep' = { aksKeyVaultName: aksKeyVaultName acrPullResourceGroups: acrPullResourceGroups aroDevopsMsiId: aroDevopsMsiId + dcrId: dataCollection.outputs.dcrId } } output aksClusterName string = svcCluster.outputs.aksClusterName + +// +// M E T R I C S +// + +module dataCollection '../modules/metrics/datacollection.bicep' = { + name: '${resourceGroup().name}-${aksClusterName}' + params: { + azureMonitorWorkspaceLocation: location + azureMonitorWorkspaceName: azureMonitorWorkspaceName + regionalResourceGroup: regionalResourceGroup + aksClusterName: aksClusterName + } +} + var frontendMI = filter(svcCluster.outputs.userAssignedIdentities, id => id.uamiName == 'frontend')[0] var backendMI = filter(svcCluster.outputs.userAssignedIdentities, id => id.uamiName == 'backend')[0]