2024-01-24 14:12:50 +01:00
parent 7e2440ef75
commit 974d30ef28
12 changed files with 186 additions and 121 deletions

@@ -28,17 +28,17 @@ data "kustomization_overlay" "data" {
   patches {
     target {
       kind = "Alertmanager"
-      name = "prometheus-community-kube-alertmanager"
+      name = "alertmanager-kube-promethe-alertmanager"
     }
     patch = <<-EOF
       apiVersion: monitoring.coreos.com/v1
       kind: Alertmanager
       metadata:
-        name: prometheus-community-kube-alertmanager
+        name: alertmanager-kube-promethe-alertmanager
       spec:
         image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
         version: ${var.images.alertmanager.tag}
-        externalUrl: http://alertmanager-operated.${var.namespace}:9093
+        externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
         replicas: ${var.replicas}
         listenLocal: ${var.listenLocal}
         logLevel: "${var.logLevel}"
@@ -72,7 +72,7 @@ data "kustomization_overlay" "data" {
   patches {
     target {
       kind = "ServiceMonitor"
-      name = "prometheus-community-kube-alertmanager"
+      name = "alertmanager-kube-promethe-alertmanager"
     }
     patch = <<-EOF
       - op: replace
@@ -83,13 +83,13 @@ data "kustomization_overlay" "data" {
   patches {
     target {
       kind = "PrometheusRule"
-      name = "prometheus-community-kube-alertmanager.rules"
+      name = "alertmanager-kube-promethe-alertmanager.rules"
     }
     patch = <<-EOF
       apiVersion: monitoring.coreos.com/v1
       kind: PrometheusRule
      metadata:
-        name: prometheus-community-kube-alertmanager.rules
+        name: alertmanager-kube-promethe-alertmanager.rules
       spec:
         groups:
         - name: alertmanager.rules
@@ -98,53 +98,53 @@ spec:
             expr: |-
               # Without max_over_time, failed scrapes could create false negatives, see
               # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-              max_over_time(alertmanager_config_last_reload_successful{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]) == 0
+              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
           - alert: AlertmanagerMembersInconsistent
             expr: |-
               # Without max_over_time, failed scrapes could create false negatives, see
               # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-              max_over_time(alertmanager_cluster_members{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
+              max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
               < on (namespace,service,cluster) group_left
-              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]))
+              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
           - alert: AlertmanagerFailedToSendAlerts
             expr: |-
               (
-                rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
+                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
               /
-                ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
+                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
               )
               > 0.01
           - alert: AlertmanagerClusterFailedToSendAlerts
             expr: |-
               min by (namespace,service, integration) (
-                rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
+                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
               /
-                ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
+                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
               )
               > 0.01
           - alert: AlertmanagerClusterFailedToSendAlerts
             expr: |-
               min by (namespace,service, integration) (
-                rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
+                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
               /
-                ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
+                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
               )
               > 0.01
           - alert: AlertmanagerConfigInconsistent
             expr: |-
               count by (namespace,service,cluster) (
-                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"})
+                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
               )
               != 1
           - alert: AlertmanagerClusterDown
             expr: |-
               (
                 count by (namespace,service,cluster) (
-                  avg_over_time(up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
+                  avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
                 )
               /
                 count by (namespace,service,cluster) (
-                  up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}
+                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                 )
               )
               >= 0.5
@@ -152,11 +152,11 @@ spec:
             expr: |-
               (
                 count by (namespace,service,cluster) (
-                  changes(process_start_time_seconds{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[10m]) > 4
+                  changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
                 )
               /
                 count by (namespace,service,cluster) (
-                  up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}
+                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                 )
               )
               >= 0.5
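
Aside, not part of this commit: the new job label alertmanager-kube-promethe-alertmanager is now repeated verbatim in every patch target and rule expression, so the next rename would produce another diff of this size. A minimal sketch of one way to factor it out, assuming a hypothetical local named alertmanager_job (Terraform expands ${...} inside <<-EOF heredocs, so the interpolation also works inside the PromQL strings):

locals {
  # Hypothetical helper, not in this commit: single source of truth
  # for the Alertmanager job label used by patches and rules.
  alertmanager_job = "alertmanager-kube-promethe-alertmanager"
}

# Example use inside a rule expression heredoc:
#   max_over_time(alertmanager_config_last_reload_successful{job="${local.alertmanager_job}",namespace="${var.namespace}"}[5m]) == 0

With the label defined once, a future rename touches one line instead of every expression in the overlay.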