fix
This commit is contained in:
@@ -28,17 +28,17 @@ data "kustomization_overlay" "data" {
|
||||
patches {
|
||||
target {
|
||||
kind = "Alertmanager"
|
||||
name = "prometheus-community-kube-alertmanager"
|
||||
name = "alertmanager-kube-promethe-alertmanager"
|
||||
}
|
||||
patch = <<-EOF
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: Alertmanager
|
||||
metadata:
|
||||
name: prometheus-community-kube-alertmanager
|
||||
name: alertmanager-kube-promethe-alertmanager
|
||||
spec:
|
||||
image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
|
||||
version: ${var.images.alertmanager.tag}
|
||||
externalUrl: http://alertmanager-operated.${var.namespace}:9093
|
||||
externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
|
||||
replicas: ${var.replicas}
|
||||
listenLocal: ${var.listenLocal}
|
||||
logLevel: "${var.logLevel}"
|
||||
@@ -72,7 +72,7 @@ data "kustomization_overlay" "data" {
|
||||
patches {
|
||||
target {
|
||||
kind = "ServiceMonitor"
|
||||
name = "prometheus-community-kube-alertmanager"
|
||||
name = "alertmanager-kube-promethe-alertmanager"
|
||||
}
|
||||
patch = <<-EOF
|
||||
- op: replace
|
||||
@@ -83,13 +83,13 @@ data "kustomization_overlay" "data" {
|
||||
patches {
|
||||
target {
|
||||
kind = "PrometheusRule"
|
||||
name = "prometheus-community-kube-alertmanager.rules"
|
||||
name = "alertmanager-kube-promethe-alertmanager.rules"
|
||||
}
|
||||
patch = <<-EOF
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: prometheus-community-kube-alertmanager.rules
|
||||
name: alertmanager-kube-promethe-alertmanager.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
@@ -98,53 +98,53 @@ spec:
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]) == 0
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
expr: |-
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_cluster_members{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
< on (namespace,service,cluster) group_left
|
||||
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]))
|
||||
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
expr: |-
|
||||
(
|
||||
rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
/
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
||||
)
|
||||
> 0.01
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
||||
/
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
expr: |-
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
||||
/
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
expr: |-
|
||||
count by (namespace,service,cluster) (
|
||||
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"})
|
||||
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
|
||||
)
|
||||
!= 1
|
||||
- alert: AlertmanagerClusterDown
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service,cluster) (
|
||||
avg_over_time(up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
|
||||
avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
|
||||
)
|
||||
/
|
||||
count by (namespace,service,cluster) (
|
||||
up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}
|
||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
@@ -152,11 +152,11 @@ spec:
|
||||
expr: |-
|
||||
(
|
||||
count by (namespace,service,cluster) (
|
||||
changes(process_start_time_seconds{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}[10m]) > 4
|
||||
changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
|
||||
)
|
||||
/
|
||||
count by (namespace,service,cluster) (
|
||||
up{job="prometheus-community-kube-alertmanager",namespace="${var.namespace}"}
|
||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
|
||||
Reference in New Issue
Block a user