fix
This commit is contained in:
167
monitor/alertmanager/datas.tf
Normal file
167
monitor/alertmanager/datas.tf
Normal file
@@ -0,0 +1,167 @@
|
||||
# Values shared by the data sources and overlays of this module.
locals {
  # Labels stamped on every rendered object: vynil ownership metadata plus the
  # standard app.kubernetes.io identification labels.
  common-labels = {
    "app.kubernetes.io/instance"        = var.instance
    "app.kubernetes.io/managed-by"      = "vynil"
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
  }

  # JSON 6902 patch that rewrites the namespace of the first subject to the
  # install namespace.
  # NOTE(review): presumably applied to (Cluster)RoleBindings elsewhere in the
  # module; it is not referenced in this part of the file.
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF

  # Bootstrap token read from the "authentik" Secret looked up in this file.
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]

  # In-cluster URL of the authentik service, living in the "<domain>-auth" namespace.
  authentik_url = format("http://authentik.%s-auth.svc", var.domain)
}
|
||||
|
||||
# Reads the existing "authentik" Secret from the "<domain>-auth" namespace so
# that its keys (e.g. AUTHENTIK_BOOTSTRAP_TOKEN) can be consumed by locals.
data "kubernetes_secret_v1" "authentik" {
  metadata {
    namespace = format("%s-auth", var.domain)
    name      = "authentik"
  }
}
|
||||
|
||||
# Kustomize overlay for the namespaced Alertmanager manifests shipped with this
# module. It applies the common labels and the install namespace, then patches
# the Alertmanager CR, the Grafana datasource ConfigMap, the ServiceMonitor and
# the PrometheusRule so they all point at this instance.
data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace     = var.namespace

  # Every *.yaml next to this file except the package index and any file whose
  # name contains "ClusterRole" (this substring also matches
  # "ClusterRoleBinding").
  # NOTE(review): those cluster-scoped files are presumably rendered by a
  # separate overlay - confirm.
  resources = [
    for file in fileset(path.module, "*.yaml") : file
    if file != "index.yaml" && length(regexall("ClusterRole", file)) < 1
  ]

  # Alertmanager CR: image, sizing and logging come from module variables.
  patches {
    target {
      kind = "Alertmanager"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    # `version` is quoted so that a purely numeric tag (e.g. 0.27) is still
    # emitted as a YAML string. `replicas` and `listenLocal` stay unquoted on
    # purpose so they keep their integer / boolean YAML types.
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: Alertmanager
      metadata:
        name: alertmanager-kube-promethe-alertmanager
      spec:
        image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
        version: "${var.images.alertmanager.tag}"
        externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
        replicas: ${var.replicas}
        listenLocal: ${var.listenLocal}
        logLevel: "${var.logLevel}"
        retention: "${var.retention}"
    EOF
  }

  # Grafana datasource ConfigMap: point the Alertmanager datasource at this
  # instance's service URL.
  patches {
    target {
      kind = "ConfigMap"
      name = "alertmanager-kube-grafana-datasource"
    }
    # metadata.name matches the target above; it previously read
    # "alertmanager-operated", which named a different object than the one
    # selected by `target`.
    patch = <<-EOF
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: alertmanager-kube-grafana-datasource
      data:
        datasource.yaml: |-
          apiVersion: 1
          datasources:
          - name: Alertmanager
            type: alertmanager
            uid: alertmanager
            url: http://${var.component}-${var.instance}.${var.namespace}:9093/
            access: proxy
            jsonData:
              handleGrafanaManagedAlerts: false
              implementation: prometheus
    EOF
  }

  # ServiceMonitor: JSON 6902 patch replacing the first namespace selector
  # entry with the install namespace.
  patches {
    target {
      kind = "ServiceMonitor"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }

  # PrometheusRule: re-declare the alert expressions with the namespace label
  # matcher set to the install namespace.
  # NOTE(review): the rules below only carry `alert` and `expr`. If this patch
  # replaces spec.groups wholesale (likely for a custom resource, which has no
  # list merge keys), the shipped `for`, `labels` and `annotations` of each
  # rule are dropped - confirm this is intended.
  # NOTE(review): AlertmanagerClusterFailedToSendAlerts appears twice, once
  # with integration=~`.*` and once with integration!~`.*`.
  patches {
    target {
      kind = "PrometheusRule"
      name = "alertmanager-kube-promethe-alertmanager.rules"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: PrometheusRule
      metadata:
        name: alertmanager-kube-promethe-alertmanager.rules
      spec:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerFailedReload
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
          - alert: AlertmanagerMembersInconsistent
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              < on (namespace,service,cluster) group_left
              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
          - alert: AlertmanagerFailedToSendAlerts
            expr: |-
              (
              rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              /
              ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerConfigInconsistent
            expr: |-
              count by (namespace,service,cluster) (
              count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
              )
              != 1
          - alert: AlertmanagerClusterDown
            expr: |-
              (
              count by (namespace,service,cluster) (
              avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
              )
              /
              count by (namespace,service,cluster) (
              up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
              )
              )
              >= 0.5
          - alert: AlertmanagerClusterCrashlooping
            expr: |-
              (
              count by (namespace,service,cluster) (
              changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
              )
              /
              count by (namespace,service,cluster) (
              up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
              )
              )
              >= 0.5
    EOF
  }
}
|
||||
Reference in New Issue
Block a user