# domain/monitor/alertmanager/datas.tf
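# Shared values for this component: the in-cluster Authentik endpoint and its
# bootstrap token (read from the data source below), the vynil ownership
# labels stamped on every generated resource, and a JSON6902 snippet that
# rewrites a RoleBinding subject namespace (presumably consumed by a companion
# *.tf file of this module).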
locals {
  authentik_url   = "http://authentik.${var.domain}-auth.svc"
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
  common_labels = {
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by"      = "vynil"
    "app.kubernetes.io/instance"        = var.instance
  }
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}
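# Reads the Authentik bootstrap secret from the "<domain>-auth" namespace;
# its token is exposed as local.authentik_token above.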
data "kubernetes_secret_v1" "authentik" {
metadata {
name = "authentik"
namespace = "${var.domain}-auth"
}
}
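# Kustomize overlay over the manifests shipped with this module: every *.yaml
# file except index.yaml and the ClusterRole* files is placed in the target
# namespace, labelled with local.common_labels, and then patched below.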
data "kustomization_overlay" "data" {
common_labels = local.common_labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
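  # Pin the Alertmanager CR to the configured image and propagate the
  # instance-level tuning knobs (replicas, listenLocal, logLevel, retention);
  # externalUrl points at the in-cluster service on port 9093.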
  patches {
    target {
      kind = "Alertmanager"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: Alertmanager
      metadata:
        name: alertmanager-kube-promethe-alertmanager
      spec:
        image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
        version: ${var.images.alertmanager.tag}
        externalUrl: http://${var.instance}-${var.component}.${var.namespace}:9093
        replicas: ${var.replicas}
        listenLocal: ${var.listenLocal}
        logLevel: "${var.logLevel}"
        retention: "${var.retention}"
    EOF
  }
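  # Rewrite the Grafana datasource ConfigMap so Grafana queries this
  # instance's Alertmanager service. The target block selects the resource to
  # patch; note that the patch body carries a different metadata.name
  # ("alertmanager-operated") than the targeted ConfigMap.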
  patches {
    target {
      kind = "ConfigMap"
      name = "alertmanager-kube-grafana-datasource"
    }
    patch = <<-EOF
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: alertmanager-operated
      data:
        datasource.yaml: |-
          apiVersion: 1
          datasources:
          - name: Alertmanager
            type: alertmanager
            uid: alertmanager
            url: http://${var.instance}-${var.component}.${var.namespace}:9093/
            access: proxy
            jsonData:
              handleGrafanaManagedAlerts: false
              implementation: prometheus
    EOF
  }
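  # Point the ServiceMonitor's namespaceSelector at the namespace this
  # instance is deployed into.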
  patches {
    target {
      kind = "ServiceMonitor"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }
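  # Re-render the upstream Alertmanager alerting rules with their namespace
  # selectors fixed to this instance's namespace.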
  patches {
    target {
      kind = "PrometheusRule"
      name = "alertmanager-kube-promethe-alertmanager.rules"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: PrometheusRule
      metadata:
        name: alertmanager-kube-promethe-alertmanager.rules
      spec:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerFailedReload
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
          - alert: AlertmanagerMembersInconsistent
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              < on (namespace,service,cluster) group_left
              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
          - alert: AlertmanagerFailedToSendAlerts
            expr: |-
              (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerConfigInconsistent
            expr: |-
              count by (namespace,service,cluster) (
                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
              )
              != 1
          - alert: AlertmanagerClusterDown
            expr: |-
              (
                count by (namespace,service,cluster) (
                  avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
          - alert: AlertmanagerClusterCrashlooping
            expr: |-
              (
                count by (namespace,service,cluster) (
                  changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
    EOF
  }
}
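
# A minimal sketch of how this overlay is typically applied elsewhere in the
# module, assuming the usual kbst/kustomization provider pattern (the resource
# name "main" is illustrative, not part of this file):
#
#   resource "kustomization_resource" "main" {
#     for_each = data.kustomization_overlay.data.ids
#     manifest = data.kustomization_overlay.data.manifests[each.value]
#   }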