locals {
  authentik_url   = "http://authentik.${var.domain}-auth.svc"
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]

  # Ownership labels stamped on every generated object.
  common_labels = {
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by"      = "vynil"
    "app.kubernetes.io/instance"        = var.instance
  }

  # JSON patch that repoints (Cluster)RoleBinding subjects at the install namespace.
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}

# Authentik bootstrap secret, read from the auth component's namespace.
data "kubernetes_secret_v1" "authentik" {
  metadata {
    name      = "authentik"
    namespace = "${var.domain}-auth"
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common_labels
  namespace     = var.namespace

  # Every bundled YAML manifest except the package index and ClusterRole* objects.
  resources = [
    for file in fileset(path.module, "*.yaml") : file
    if file != "index.yaml" && length(regexall("ClusterRole", file)) < 1
  ]

  # Pin the Alertmanager image/version and apply instance-specific settings.
  patches {
    target {
      kind = "Alertmanager"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: Alertmanager
      metadata:
        name: alertmanager-kube-promethe-alertmanager
      spec:
        image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
        version: ${var.images.alertmanager.tag}
        externalUrl: http://${var.instance}-${var.component}.${var.namespace}:9093
        replicas: ${var.replicas}
        listenLocal: ${var.listenLocal}
        logLevel: "${var.logLevel}"
        retention: "${var.retention}"
    EOF
  }

  # Point the Grafana datasource at this Alertmanager service.
  patches {
    target {
      kind = "ConfigMap"
      name = "alertmanager-kube-grafana-datasource"
    }
    patch = <<-EOF
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: alertmanager-operated
      data:
        datasource.yaml: |-
          apiVersion: 1
          datasources:
          - name: Alertmanager
            type: alertmanager
            uid: alertmanager
            url: http://${var.instance}-${var.component}.${var.namespace}:9093/
            access: proxy
            jsonData:
              handleGrafanaManagedAlerts: false
              implementation: prometheus
    EOF
  }

  # Scope the ServiceMonitor to the install namespace.
  patches {
    target {
      kind = "ServiceMonitor"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }

  # Re-scope the upstream alertmanager.rules group to the install namespace.
  patches {
    target {
      kind = "PrometheusRule"
      name = "alertmanager-kube-promethe-alertmanager.rules"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: PrometheusRule
      metadata:
        name: alertmanager-kube-promethe-alertmanager.rules
      spec:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerFailedReload
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
          - alert: AlertmanagerMembersInconsistent
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
                max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              < on (namespace,service,cluster) group_left
                count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
          - alert: AlertmanagerFailedToSendAlerts
            expr: |-
              (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerConfigInconsistent
            expr: |-
              count by (namespace,service,cluster) (
                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
              )
              != 1
          - alert: AlertmanagerClusterDown
            expr: |-
              (
                count by (namespace,service,cluster) (
                  avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
          - alert: AlertmanagerClusterCrashlooping
            expr: |-
              (
                count by (namespace,service,cluster) (
                  changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
    EOF
  }
}
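# A minimal sketch (assumption: not part of this module as shown here) of how the
# overlay above is typically consumed with the kbst/kustomization provider: each
# rendered manifest id is applied through its own kustomization_resource instance.
#
# resource "kustomization_resource" "data" {
#   for_each = data.kustomization_overlay.data.ids
#   manifest = data.kustomization_overlay.data.manifests[each.value]
# }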