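# Local values for this component: the in-cluster Authentik endpoint and bootstrap
# token (not referenced elsewhere in this file), the Vynil ownership labels applied
# to every generated resource, and rb-patch, a JSON6902 snippet that rewrites a
# subject namespace (presumably for a RoleBinding, judging by the name).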
locals {
  authentik_url   = "http://authentik.${var.domain}-auth.svc"
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]

  common-labels = {
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by"      = "vynil"
    "app.kubernetes.io/instance"        = var.instance
  }

  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}

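# Reads the Authentik bootstrap secret from the auth namespace; its token is
# exposed through local.authentik_token above.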
data "kubernetes_secret_v1" "authentik" {
|
|
metadata {
|
|
name = "authentik"
|
|
namespace = "${var.domain}-auth"
|
|
}
|
|
}
|
|
|
|
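# Kustomize overlay over every YAML manifest in this module except index.yaml and
# ClusterRole manifests, relabelled and moved into the target namespace; the
# patches below retarget the rendered Alertmanager objects at this instance.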
data "kustomization_overlay" "data" {
|
|
common_labels = local.common-labels
|
|
namespace = var.namespace
|
|
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
|
|
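  # Pin the Alertmanager image, version, replica count and runtime options from
  # module variables, and point externalUrl at the in-cluster service.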
  patches {
    target {
      kind = "Alertmanager"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: Alertmanager
      metadata:
        name: alertmanager-kube-promethe-alertmanager
      spec:
        image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
        version: ${var.images.alertmanager.tag}
        externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
        replicas: ${var.replicas}
        listenLocal: ${var.listenLocal}
        logLevel: "${var.logLevel}"
        retention: "${var.retention}"
    EOF
  }

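  # Rewrite the Grafana datasource ConfigMap so Grafana reaches this instance's
  # Alertmanager service.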
  patches {
    target {
      kind = "ConfigMap"
      name = "alertmanager-kube-grafana-datasource"
    }
    patch = <<-EOF
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: alertmanager-operated
      data:
        datasource.yaml: |-
          apiVersion: 1
          datasources:
          - name: Alertmanager
            type: alertmanager
            uid: alertmanager
            url: http://${var.component}-${var.instance}.${var.namespace}:9093/
            access: proxy
            jsonData:
              handleGrafanaManagedAlerts: false
              implementation: prometheus
    EOF
  }

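  # Point the ServiceMonitor's namespaceSelector at the namespace this instance
  # is deployed into.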
  patches {
    target {
      kind = "ServiceMonitor"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }

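  # Re-scope the upstream alertmanager.rules PrometheusRule group so every
  # expression matches this namespace only.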
  patches {
    target {
      kind = "PrometheusRule"
      name = "alertmanager-kube-promethe-alertmanager.rules"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: PrometheusRule
      metadata:
        name: alertmanager-kube-promethe-alertmanager.rules
      spec:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerFailedReload
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
          - alert: AlertmanagerMembersInconsistent
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              < on (namespace,service,cluster) group_left
                count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
          - alert: AlertmanagerFailedToSendAlerts
            expr: |-
              (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerConfigInconsistent
            expr: |-
              count by (namespace,service,cluster) (
                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
              )
              != 1
          - alert: AlertmanagerClusterDown
            expr: |-
              (
                count by (namespace,service,cluster) (
                  avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
          - alert: AlertmanagerClusterCrashlooping
            expr: |-
              (
                count by (namespace,service,cluster) (
                  changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
    EOF
  }
}