This commit is contained in:
2024-01-28 10:02:31 +01:00
parent 0961759af7
commit 8a214e997c
162 changed files with 8560 additions and 0 deletions

View File

@@ -0,0 +1,167 @@
# Shared values for every Terraform file in this component package.
locals {
  # In-cluster Authentik endpoint; Authentik lives in the "<domain>-auth" namespace.
  authentik_url   = "http://authentik.${var.domain}-auth.svc"
  # Admin API token created at Authentik bootstrap, read from its Secret (data source below).
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
  # Ownership labels stamped on every generated resource so the vynil operator
  # can track which install produced them.
  common-labels = {
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by"      = "vynil"
    "app.kubernetes.io/instance"        = var.instance
  }
  # JSON-patch snippet re-pointing a RoleBinding's first subject at the install namespace.
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}
# Reads the Authentik bootstrap Secret; its AUTHENTIK_BOOTSTRAP_TOKEN entry is
# used as the bearer token for all Authentik API calls in this package.
data "kubernetes_secret_v1" "authentik" {
  metadata {
    name      = "authentik"
    namespace = "${var.domain}-auth"
  }
}
# Renders the bundled chart manifests with kustomize: applies the common
# ownership labels, moves everything into the install namespace, and patches
# the values the upstream kube-prometheus-stack chart hard-coded at render time
# (image, replicas, URLs, and the "vynil-monitor" namespace baked into selectors).
data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace     = var.namespace
  # Every bundled manifest except the component index and cluster-scoped RBAC
  # (ClusterRole* objects cannot be namespaced by this overlay).
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]

  # Override image, sizing and external URL of the Alertmanager CR from the
  # component options.
  patches {
    target {
      kind = "Alertmanager"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: Alertmanager
      metadata:
        name: alertmanager-kube-promethe-alertmanager
      spec:
        image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
        version: ${var.images.alertmanager.tag}
        externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
        replicas: ${var.replicas}
        listenLocal: ${var.listenLocal}
        logLevel: "${var.logLevel}"
        retention: "${var.retention}"
    EOF
  }

  # Re-point the Grafana datasource at the per-instance service URL.
  # Bug fix: the patch previously declared metadata.name "alertmanager-operated",
  # which does not match the targeted ConfigMap, so the merge could not land on
  # "alertmanager-kube-grafana-datasource".
  patches {
    target {
      kind = "ConfigMap"
      name = "alertmanager-kube-grafana-datasource"
    }
    patch = <<-EOF
      apiVersion: v1
      kind: ConfigMap
      metadata:
        name: alertmanager-kube-grafana-datasource
      data:
        datasource.yaml: |-
          apiVersion: 1
          datasources:
          - name: Alertmanager
            type: alertmanager
            uid: alertmanager
            url: http://${var.component}-${var.instance}.${var.namespace}:9093/
            access: proxy
            jsonData:
              handleGrafanaManagedAlerts: false
              implementation: prometheus
    EOF
  }

  # The chart rendered its own namespace into the ServiceMonitor selector;
  # swap it for the actual install namespace (JSON 6902 patch).
  patches {
    target {
      kind = "ServiceMonitor"
      name = "alertmanager-kube-promethe-alertmanager"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }

  # Rewrite the alert expressions so every namespace="..." selector matches the
  # install namespace.
  # NOTE(review): merging into a custom resource replaces spec.groups wholesale,
  # so this patch restates every rule; the upstream annotations, "for:" delays
  # and severity labels are dropped in the process — confirm that is intended.
  patches {
    target {
      kind = "PrometheusRule"
      name = "alertmanager-kube-promethe-alertmanager.rules"
    }
    patch = <<-EOF
      apiVersion: monitoring.coreos.com/v1
      kind: PrometheusRule
      metadata:
        name: alertmanager-kube-promethe-alertmanager.rules
      spec:
        groups:
        - name: alertmanager.rules
          rules:
          - alert: AlertmanagerFailedReload
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
          - alert: AlertmanagerMembersInconsistent
            expr: |-
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              < on (namespace,service,cluster) group_left
              count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
          - alert: AlertmanagerFailedToSendAlerts
            expr: |-
              (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerClusterFailedToSendAlerts
            expr: |-
              min by (namespace,service, integration) (
                rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
              )
              > 0.01
          - alert: AlertmanagerConfigInconsistent
            expr: |-
              count by (namespace,service,cluster) (
                count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
              )
              != 1
          - alert: AlertmanagerClusterDown
            expr: |-
              (
                count by (namespace,service,cluster) (
                  avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
          - alert: AlertmanagerClusterCrashlooping
            expr: |-
              (
                count by (namespace,service,cluster) (
                  changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
                )
              /
                count by (namespace,service,cluster) (
                  up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
                )
              )
              >= 0.5
    EOF
  }
}

View File

@@ -0,0 +1,111 @@
---
# vynil Component descriptor: declares the install options (JSON-schema style
# defaults/examples/types), the package dependency and the Terraform providers
# this component's plan needs.
# NOTE(review): apiVersion group is spelled "vinyl" while the labels elsewhere
# use "vynil" — confirm this matches the CRD actually installed.
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: alertmanager
description: null
options:
  # Alertmanager data retention window (spec.retention on the CR).
  retention:
    default: 120h
    examples:
    - 120h
    type: string
  ingress_class:
    default: traefik
    examples:
    - traefik
    type: string
  logLevel:
    default: info
    examples:
    - info
    type: string
  # Host part of the public FQDN: <sub_domain>.<domain_name>.
  sub_domain:
    default: alertmanager
    examples:
    - alertmanager
    type: string
  # Container image coordinates for Alertmanager.
  images:
    default:
      alertmanager:
        pullPolicy: IfNotPresent
        registry: quay.io
        repository: prometheus/alertmanager
        tag: v0.26.0
    examples:
    - alertmanager:
        pullPolicy: IfNotPresent
        registry: quay.io
        repository: prometheus/alertmanager
        tag: v0.26.0
    properties:
      alertmanager:
        default:
          pullPolicy: IfNotPresent
          registry: quay.io
          repository: prometheus/alertmanager
          tag: v0.26.0
        properties:
          pullPolicy:
            default: IfNotPresent
            enum:
            - Always
            - Never
            - IfNotPresent
            type: string
          registry:
            default: quay.io
            type: string
          repository:
            default: prometheus/alertmanager
            type: string
          tag:
            default: v0.26.0
            type: string
        type: object
    type: object
  domain:
    default: your-company
    examples:
    - your-company
    type: string
  listenLocal:
    default: false
    examples:
    - false
    type: boolean
  app_group:
    default: monitor
    examples:
    - monitor
    type: string
  domain_name:
    default: your_company.com
    examples:
    - your_company.com
    type: string
  # cert-manager issuer used for the ingress TLS certificate.
  issuer:
    default: letsencrypt-prod
    examples:
    - letsencrypt-prod
    type: string
  replicas:
    default: 1
    examples:
    - 1
    type: integer
# Requires the shared authentik-forward package (SSO forward-auth).
dependencies:
- dist: null
  category: share
  component: authentik-forward
# Terraform providers the plan uses (true = required, null = unused).
providers:
  kubernetes: true
  authentik: true
  kubectl: true
  postgresql: null
  mysql: null
  restapi: true
  http: true
  gitea: null
tfaddtype: null

View File

@@ -0,0 +1,38 @@
# Source: kube-prometheus-stack/templates/alertmanager/alertmanager.yaml
# Chart-rendered Alertmanager CR. image/version/replicas/externalUrl/logLevel/
# retention and the namespace are rewritten at install time by the kustomize
# overlay in the Terraform datas file; the values below are chart defaults.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: alertmanager-kube-promethe-alertmanager
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: alertmanager
    app.kubernetes.io/version: "56.2.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.2.0
    release: "alertmanager"
    heritage: "Helm"
spec:
  image: "quay.io/prometheus/alertmanager:v0.26.0"
  version: v0.26.0
  replicas: 1
  listenLocal: false
  serviceAccountName: alertmanager-kube-promethe-alertmanager
  externalUrl: http://alertmanager-kube-promethe-alertmanager.vynil-monitor:9093
  paused: false
  logFormat: "logfmt"
  logLevel: "info"
  retention: "120h"
  # Empty selectors: pick up AlertmanagerConfig objects from all namespaces.
  alertmanagerConfigSelector: {}
  alertmanagerConfigNamespaceSelector: {}
  routePrefix: "/"
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
    seccompProfile:
      type: RuntimeDefault
  portName: http-web

View File

@@ -0,0 +1,142 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
# Chart-rendered self-monitoring alerts for Alertmanager. The "vynil-monitor"
# namespace baked into every expr is rewritten at install time by the kustomize
# overlay in the Terraform datas file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alertmanager-kube-promethe-alertmanager.rules
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: alertmanager
    app.kubernetes.io/version: "56.2.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.2.0
    release: "alertmanager"
    heritage: "Helm"
spec:
  groups:
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerFailedReload
      annotations:
        description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
        summary: Reloading an Alertmanager configuration has failed.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
    - alert: AlertmanagerMembersInconsistent
      annotations:
        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
        summary: A member of an Alertmanager cluster has not found all other cluster members.
      expr: |-
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
        < on (namespace,service,cluster) group_left
        count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]))
      for: 15m
      labels:
        severity: critical
    - alert: AlertmanagerFailedToSendAlerts
      annotations:
        description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
        summary: An Alertmanager instance failed to send notifications.
      expr: |-
        (
          rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
        /
          ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: warning
    # "Critical integrations" variant: integration=~".*" matches every integration.
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
      expr: |-
        min by (namespace,service, integration) (
          rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
        /
          ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: critical
    # NOTE(review): integration!~".*" excludes every label value, so this
    # "non-critical" variant can never fire; upstream normally renders these
    # matchers from its configured list of critical integrations.
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
      expr: |-
        min by (namespace,service, integration) (
          rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
        /
          ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        severity: warning
    - alert: AlertmanagerConfigInconsistent
      annotations:
        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
        summary: Alertmanager instances within the same cluster have different configurations.
      expr: |-
        count by (namespace,service,cluster) (
          count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"})
        )
        != 1
      for: 20m
      labels:
        severity: critical
    - alert: AlertmanagerClusterDown
      annotations:
        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
        summary: Half or more of the Alertmanager instances within the same cluster are down.
      expr: |-
        (
          count by (namespace,service,cluster) (
            avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) < 0.5
          )
        /
          count by (namespace,service,cluster) (
            up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        severity: critical
    - alert: AlertmanagerClusterCrashlooping
      annotations:
        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
      expr: |-
        (
          count by (namespace,service,cluster) (
            changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[10m]) > 4
          )
        /
          count by (namespace,service,cluster) (
            up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        severity: critical

View File

@@ -0,0 +1,33 @@
# Source: kube-prometheus-stack/templates/alertmanager/servicemonitor.yaml
# Scrape config for Alertmanager. The namespaceSelector below is rewritten to
# the install namespace at install time by the kustomize overlay (JSON patch).
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: alertmanager-kube-promethe-alertmanager
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: alertmanager
    app.kubernetes.io/version: "56.2.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.2.0
    release: "alertmanager"
    heritage: "Helm"
spec:
  # Selects Services labeled for self-monitoring by the chart.
  selector:
    matchLabels:
      app: kube-prometheus-stack-alertmanager
      release: "alertmanager"
      self-monitor: "true"
  namespaceSelector:
    matchNames:
    - "vynil-monitor"
  endpoints:
  # Alertmanager itself.
  - port: http-web
    enableHttp2: true
    path: "/metrics"
  # config-reloader sidecar.
  - port: reloader-web
    scheme: http
    path: "/metrics"

View File

@@ -0,0 +1,75 @@
# Per-instance derived values for the ingress / SSO wiring below.
locals {
  # Public FQDN where this Alertmanager is exposed.
  dns_name  = "${var.sub_domain}.${var.domain_name}"
  dns_names = [local.dns_name]
  # "<component>-<instance>", collapsed when both names are identical.
  app_name = var.component == var.instance ? var.instance : format("%s-%s", var.component, var.instance)
  icon     = "favicon.ico"
  # Headers for the Authentik REST API. Reuses local.authentik_token (already
  # read from the bootstrap Secret) instead of duplicating the data lookup.
  request_headers = {
    "Content-Type" = "application/json"
    Authorization  = "Bearer ${local.authentik_token}"
  }
  # In-namespace Service the ingress and forward-auth route point at.
  service = {
    "name" = "alertmanager-operated"
    "port" = {
      "number" = 9093
    }
  }
}
# Public ingress for the Alertmanager UI, guarded by the forward-auth
# middleware created by module.forward.
module "ingress" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//ingress"
  # NOTE(review): component is intentionally blank here while the other modules
  # pass var.component — confirm the resulting resource naming is the one wanted.
  component     = ""
  instance      = var.instance
  namespace     = var.namespace
  issuer        = var.issuer
  ingress_class = var.ingress_class
  labels        = local.common-labels
  dns_names     = local.dns_names
  middlewares   = [module.forward.middleware]
  services      = [local.service]
  providers = {
    kubectl = kubectl
  }
}
# Authentik application entry for this instance (icon + DNS name), attached to
# the proxy provider created by module.forward.
module "application" {
  source            = "git::https://git.solidite.fr/vynil/kydah-modules.git//application"
  component         = var.component
  instance          = var.instance
  app_group         = var.app_group
  dns_name          = local.dns_name
  icon              = local.icon
  protocol_provider = module.forward.provider-id
  providers = {
    authentik = authentik
  }
}
# Generic REST client aimed at the Authentik API (used by module.forward).
# Reuses local.authentik_url for consistency with the rest of the package.
# PATCH for create/update/destroy keeps the calls idempotent against
# pre-existing Authentik objects; objects are addressed by "name".
provider "restapi" {
  uri                  = "${local.authentik_url}/api/v3/"
  headers              = local.request_headers
  create_method        = "PATCH"
  update_method        = "PATCH"
  destroy_method       = "PATCH"
  write_returns_object = true
  id_attribute         = "name"
}
# Creates the Authentik forward-auth pieces for this app — presumably the proxy
# provider, outpost assignment and the Traefik middleware consumed by
# module.ingress (module internals not visible here).
module "forward" {
  source          = "git::https://git.solidite.fr/vynil/kydah-modules.git//forward"
  component       = var.component
  instance        = var.instance
  domain          = var.domain
  namespace       = var.namespace
  ingress_class   = var.ingress_class
  labels          = local.common-labels
  dns_names       = local.dns_names
  service         = local.service
  icon            = local.icon
  request_headers = local.request_headers
  providers = {
    restapi   = restapi
    http      = http
    kubectl   = kubectl
    authentik = authentik
  }
}

View File

@@ -0,0 +1,33 @@
# Labels for the per-instance Service: common ownership labels plus the
# chart-style selector labels.
# NOTE(review): app/release say "prometheus" while this package's ServiceMonitor
# selects app=kube-prometheus-stack-alertmanager / release=alertmanager — this
# Service is therefore NOT scraped by it. Confirm whether that is deliberate
# (avoiding a double scrape) or a copy-paste from the prometheus component.
locals {
  svc-label = merge(local.common-labels, {
    "app"          = "kube-prometheus-stack-prometheus"
    "release"      = "prometheus"
    "self-monitor" = "true"
  })
}
# Per-instance Service "<component>-<instance>" fronting the Alertmanager pods;
# this is the host used in externalUrl and the Grafana datasource patches.
resource "kubectl_manifest" "svc" {
  yaml_body = <<-EOF
    apiVersion: v1
    kind: Service
    metadata:
      name: "${var.component}-${var.instance}"
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.svc-label)}
    spec:
      ports:
      - name: http-web
        port: 9093
        targetPort: 9093
        protocol: TCP
      - name: reloader-web
        appProtocol: http
        port: 8080
        targetPort: reloader-web
      selector:
        app.kubernetes.io/name: alertmanager
        alertmanager: alertmanager-kube-promethe-alertmanager
      sessionAffinity: None
      type: "ClusterIP"
  EOF
}

View File

@@ -0,0 +1,19 @@
# Grafana datasource for Alertmanager; the grafana_datasource="1" label lets the
# Grafana sidecar discover and load datasource.yaml. The hard-coded URL below is
# rewritten to the per-instance Service at install time by the kustomize overlay.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-kube-grafana-datasource
  labels:
    grafana_datasource: "1"
    app: alertmanager
data:
  datasource.yaml: |-
    apiVersion: 1
    datasources:
    - name: Alertmanager
      type: alertmanager
      uid: alertmanager
      url: http://alertmanager-operated.vynil-monitor:9093/
      access: proxy
      jsonData:
        handleGrafanaManagedAlerts: false
        implementation: prometheus

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,18 @@
# Source: kube-prometheus-stack/templates/alertmanager/secret.yaml
# The chart's default alertmanager.yaml, base64-encoded: all alerts grouped by
# namespace and routed to a "null" receiver (plus the standard severity
# inhibit rules and a Watchdog route). Replace to actually deliver alerts.
apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-alertmanager-kube-promethe-alertmanager
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: alertmanager
    app.kubernetes.io/version: "56.2.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.2.0
    release: "alertmanager"
    heritage: "Helm"
data:
  alertmanager.yaml: "Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KaW5oaWJpdF9ydWxlczoKLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIC0gYWxlcnRuYW1lCiAgc291cmNlX21hdGNoZXJzOgogIC0gc2V2ZXJpdHkgPSBjcml0aWNhbAogIHRhcmdldF9tYXRjaGVyczoKICAtIHNldmVyaXR5ID1+IHdhcm5pbmd8aW5mbwotIGVxdWFsOgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBzb3VyY2VfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IHdhcm5pbmcKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIHNvdXJjZV9tYXRjaGVyczoKICAtIGFsZXJ0bmFtZSA9IEluZm9JbmhpYml0b3IKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBhbGVydG5hbWUgPSBJbmZvSW5oaWJpdG9yCnJlY2VpdmVyczoKLSBuYW1lOiAibnVsbCIKcm91dGU6CiAgZ3JvdXBfYnk6CiAgLSBuYW1lc3BhY2UKICBncm91cF9pbnRlcnZhbDogNW0KICBncm91cF93YWl0OiAzMHMKICByZWNlaXZlcjogIm51bGwiCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByb3V0ZXM6CiAgLSBtYXRjaGVyczoKICAgIC0gYWxlcnRuYW1lID0gIldhdGNoZG9nIgogICAgcmVjZWl2ZXI6ICJudWxsIgp0ZW1wbGF0ZXM6Ci0gL2V0Yy9hbGVydG1hbmFnZXIvY29uZmlnLyoudG1wbA=="

View File

@@ -0,0 +1,20 @@
---
# Source: kube-prometheus-stack/templates/alertmanager/serviceaccount.yaml
# ServiceAccount the Alertmanager StatefulSet pods run as (referenced by
# spec.serviceAccountName on the Alertmanager CR).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alertmanager-kube-promethe-alertmanager
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack-alertmanager
    app.kubernetes.io/name: kube-prometheus-stack-alertmanager
    app.kubernetes.io/component: alertmanager
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: alertmanager
    app.kubernetes.io/version: "56.2.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.2.0
    release: "alertmanager"
    heritage: "Helm"
automountServiceAccountToken: true