This commit is contained in:
2024-01-22 16:56:33 +01:00
parent 8ad641e923
commit 0f2eced091
29 changed files with 263 additions and 325 deletions

View File

@@ -8,7 +8,7 @@ locals {
} }
global = { global = {
"domain" = var.namespace "domain" = var.namespace
"domain-name" = "admin.${var.domain-name}" "domain-name" = "monitor.${var.domain-name}"
"issuer" = var.issuer "issuer" = var.issuer
"ingress-class" = var.ingress-class "ingress-class" = var.ingress-class
"backups" = var.backups "backups" = var.backups
@@ -16,6 +16,8 @@ locals {
} }
prometheus = { for k, v in var.prometheus : k => v if k!="enable" } prometheus = { for k, v in var.prometheus : k => v if k!="enable" }
alertmanager = { for k, v in var.alertmanager : k => v if k!="enable" } alertmanager = { for k, v in var.alertmanager : k => v if k!="enable" }
alerts-core = { for k, v in var.alerts-core : k => v if k!="enable" }
alerts-containers = { for k, v in var.alerts-containers : k => v if k!="enable" }
nodeExporter = { for k, v in var.node-exporter : k => v if k!="enable" } nodeExporter = { for k, v in var.node-exporter : k => v if k!="enable" }
kubeStateMetrics = merge({"cluster-admin" = true}, { for k, v in var.kube-state-metrics : k => v if k!="enable" }) kubeStateMetrics = merge({"cluster-admin" = true}, { for k, v in var.kube-state-metrics : k => v if k!="enable" })
monitorControlPlan = merge({"cluster-admin" = true}, { for k, v in var.monitor-control-plan : k => v if k!="enable" }) monitorControlPlan = merge({"cluster-admin" = true}, { for k, v in var.monitor-control-plan : k => v if k!="enable" })
@@ -117,3 +119,38 @@ resource "kubectl_manifest" "monitorControlPlan" {
options: ${jsonencode(merge(local.global, local.monitorControlPlan))} options: ${jsonencode(merge(local.global, local.monitorControlPlan))}
EOF EOF
} }
# Installs the "alerts-core" component of the "monitor" category through a
# vynil `Install` custom resource.
#
# - Created only when `var.alerts-core.enable` is true (count is 0 otherwise).
# - Lives in the namespace managed by `kubernetes_namespace_v1.monitor-ns`.
# - `options` is the JSON encoding of `local.global` merged with
#   `local.alerts-core`; keys from `local.alerts-core` win on conflict because
#   it is the later argument to `merge`.
#
# The heredoc body is YAML, so its indentation is significant: `name`,
# `namespace` and `labels` must be nested under `metadata`, and `distrib`,
# `category`, `component` and `options` under `spec`. The `<<-` form strips the
# common leading indentation, so the body can be indented with the HCL.
resource "kubectl_manifest" "alerts-core" {
  count      = var.alerts-core.enable ? 1 : 0
  depends_on = [kubernetes_namespace_v1.monitor-ns]
  yaml_body  = <<-EOF
    apiVersion: "vynil.solidite.fr/v1"
    kind: "Install"
    metadata:
      name: "alerts-core"
      namespace: "${kubernetes_namespace_v1.monitor-ns[0].metadata[0].name}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      distrib: "${var.distributions.domain}"
      category: "monitor"
      component: "alerts-core"
      options: ${jsonencode(merge(local.global, local.alerts-core))}
  EOF
}
# Installs the "alerts-containers" component of the "monitor" category through
# a vynil `Install` custom resource.
#
# - Created only when `var.alerts-containers.enable` is true (count is 0
#   otherwise).
# - Lives in the namespace managed by `kubernetes_namespace_v1.monitor-ns`.
# - `options` is the JSON encoding of `local.global` merged with
#   `local.alerts-containers`; keys from `local.alerts-containers` win on
#   conflict because it is the later argument to `merge`.
#
# The heredoc body is YAML, so its indentation is significant: `name`,
# `namespace` and `labels` must be nested under `metadata`, and `distrib`,
# `category`, `component` and `options` under `spec`. The `<<-` form strips the
# common leading indentation, so the body can be indented with the HCL.
resource "kubectl_manifest" "alerts-containers" {
  count      = var.alerts-containers.enable ? 1 : 0
  depends_on = [kubernetes_namespace_v1.monitor-ns]
  yaml_body  = <<-EOF
    apiVersion: "vynil.solidite.fr/v1"
    kind: "Install"
    metadata:
      name: "alerts-containers"
      namespace: "${kubernetes_namespace_v1.monitor-ns[0].metadata[0].name}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      distrib: "${var.distributions.domain}"
      category: "monitor"
      component: "alerts-containers"
      options: ${jsonencode(merge(local.global, local.alerts-containers))}
  EOF
}

View File

@@ -6,11 +6,91 @@ metadata:
name: domain-monitor name: domain-monitor
description: null description: null
options: options:
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
issuer: issuer:
default: letsencrypt-prod default: letsencrypt-prod
examples: examples:
- letsencrypt-prod - letsencrypt-prod
type: string type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
app-group:
default: monitor
examples:
- monitor
type: string
alertmanager:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: alertmanager
alerts-core:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: alerts-core
domain:
default: your-company
examples:
- your-company
type: string
prometheus:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: prometheus
monitor-control-plan:
default:
enable: false
examples:
- enable: false
properties:
enable:
default: false
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: monitor-control-plan
kube-state-metrics:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: kube-state-metrics
storage-classes: storage-classes:
default: default:
BlockReadWriteMany: '' BlockReadWriteMany: ''
@@ -36,12 +116,7 @@ options:
default: '' default: ''
type: string type: string
type: object type: object
ingress-class: alerts-containers:
default: traefik
examples:
- traefik
type: string
alertmanager:
default: default:
enable: true enable: true
examples: examples:
@@ -52,7 +127,19 @@ options:
type: boolean type: boolean
type: object type: object
x-vynil-category: monitor x-vynil-category: monitor
x-vynil-package: alertmanager x-vynil-package: alerts-containers
node-exporter:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: node-exporter
backups: backups:
default: default:
enable: false enable: false
@@ -83,59 +170,6 @@ options:
default: backup-settings default: backup-settings
type: string type: string
type: object type: object
app-group:
default: infra
examples:
- infra
type: string
kube-state-metrics:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: kube-state-metrics
monitor-control-plan:
default:
enable: false
examples:
- enable: false
properties:
enable:
default: false
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: monitor-control-plan
prometheus:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: prometheus
node-exporter:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: node-exporter
distributions: distributions:
default: default:
core: core core: core
@@ -151,16 +185,6 @@ options:
default: domain default: domain
type: string type: string
type: object type: object
domain:
default: your-company
examples:
- your-company
type: string
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
dependencies: [] dependencies: []
providers: providers:
kubernetes: true kubernetes: true

View File

@@ -6,16 +6,41 @@ metadata:
name: alertmanager name: alertmanager
description: null description: null
options: options:
sub-domain: replicas:
default: to-be-set default: 1
examples: examples:
- to-be-set - 1
type: integer
listenLocal:
default: false
examples:
- false
type: boolean
logLevel:
default: info
examples:
- info
type: string
sub-domain:
default: alertmanager
examples:
- alertmanager
type: string type: string
ingress-class: ingress-class:
default: traefik default: traefik
examples: examples:
- traefik - traefik
type: string type: string
retention:
default: 120h
examples:
- 120h
type: string
app-group:
default: monitor
examples:
- monitor
type: string
issuer: issuer:
default: letsencrypt-prod default: letsencrypt-prod
examples: examples:
@@ -60,41 +85,16 @@ options:
type: string type: string
type: object type: object
type: object type: object
replicas:
default: 1
examples:
- 1
type: integer
retention:
default: 120h
examples:
- 120h
type: string
domain-name: domain-name:
default: your_company.com default: your_company.com
examples: examples:
- your_company.com - your_company.com
type: string type: string
app-group:
default: monitor
examples:
- monitor
type: string
logLevel:
default: info
examples:
- info
type: string
domain: domain:
default: your-company default: your-company
examples: examples:
- your-company - your-company
type: string type: string
listenLocal:
default: false
examples:
- false
type: boolean
dependencies: dependencies:
- dist: null - dist: null
category: share category: share

View File

@@ -0,0 +1,16 @@
# Label set shared by everything this module renders: four vynil ownership
# labels (instance name, namespace, category, component) followed by the two
# `app.kubernetes.io` labels.
locals {
  common-labels = {
    # vynil ownership
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component

    # app.kubernetes.io labels
    "app.kubernetes.io/instance"   = var.instance
    "app.kubernetes.io/managed-by" = "vynil"
  }
}
# Kustomize overlay over the manifests shipped next to this module.
# `resources` is every `*.yaml` file found directly in the module directory,
# minus `index.yaml`; the overlay is given `var.namespace` and the shared
# label set.
data "kustomization_overlay" "data" {
  namespace     = var.namespace
  common_labels = local.common-labels
  resources = [
    for manifest in fileset(path.module, "*.yaml") : manifest
    if manifest != "index.yaml"
  ]
}

View File

@@ -0,0 +1,18 @@
---
# Component descriptor for the `alerts-containers` package of the `monitor`
# category.
# NOTE(review): every key sits at column 0 in this view, so the nesting under
# `metadata:` and `providers:` is not visible — confirm against the file on
# disk before relying on the structure shown here.
# NOTE(review): the API group is spelled `vinyl.solidite.fr` here, while the
# Install manifests and owner labels in this commit use `vynil.solidite.fr` —
# confirm which spelling the consumer of this file expects.
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-containers
description: null
# No options and no dependencies are declared for this component.
options: {}
dependencies: []
# Only `kubernetes` and `kubectl` are set to true; the rest are null.
# Presumably these are the Terraform providers the component needs — verify.
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,3 +1,4 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml # Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule kind: PrometheusRule

View File

@@ -0,0 +1,16 @@
# Labels applied to all objects of this module. The instance name is used
# twice: once as the vynil owner name and once as `app.kubernetes.io/instance`.
locals {
  common-labels = {
    "app.kubernetes.io/instance"        = var.instance
    "app.kubernetes.io/managed-by"      = "vynil"
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
  }
}
# Builds a kustomize overlay from the YAML manifests in the module directory.
# `index.yaml` is filtered out of the resource list; every other `*.yaml`
# file directly under `path.module` is included. The overlay receives the
# shared labels and `var.namespace`.
data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace     = var.namespace

  resources = [
    for yaml_file in fileset(path.module, "*.yaml") :
    yaml_file if yaml_file != "index.yaml"
  ]
}

View File

@@ -0,0 +1,18 @@
---
# Component descriptor for the `alerts-core` package of the `monitor`
# category.
# NOTE(review): every key sits at column 0 in this view, so the nesting under
# `metadata:` and `providers:` is not visible — confirm against the file on
# disk before relying on the structure shown here.
# NOTE(review): the API group is spelled `vinyl.solidite.fr` here, while the
# Install manifests and owner labels in this commit use `vynil.solidite.fr` —
# confirm which spelling the consumer of this file expects.
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-core
description: null
# No options and no dependencies are declared for this component.
options: {}
dependencies: []
# Only `kubernetes` and `kubectl` are set to true; the rest are null.
# Presumably these are the Terraform providers the component needs — verify.
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,3 +1,4 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml # Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule kind: PrometheusRule

View File

@@ -5,75 +5,11 @@ category: monitor
metadata: metadata:
name: monitor-control-plan name: monitor-control-plan
description: null description: null
options: options: {}
sub-domain:
default: to-be-set
examples:
- to-be-set
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
domain:
default: your-company
examples:
- your-company
type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
images:
default:
operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
examples:
- operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
operator:
default:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: docker.io
type: string
repository:
default: to-be/defined
type: string
tag:
default: v1.0.0
type: string
type: object
type: object
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
dependencies: [] dependencies: []
providers: providers:
kubernetes: true kubernetes: true
authentik: true authentik: null
kubectl: true kubectl: true
postgresql: null postgresql: null
restapi: null restapi: null

View File

@@ -6,41 +6,51 @@ metadata:
name: prometheus name: prometheus
description: null description: null
options: options:
shards: issuer:
default: 1 default: letsencrypt-prod
examples: examples:
- 1 - letsencrypt-prod
type: integer type: string
retention: app-group:
default: 10d default: monitor
examples: examples:
- 10d - monitor
type: string type: string
logLevel: logLevel:
default: info default: info
examples: examples:
- info - info
type: string type: string
domain-name: enableAdminAPI:
default: your_company.com default: false
examples: examples:
- your_company.com - false
type: string type: boolean
domain: replicas:
default: your-company default: 1
examples: examples:
- your-company - 1
type: string type: integer
listenLocal: listenLocal:
default: false default: false
examples: examples:
- false - false
type: boolean type: boolean
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
ingress-class: ingress-class:
default: traefik default: traefik
examples: examples:
- traefik - traefik
type: string type: string
shards:
default: 1
examples:
- 1
type: integer
images: images:
default: default:
prometheus: prometheus:
@@ -80,31 +90,21 @@ options:
type: string type: string
type: object type: object
type: object type: object
app-group: domain:
default: monitor default: your-company
examples: examples:
- monitor - your-company
type: string type: string
issuer: retention:
default: letsencrypt-prod default: 10d
examples: examples:
- letsencrypt-prod - 10d
type: string type: string
sub-domain: sub-domain:
default: prometheus default: prometheus
examples: examples:
- prometheus - prometheus
type: string type: string
replicas:
default: 1
examples:
- 1
type: integer
enableAdminAPI:
default: false
examples:
- false
type: boolean
dependencies: dependencies:
- dist: null - dist: null
category: share category: share

View File

@@ -1,68 +0,0 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml
# Alerting rules about the health of kube-state-metrics itself. All four
# alerts are `critical` and must hold for 15 minutes before firing.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-kube-state-metrics
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: kube-state-metrics
      rules:
        # More than 1% of list operations failed over the last 5 minutes.
        - alert: KubeStateMetricsListErrors
          annotations:
            description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
            summary: kube-state-metrics is experiencing errors in list operations.
          expr: |-
            (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
            /
            sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
            > 0.01
          for: 15m
          labels:
            severity: critical
        # More than 1% of watch operations failed over the last 5 minutes.
        - alert: KubeStateMetricsWatchErrors
          annotations:
            description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
            summary: kube-state-metrics is experiencing errors in watch operations.
          expr: |-
            (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
            /
            sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
            > 0.01
          for: 15m
          labels:
            severity: critical
        # Pods in one cluster report differing total-shard counts
        # (non-zero variance of kube_state_metrics_total_shards).
        - alert: KubeStateMetricsShardingMismatch
          annotations:
            description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
            summary: kube-state-metrics sharding is misconfigured.
          expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
          for: 15m
          labels:
            severity: critical
        # Compares the bitmask of expected shards (2^total - 1) with the
        # bitmask built from the shard ordinals actually reporting.
        - alert: KubeStateMetricsShardsMissing
          annotations:
            description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
            summary: kube-state-metrics shards are missing.
          expr: |-
            2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
            -
            sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
            != 0
          for: 15m
          labels:
            severity: critical

View File

@@ -1,32 +0,0 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml
# Recording rules that precompute the 0.99, 0.9 and 0.5 quantiles of the
# kubelet PLEG relist duration over a 5-minute rate window. Each result is
# joined with `kubelet_node_name` to attach the `node` label and is recorded
# under the same series name, distinguished by the `quantile` label.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-kubelet.rules
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: kubelet.rules
      rules:
        - expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
          labels:
            quantile: '0.99'
          record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
        - expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
          labels:
            quantile: '0.9'
          record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
        - expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
          labels:
            quantile: '0.5'
          record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile

View File

@@ -1,29 +0,0 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml
# Single critical alert that fires when no `up{job="kube-proxy"}` series has
# the value 1 for 15 minutes.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-kubernetes-system-kube-proxy
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: kubernetes-system-kube-proxy
      rules:
        - alert: KubeProxyDown
          annotations:
            description: KubeProxy has disappeared from Prometheus target discovery.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown
            summary: Target disappeared from Prometheus target discovery.
          expr: absent(up{job="kube-proxy"} == 1)
          for: 15m
          labels:
            severity: critical