This commit is contained in:
2024-01-22 16:56:33 +01:00
parent 8ad641e923
commit 0f2eced091
29 changed files with 263 additions and 325 deletions

View File

@@ -8,7 +8,7 @@ locals {
}
global = {
"domain" = var.namespace
"domain-name" = "admin.${var.domain-name}"
"domain-name" = "monitor.${var.domain-name}"
"issuer" = var.issuer
"ingress-class" = var.ingress-class
"backups" = var.backups
@@ -16,6 +16,8 @@ locals {
}
prometheus = { for k, v in var.prometheus : k => v if k!="enable" }
alertmanager = { for k, v in var.alertmanager : k => v if k!="enable" }
alerts-core = { for k, v in var.alerts-core : k => v if k!="enable" }
alerts-containers = { for k, v in var.alerts-containers : k => v if k!="enable" }
nodeExporter = { for k, v in var.node-exporter : k => v if k!="enable" }
kubeStateMetrics = merge({"cluster-admin" = true}, { for k, v in var.kube-state-metrics : k => v if k!="enable" })
monitorControlPlan = merge({"cluster-admin" = true}, { for k, v in var.monitor-control-plan : k => v if k!="enable" })
@@ -117,3 +119,38 @@ resource "kubectl_manifest" "monitorControlPlan" {
options: ${jsonencode(merge(local.global, local.monitorControlPlan))}
EOF
}
# Declares a vynil "Install" custom resource for the "alerts-core" component
# of the "monitor" category. The resource is created only when
# var.alerts-core.enable is true, and is ordered after the monitor namespace.
# The options passed to the component are local.global merged with
# local.alerts-core (later keys win in merge()).
resource "kubectl_manifest" "alerts-core" {
  count      = var.alerts-core.enable ? 1 : 0
  depends_on = [kubernetes_namespace_v1.monitor-ns]
  # The heredoc body is YAML, so nesting is significant: name/namespace/labels
  # must sit under `metadata`, and distrib/category/component/options under
  # `spec`. `<<-` strips the common leading indentation and keeps the relative
  # nesting intact.
  yaml_body = <<-EOF
    apiVersion: "vynil.solidite.fr/v1"
    kind: "Install"
    metadata:
      name: "alerts-core"
      namespace: "${kubernetes_namespace_v1.monitor-ns[0].metadata[0].name}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      distrib: "${var.distributions.domain}"
      category: "monitor"
      component: "alerts-core"
      options: ${jsonencode(merge(local.global, local.alerts-core))}
  EOF
}
# Declares a vynil "Install" custom resource for the "alerts-containers"
# component of the "monitor" category. The resource is created only when
# var.alerts-containers.enable is true, and is ordered after the monitor
# namespace. The options passed to the component are local.global merged with
# local.alerts-containers (later keys win in merge()).
resource "kubectl_manifest" "alerts-containers" {
  count      = var.alerts-containers.enable ? 1 : 0
  depends_on = [kubernetes_namespace_v1.monitor-ns]
  # The heredoc body is YAML, so nesting is significant: name/namespace/labels
  # must sit under `metadata`, and distrib/category/component/options under
  # `spec`. `<<-` strips the common leading indentation and keeps the relative
  # nesting intact.
  yaml_body = <<-EOF
    apiVersion: "vynil.solidite.fr/v1"
    kind: "Install"
    metadata:
      name: "alerts-containers"
      namespace: "${kubernetes_namespace_v1.monitor-ns[0].metadata[0].name}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      distrib: "${var.distributions.domain}"
      category: "monitor"
      component: "alerts-containers"
      options: ${jsonencode(merge(local.global, local.alerts-containers))}
  EOF
}

View File

@@ -6,11 +6,91 @@ metadata:
name: domain-monitor
description: null
options:
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
app-group:
default: monitor
examples:
- monitor
type: string
alertmanager:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: alertmanager
alerts-core:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: alerts-core
domain:
default: your-company
examples:
- your-company
type: string
prometheus:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: prometheus
monitor-control-plan:
default:
enable: false
examples:
- enable: false
properties:
enable:
default: false
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: monitor-control-plan
kube-state-metrics:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: kube-state-metrics
storage-classes:
default:
BlockReadWriteMany: ''
@@ -36,12 +116,7 @@ options:
default: ''
type: string
type: object
ingress-class:
default: traefik
examples:
- traefik
type: string
alertmanager:
alerts-containers:
default:
enable: true
examples:
@@ -52,7 +127,19 @@ options:
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: alertmanager
x-vynil-package: alerts-containers
node-exporter:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: node-exporter
backups:
default:
enable: false
@@ -83,59 +170,6 @@ options:
default: backup-settings
type: string
type: object
app-group:
default: infra
examples:
- infra
type: string
kube-state-metrics:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: kube-state-metrics
monitor-control-plan:
default:
enable: false
examples:
- enable: false
properties:
enable:
default: false
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: monitor-control-plan
prometheus:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: prometheus
node-exporter:
default:
enable: true
examples:
- enable: true
properties:
enable:
default: true
type: boolean
type: object
x-vynil-category: monitor
x-vynil-package: node-exporter
distributions:
default:
core: core
@@ -151,16 +185,6 @@ options:
default: domain
type: string
type: object
domain:
default: your-company
examples:
- your-company
type: string
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
dependencies: []
providers:
kubernetes: true

View File

@@ -6,16 +6,41 @@ metadata:
name: alertmanager
description: null
options:
sub-domain:
default: to-be-set
replicas:
default: 1
examples:
- to-be-set
- 1
type: integer
listenLocal:
default: false
examples:
- false
type: boolean
logLevel:
default: info
examples:
- info
type: string
sub-domain:
default: alertmanager
examples:
- alertmanager
type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
retention:
default: 120h
examples:
- 120h
type: string
app-group:
default: monitor
examples:
- monitor
type: string
issuer:
default: letsencrypt-prod
examples:
@@ -60,41 +85,16 @@ options:
type: string
type: object
type: object
replicas:
default: 1
examples:
- 1
type: integer
retention:
default: 120h
examples:
- 120h
type: string
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
app-group:
default: monitor
examples:
- monitor
type: string
logLevel:
default: info
examples:
- info
type: string
domain:
default: your-company
examples:
- your-company
type: string
listenLocal:
default: false
examples:
- false
type: boolean
dependencies:
- dist: null
category: share

View File

@@ -0,0 +1,16 @@
# Label set shared across this module, exposed as local.common-labels.
# It combines the vynil ownership labels (name, namespace, category,
# component) with the app.kubernetes.io identification labels.
locals {
  common-labels = {
    "app.kubernetes.io/instance"        = var.instance
    "app.kubernetes.io/managed-by"      = "vynil"
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
  }
}
# Kustomize overlay over every "*.yaml" file found in this module directory,
# except "index.yaml". The overlay sets the target namespace and applies the
# shared label set.
data "kustomization_overlay" "data" {
  namespace     = var.namespace
  common_labels = local.common-labels
  resources = [
    for manifest in fileset(path.module, "*.yaml") : manifest
    if manifest != "index.yaml"
  ]
}

View File

@@ -0,0 +1,18 @@
---
# Component descriptor for the "alerts-containers" package in the "monitor"
# category. It declares no options and no dependencies.
# NOTE(review): the API group here is spelled "vinyl" while the Install
# manifests in this changeset use "vynil.solidite.fr" — confirm which spelling
# the consumer of this descriptor expects before changing either.
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-containers
description: null
# Empty option schema and empty dependency list.
options: {}
dependencies: []
# Provider flags: kubernetes and kubectl are set to true, the rest are null.
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,3 +1,4 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule

View File

@@ -0,0 +1,16 @@
# Label set shared across this module, exposed as local.common-labels.
# It combines the vynil ownership labels (name, namespace, category,
# component) with the app.kubernetes.io identification labels.
locals {
  common-labels = {
    "app.kubernetes.io/instance"        = var.instance
    "app.kubernetes.io/managed-by"      = "vynil"
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
  }
}
# Kustomize overlay over every "*.yaml" file found in this module directory,
# except "index.yaml". The overlay sets the target namespace and applies the
# shared label set.
data "kustomization_overlay" "data" {
  namespace     = var.namespace
  common_labels = local.common-labels
  resources = [
    for manifest in fileset(path.module, "*.yaml") : manifest
    if manifest != "index.yaml"
  ]
}

View File

@@ -0,0 +1,18 @@
---
# Component descriptor for the "alerts-core" package in the "monitor"
# category. It declares no options and no dependencies.
# NOTE(review): the API group here is spelled "vinyl" while the Install
# manifests in this changeset use "vynil.solidite.fr" — confirm which spelling
# the consumer of this descriptor expects before changing either.
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-core
description: null
# Empty option schema and empty dependency list.
options: {}
dependencies: []
# Provider flags: kubernetes and kubectl are set to true, the rest are null.
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,3 +1,4 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule

View File

@@ -5,75 +5,11 @@ category: monitor
metadata:
name: monitor-control-plan
description: null
options:
sub-domain:
default: to-be-set
examples:
- to-be-set
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
domain:
default: your-company
examples:
- your-company
type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
images:
default:
operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
examples:
- operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
operator:
default:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: docker.io
type: string
repository:
default: to-be/defined
type: string
tag:
default: v1.0.0
type: string
type: object
type: object
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
options: {}
dependencies: []
providers:
kubernetes: true
authentik: true
authentik: null
kubectl: true
postgresql: null
restapi: null

View File

@@ -6,41 +6,51 @@ metadata:
name: prometheus
description: null
options:
shards:
default: 1
issuer:
default: letsencrypt-prod
examples:
- 1
type: integer
retention:
default: 10d
- letsencrypt-prod
type: string
app-group:
default: monitor
examples:
- 10d
- monitor
type: string
logLevel:
default: info
examples:
- info
type: string
domain-name:
default: your_company.com
enableAdminAPI:
default: false
examples:
- your_company.com
type: string
domain:
default: your-company
- false
type: boolean
replicas:
default: 1
examples:
- your-company
type: string
- 1
type: integer
listenLocal:
default: false
examples:
- false
type: boolean
domain-name:
default: your_company.com
examples:
- your_company.com
type: string
ingress-class:
default: traefik
examples:
- traefik
type: string
shards:
default: 1
examples:
- 1
type: integer
images:
default:
prometheus:
@@ -80,31 +90,21 @@ options:
type: string
type: object
type: object
app-group:
default: monitor
domain:
default: your-company
examples:
- monitor
- your-company
type: string
issuer:
default: letsencrypt-prod
retention:
default: 10d
examples:
- letsencrypt-prod
- 10d
type: string
sub-domain:
default: prometheus
examples:
- prometheus
type: string
replicas:
default: 1
examples:
- 1
type: integer
enableAdminAPI:
default: false
examples:
- false
type: boolean
dependencies:
- dist: null
category: share

View File

@@ -1,68 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml
# PrometheusRule with one group, "kube-state-metrics", holding four alerts:
# KubeStateMetricsListErrors, KubeStateMetricsWatchErrors,
# KubeStateMetricsShardingMismatch and KubeStateMetricsShardsMissing.
# Every alert carries severity "critical" and a 15m `for` duration.
# The two error-rate alerts trigger when more than 1% (> 0.01) of list/watch
# operations over the last 5m resulted in an error, per cluster.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kube-state-metrics
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.0.2"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.0.2
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
-
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
labels:
severity: critical

View File

@@ -1,32 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubelet.rules.yaml
# PrometheusRule with one group, "kubelet.rules", holding three recording
# rules. Each records the same series name with a different `quantile` label
# (0.99, 0.9, 0.5), computed with histogram_quantile over the 5m rate of
# kubelet_pleg_relist_duration_seconds_bucket and joined with
# kubelet_node_name via group_left(node).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubelet.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.0.2"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.0.2
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubelet.rules
rules:
- expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile

View File

@@ -1,29 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kube-proxy.yaml
# PrometheusRule with one group, "kubernetes-system-kube-proxy", holding a
# single alert, KubeProxyDown. Its expression is true when no series
# up{job="kube-proxy"} equals 1; the alert has a 15m `for` duration and
# severity "critical".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-system-kube-proxy
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.0.2"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.0.2
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-system-kube-proxy
rules:
- alert: KubeProxyDown
annotations:
description: KubeProxy has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeproxydown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-proxy"} == 1)
for: 15m
labels:
severity: critical