commit 0961759af7
parent ccb828c881
2024-01-28 10:00:47 +01:00
173 changed files with 449 additions and 8800 deletions

apps/infisical/configs.tf (new file)

@@ -0,0 +1,21 @@
resource "kubectl_manifest" "config" {
yaml_body = <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: "${var.component}-${var.instance}"
namespace: "${var.namespace}"
labels: ${jsonencode(local.common-labels)}
data:
INVITE_ONLY_SIGNUP: ""
REDIS_URL: "${module.redis.url}"
SITE_URL: "https://${local.dns_name}"
SMTP_FROM_ADDRESS: ""
SMTP_FROM_NAME: "Infisical"
SMTP_HOST: ""
SMTP_PASSWORD: ""
SMTP_PORT: "587"
SMTP_SECURE: ""
SMTP_USERNAME: ""
EOF
}
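
Note: the variable declarations backing these new files are not included in this diff. A minimal, hypothetical variables.tf sketch of what configs.tf and deploy.tf appear to assume, with names and shapes inferred from usage only (the vynil framework likely generates these from the options in index.yaml):

# Hypothetical variables.tf for apps/infisical; inferred from usage in this
# commit, not part of it.
variable "component" {
  type = string
}
variable "instance" {
  type = string
}
variable "namespace" {
  type = string
}
variable "replicas" {
  type    = number
  default = 1
}
variable "images" {
  # Mirrors the images map in index.yaml (infisical, redis, redis_exporter).
  type = map(object({
    pullPolicy = string
    registry   = string
    repository = string
    tag        = string
  }))
}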


@@ -1,20 +1,26 @@
locals {
authentik_url = "http://authentik.${var.domain}-auth.svc"
authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/name" = var.component
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("v1_Service_prometheus",file))<1]
data "kubernetes_secret_v1" "authentik" {
metadata {
name = "authentik"
namespace = "${var.domain}-auth"
}
}
data "kustomization_overlay" "data_no_ns" {
resources = [for file in fileset(path.module, "*.yaml"): file if length(regexall("v1_Service_prometheus",file))>0]
data "kustomization_overlay" "data" {
namespace = var.namespace
common_labels = local.common-labels
resources = []
}

apps/infisical/deploy.tf (new file)

@@ -0,0 +1,40 @@
resource "kubectl_manifest" "deploy" {
yaml_body = <<-EOF
apiVersion: apps/v1
kind: Deployment
metadata:
name: "${var.component}-${var.instance}"
namespace: "${var.namespace}"
labels: ${jsonencode(local.common-labels)}
annotations:
configmap.reloader.stakater.com/reload: "${kubectl_manifest.config.name}"
secret.reloader.stakater.com/reload: "${kubectl_manifest.secret.name}"
spec:
replicas: ${var.replicas}
selector:
matchLabels: ${jsonencode(local.common-labels)}
template:
metadata:
labels: ${jsonencode(local.common-labels)}
spec:
containers:
- name: infisical-backend
image: "${var.images.infiscal.registry}/${var.images.infiscal.repository}:${var.images.infiscal.tag}"
imagePullPolicy: "${var.images.infiscal.pullPolicy}"
readinessProbe:
httpGet:
path: /api/status
port: 8080
initialDelaySeconds: 10
periodSeconds: 10
ports:
- containerPort: 8080
name: http
protocol: TCP
envFrom:
- configMapRef:
name: "${kubectl_manifest.config.name}"
- secretRef:
name: "${kubectl_manifest.secret.name}"
EOF
}


@@ -1,70 +1,66 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
category: apps
metadata:
name: grafana
name: infisical
description: null
options:
domain:
default: your-company
language:
default: fr_FR
examples:
- your-company
- fr_FR
type: string
sub_domain:
default: to-be-set
examples:
- to-be-set
type: string
replicas:
default: 1
examples:
- 1
type: integer
images:
default:
busybox:
registry: docker.io
repository: library/busybox
tag: 1.31.1
grafana:
infisical:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/grafana
tag: 10.2.3
sidecar:
repository: infisical/infisical
tag: v1.0.0
redis:
pullPolicy: IfNotPresent
registry: quay.io
repository: kiwigrid/k8s-sidecar
tag: 1.25.2
repository: opstree/redis
tag: v7.0.12
redis_exporter:
pullPolicy: IfNotPresent
registry: quay.io
repository: opstree/redis-exporter
tag: v1.44.0
examples:
- busybox:
registry: docker.io
repository: library/busybox
tag: 1.31.1
grafana:
- infisical:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/grafana
tag: 10.2.3
sidecar:
repository: infisical/infisical
tag: v1.0.0
redis:
pullPolicy: IfNotPresent
registry: quay.io
repository: kiwigrid/k8s-sidecar
tag: 1.25.2
repository: opstree/redis
tag: v7.0.12
redis_exporter:
pullPolicy: IfNotPresent
registry: quay.io
repository: opstree/redis-exporter
tag: v1.44.0
properties:
busybox:
default:
registry: docker.io
repository: library/busybox
tag: 1.31.1
properties:
registry:
default: docker.io
type: string
repository:
default: library/busybox
type: string
tag:
default: 1.31.1
type: string
type: object
grafana:
infisical:
default:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/grafana
tag: 10.2.3
repository: infisical/infisical
tag: v1.0.0
properties:
pullPolicy:
default: IfNotPresent
@@ -77,18 +73,18 @@ options:
default: docker.io
type: string
repository:
default: grafana/grafana
default: infisical/infisical
type: string
tag:
default: 10.2.3
default: v1.0.0
type: string
type: object
sidecar:
redis:
default:
pullPolicy: IfNotPresent
registry: quay.io
repository: kiwigrid/k8s-sidecar
tag: 1.25.2
repository: opstree/redis
tag: v7.0.12
properties:
pullPolicy:
default: IfNotPresent
@@ -101,92 +97,88 @@ options:
default: quay.io
type: string
repository:
default: kiwigrid/k8s-sidecar
default: opstree/redis
type: string
tag:
default: 1.25.2
default: v7.0.12
type: string
type: object
redis_exporter:
default:
pullPolicy: IfNotPresent
registry: quay.io
repository: opstree/redis-exporter
tag: v1.44.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: quay.io
type: string
repository:
default: opstree/redis-exporter
type: string
tag:
default: v1.44.0
type: string
type: object
type: object
ingress_class:
default: traefik
sso_vynil:
default: true
examples:
- traefik
- true
type: boolean
timezone:
default: Europe/Paris
examples:
- Europe/Paris
type: string
domain_name:
default: your_company.com
default: your-company.com
examples:
- your_company.com
- your-company.com
type: string
domain:
default: your-company
examples:
- your-company
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
admin_name:
default: grafana_admin
ingress_class:
default: traefik
examples:
- grafana_admin
type: string
sub_domain:
default: grafana
examples:
- grafana
- traefik
type: string
app_group:
default: monitor
default: apps
examples:
- monitor
- apps
type: string
storage:
default:
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
description: Configure this app storage
examples:
- volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
volume:
default:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
accessMode:
default: ReadWriteOnce
enum:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
type: string
class:
default: ''
type: string
size:
default: 10Gi
type: string
type:
default: Filesystem
enum:
- Filesystem
- Block
type: string
type: object
type: object
dependencies: []
dependencies:
- dist: null
category: core
component: secret-generator
- dist: null
category: dbo
component: mongo
- dist: null
category: dbo
component: redis
providers:
kubernetes: true
authentik: true
kubectl: true
postgresql: null
mysql: null
restapi: true
http: true
gitea: null


@@ -1,17 +1,22 @@
locals {
dns_name = "${var.sub_domain}.${var.domain_name}"
dns_names = [local.dns_name]
app_name = var.component == var.instance ? var.instance : format("%s-%s", var.component, var.instance)
icon = "favicon.ico"
icon = "icon.svg"
request_headers = {
"Content-Type" = "application/json"
Authorization = "Bearer ${data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]}"
}
service = {
"name" = "alertmanager-operated"
"port" = {
"number" = 9093
}
}
module "service" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//service"
component = var.component
instance = var.instance
namespace = var.namespace
labels = local.common-labels
targets = ["http"]
providers = {
kubectl = kubectl
}
}
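
For context, module.service.default_definition (consumed by the two hunks below) replaces the removed hard-coded local.service value. A hedged sketch of the output the service module is assumed to expose, modeled on the removed structure; the kydah-modules source is not part of this commit:

# Assumed output inside the kydah-modules service module (not shown here).
# The name and port are illustrative: the deployment above exposes port 8080
# under the name "http", which matches targets = ["http"].
output "default_definition" {
  value = {
    name = "${var.component}-${var.instance}"
    port = {
      number = 8080
    }
  }
}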
@@ -25,7 +30,7 @@ module "ingress" {
labels = local.common-labels
dns_names = local.dns_names
middlewares = [module.forward.middleware]
services = [local.service]
services = [module.service.default_definition]
providers = {
kubectl = kubectl
}
@@ -63,7 +68,7 @@ module "forward" {
ingress_class = var.ingress_class
labels = local.common-labels
dns_names = local.dns_names
service = local.service
service = module.service.default_definition
icon = local.icon
request_headers = local.request_headers
providers = {

apps/infisical/secret.tf (new file)

@@ -0,0 +1,31 @@
resource "kubectl_manifest" "secret" {
ignore_fields = ["metadata.annotations"]
yaml_body = <<-EOF
apiVersion: "secretgenerator.mittwald.de/v1alpha1"
kind: "StringSecret"
metadata:
name: "${var.component}-${var.instance}"
namespace: "${var.namespace}"
labels: ${jsonencode(local.common-labels)}
spec:
forceRegenerate: false
data:
MONGO_URL: "${module.mongo.url}"
fields:
- fieldName: "JWT_SIGNUP_SECRET"
length: "32"
- fieldName: "JWT_SERVICE_SECRET"
length: "32"
- fieldName: "JWT_REFRESH_SECRET"
length: "32"
- fieldName: "JWT_PROVIDER_AUTH_SECRET"
length: "32"
- fieldName: "JWT_MFA_SECRET"
length: "32"
- fieldName: "JWT_AUTH_SECRET"
length: "32"
- fieldName: "ENCRYPTION_KEY"
length: "32"
EOF
}
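
The StringSecret above is reconciled by the mittwald secret-generator operator (the core/secret-generator dependency declared in index.yaml): it should produce a Secret of the same name containing MONGO_URL plus a random string of the requested length for each listed field, which is why the Deployment can reference it directly via secretRef. A small illustrative sketch, not part of this commit, of reading that generated Secret back from Terraform:

# Illustrative only: how the generated Secret could be read back if needed.
data "kubernetes_secret_v1" "infisical_generated" {
  metadata {
    name      = "${var.component}-${var.instance}"
    namespace = var.namespace
  }
}
# data.kubernetes_secret_v1.infisical_generated.data would then hold
# MONGO_URL and the generated JWT_* / ENCRYPTION_KEY values.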

apps/infisical/storage.tf (new file)

@@ -0,0 +1,21 @@
module "redis" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//redis"
component = var.component
instance = var.instance
namespace = var.namespace
labels = local.common-labels
images = var.images
providers = {
kubectl = kubectl
}
}
module "mongo" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//mongo"
component = var.component
instance = var.instance
namespace = var.namespace
labels = local.common-labels
providers = {
kubectl = kubectl
}
}
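
Neither module's source is part of this diff; configs.tf and secret.tf only depend on each one exposing a url output (module.redis.url and module.mongo.url). A hypothetical sketch of that assumed interface inside the module:

# Assumed interface of the redis/mongo modules in kydah-modules (not shown in
# this commit). Each module receives component, instance and namespace as
# inputs (see the calls above) and returns a connection string.
output "url" {
  # Hypothetical value; the real host/port depend on the module internals.
  value = "redis://${var.component}-${var.instance}.${var.namespace}.svc:6379"
}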


@@ -6,21 +6,175 @@ metadata:
name: nextcloud
description: null
options:
ingress_class:
default: traefik
issuer:
default: letsencrypt-prod
examples:
- traefik
- letsencrypt-prod
type: string
app_group:
default: ''
postgres:
default:
replicas: 1
examples:
- ''
type: string
openid-name:
default: vynil
- replicas: 1
properties:
replicas:
default: 1
type: integer
type: object
apps:
default:
audioplayer: false
bookmarks: false
bpm: false
calendar: false
collabora: false
contacts: false
deck: false
groupfolders: true
mindmap: false
music: false
notes: false
onlyoffice: false
passman: false
spreed: false
tables: false
tasks: false
texteditor: true
examples:
- vynil
type: string
- audioplayer: false
bookmarks: false
bpm: false
calendar: false
collabora: false
contacts: false
deck: false
groupfolders: true
mindmap: false
music: false
notes: false
onlyoffice: false
passman: false
spreed: false
tables: false
tasks: false
texteditor: true
properties:
audioplayer:
default: false
type: boolean
bookmarks:
default: false
type: boolean
bpm:
default: false
type: boolean
calendar:
default: false
type: boolean
collabora:
default: false
type: boolean
contacts:
default: false
type: boolean
deck:
default: false
type: boolean
groupfolders:
default: true
type: boolean
mindmap:
default: false
type: boolean
music:
default: false
type: boolean
notes:
default: false
type: boolean
onlyoffice:
default: false
type: boolean
passman:
default: false
type: boolean
spreed:
default: false
type: boolean
tables:
default: false
type: boolean
tasks:
default: false
type: boolean
texteditor:
default: true
type: boolean
type: object
storage:
default:
postgres:
size: 5Gi
redis:
size: 2Gi
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
description: Configure this app storage
examples:
- postgres:
size: 5Gi
redis:
size: 2Gi
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
postgres:
default:
size: 5Gi
properties:
size:
default: 5Gi
type: string
type: object
redis:
default:
size: 2Gi
properties:
size:
default: 2Gi
type: string
type: object
volume:
default:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
accessMode:
default: ReadWriteOnce
enum:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
type: string
class:
default: ''
type: string
size:
default: 10Gi
type: string
type:
default: Filesystem
type: string
type: object
type: object
backups:
default:
enable: false
@@ -127,182 +281,6 @@ options:
default: false
type: boolean
type: object
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
storage:
default:
postgres:
size: 5Gi
redis:
size: 2Gi
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
description: Configure this app storage
examples:
- postgres:
size: 5Gi
redis:
size: 2Gi
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
postgres:
default:
size: 5Gi
properties:
size:
default: 5Gi
type: string
type: object
redis:
default:
size: 2Gi
properties:
size:
default: 2Gi
type: string
type: object
volume:
default:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
accessMode:
default: ReadWriteOnce
enum:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
type: string
class:
default: ''
type: string
size:
default: 10Gi
type: string
type:
default: Filesystem
type: string
type: object
type: object
apps:
default:
audioplayer: false
bookmarks: false
bpm: false
calendar: false
collabora: false
contacts: false
deck: false
groupfolders: true
mindmap: false
music: false
notes: false
onlyoffice: false
passman: false
spreed: false
tables: false
tasks: false
texteditor: true
examples:
- audioplayer: false
bookmarks: false
bpm: false
calendar: false
collabora: false
contacts: false
deck: false
groupfolders: true
mindmap: false
music: false
notes: false
onlyoffice: false
passman: false
spreed: false
tables: false
tasks: false
texteditor: true
properties:
audioplayer:
default: false
type: boolean
bookmarks:
default: false
type: boolean
bpm:
default: false
type: boolean
calendar:
default: false
type: boolean
collabora:
default: false
type: boolean
contacts:
default: false
type: boolean
deck:
default: false
type: boolean
groupfolders:
default: true
type: boolean
mindmap:
default: false
type: boolean
music:
default: false
type: boolean
notes:
default: false
type: boolean
onlyoffice:
default: false
type: boolean
passman:
default: false
type: boolean
spreed:
default: false
type: boolean
tables:
default: false
type: boolean
tasks:
default: false
type: boolean
texteditor:
default: true
type: boolean
type: object
redis:
default:
exporter:
enabled: true
examples:
- exporter:
enabled: true
properties:
exporter:
default:
enabled: true
properties:
enabled:
default: true
type: boolean
type: object
type: object
images:
default:
collabora:
@@ -558,6 +536,38 @@ options:
type: string
type: object
type: object
redis:
default:
exporter:
enabled: true
examples:
- exporter:
enabled: true
properties:
exporter:
default:
enabled: true
properties:
enabled:
default: true
type: boolean
type: object
type: object
admin:
default:
name: nextcloud_admin
examples:
- name: nextcloud_admin
properties:
name:
default: nextcloud_admin
type: string
type: object
app_group:
default: ''
examples:
- ''
type: string
domain_name:
default: your_company.com
examples:
@@ -583,31 +593,21 @@ options:
default: 1
type: integer
type: object
admin:
default:
name: nextcloud_admin
ingress_class:
default: traefik
examples:
- name: nextcloud_admin
properties:
name:
default: nextcloud_admin
type: string
type: object
postgres:
default:
replicas: 1
examples:
- replicas: 1
properties:
replicas:
default: 1
type: integer
type: object
- traefik
type: string
domain:
default: your-company
examples:
- your-company
type: string
openid-name:
default: vynil
examples:
- vynil
type: string
sub_domain:
default: files
examples:
@@ -628,7 +628,8 @@ providers:
authentik: true
kubectl: true
postgresql: null
mysql: null
restapi: null
http: null
gitea: null
tfaddtype: null
tfaddtype: true


@@ -1,167 +0,0 @@
locals {
authentik_url = "http://authentik.${var.domain}-auth.svc"
authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
rb-patch = <<-EOF
- op: replace
path: /subjects/0/namespace
value: "${var.namespace}"
EOF
}
data "kubernetes_secret_v1" "authentik" {
metadata {
name = "authentik"
namespace = "${var.domain}-auth"
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
patches {
target {
kind = "Alertmanager"
name = "alertmanager-kube-promethe-alertmanager"
}
patch = <<-EOF
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager-kube-promethe-alertmanager
spec:
image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
version: ${var.images.alertmanager.tag}
externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
replicas: ${var.replicas}
listenLocal: ${var.listenLocal}
logLevel: "${var.logLevel}"
retention: "${var.retention}"
EOF
}
patches {
target {
kind = "ConfigMap"
name = "alertmanager-kube-grafana-datasource"
}
patch = <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-operated
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: Alertmanager
type: alertmanager
uid: alertmanager
url: http://${var.component}-${var.instance}.${var.namespace}:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus
EOF
}
patches {
target {
kind = "ServiceMonitor"
name = "alertmanager-kube-promethe-alertmanager"
}
patch = <<-EOF
- op: replace
path: /spec/namespaceSelector/matchNames/0
value: "${var.namespace}"
EOF
}
patches {
target {
kind = "PrometheusRule"
name = "alertmanager-kube-promethe-alertmanager.rules"
}
patch = <<-EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: alertmanager-kube-promethe-alertmanager.rules
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
- alert: AlertmanagerMembersInconsistent
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
< on (namespace,service,cluster) group_left
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
- alert: AlertmanagerFailedToSendAlerts
expr: |-
(
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
)
> 0.01
- alert: AlertmanagerClusterFailedToSendAlerts
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
)
> 0.01
- alert: AlertmanagerClusterFailedToSendAlerts
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
)
> 0.01
- alert: AlertmanagerConfigInconsistent
expr: |-
count by (namespace,service,cluster) (
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
)
!= 1
- alert: AlertmanagerClusterDown
expr: |-
(
count by (namespace,service,cluster) (
avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
)
/
count by (namespace,service,cluster) (
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
)
)
>= 0.5
- alert: AlertmanagerClusterCrashlooping
expr: |-
(
count by (namespace,service,cluster) (
changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
)
/
count by (namespace,service,cluster) (
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
)
)
>= 0.5
EOF
}
}


@@ -1,110 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alertmanager
description: null
options:
replicas:
default: 1
examples:
- 1
type: integer
app_group:
default: monitor
examples:
- monitor
type: string
listenLocal:
default: false
examples:
- false
type: boolean
domain_name:
default: your_company.com
examples:
- your_company.com
type: string
domain:
default: your-company
examples:
- your-company
type: string
images:
default:
alertmanager:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/alertmanager
tag: v0.26.0
examples:
- alertmanager:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/alertmanager
tag: v0.26.0
properties:
alertmanager:
default:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/alertmanager
tag: v0.26.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: quay.io
type: string
repository:
default: prometheus/alertmanager
type: string
tag:
default: v0.26.0
type: string
type: object
type: object
logLevel:
default: info
examples:
- info
type: string
ingress_class:
default: traefik
examples:
- traefik
type: string
sub_domain:
default: alertmanager
examples:
- alertmanager
type: string
retention:
default: 120h
examples:
- 120h
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
dependencies:
- dist: null
category: share
component: authentik-forward
providers:
kubernetes: true
authentik: true
kubectl: true
postgresql: null
restapi: true
http: true
gitea: null
tfaddtype: null


@@ -1,38 +0,0 @@
# Source: kube-prometheus-stack/templates/alertmanager/alertmanager.yaml
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: alertmanager-kube-promethe-alertmanager
namespace: vynil-monitor
labels:
app: kube-prometheus-stack-alertmanager
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: alertmanager
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "alertmanager"
heritage: "Helm"
spec:
image: "quay.io/prometheus/alertmanager:v0.26.0"
version: v0.26.0
replicas: 1
listenLocal: false
serviceAccountName: alertmanager-kube-promethe-alertmanager
externalUrl: http://alertmanager-kube-promethe-alertmanager.vynil-monitor:9093
paused: false
logFormat: "logfmt"
logLevel: "info"
retention: "120h"
alertmanagerConfigSelector: {}
alertmanagerConfigNamespaceSelector: {}
routePrefix: "/"
securityContext:
fsGroup: 2000
runAsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
portName: http-web


@@ -1,142 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: alertmanager-kube-promethe-alertmanager.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: alertmanager
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "alertmanager"
heritage: "Helm"
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |-
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
< on (namespace,service,cluster) group_left
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]))
for: 15m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |-
(
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |-
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
/
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |-
count by (namespace,service,cluster) (
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |-
(
count by (namespace,service,cluster) (
avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) < 0.5
)
/
count by (namespace,service,cluster) (
up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |-
(
count by (namespace,service,cluster) (
changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[10m]) > 4
)
/
count by (namespace,service,cluster) (
up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
)
)
>= 0.5
for: 5m
labels:
severity: critical


@@ -1,33 +0,0 @@
# Source: kube-prometheus-stack/templates/alertmanager/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: alertmanager-kube-promethe-alertmanager
namespace: vynil-monitor
labels:
app: kube-prometheus-stack-alertmanager
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: alertmanager
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "alertmanager"
heritage: "Helm"
spec:
selector:
matchLabels:
app: kube-prometheus-stack-alertmanager
release: "alertmanager"
self-monitor: "true"
namespaceSelector:
matchNames:
- "vynil-monitor"
endpoints:
- port: http-web
enableHttp2: true
path: "/metrics"
- port: reloader-web
scheme: http
path: "/metrics"


@@ -1,33 +0,0 @@
locals {
svc-label = merge(local.common-labels, {
"app" = "kube-prometheus-stack-prometheus"
"release" = "prometheus"
"self-monitor" = "true"
})
}
resource "kubectl_manifest" "svc" {
yaml_body = <<-EOF
apiVersion: v1
kind: Service
metadata:
name: "${var.component}-${var.instance}"
namespace: "${var.namespace}"
labels: ${jsonencode(local.svc-label)}
spec:
ports:
- name: http-web
port: 9093
targetPort: 9093
protocol: TCP
- name: reloader-web
appProtocol: http
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/name: alertmanager
alertmanager: alertmanager-kube-promethe-alertmanager
sessionAffinity: None
type: "ClusterIP"
EOF
}


@@ -1,19 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-kube-grafana-datasource
labels:
grafana_datasource: "1"
app: alertmanager
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: Alertmanager
type: alertmanager
uid: alertmanager
url: http://alertmanager-operated.vynil-monitor:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus

File diff suppressed because one or more lines are too long


@@ -1,18 +0,0 @@
# Source: kube-prometheus-stack/templates/alertmanager/secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-alertmanager-kube-promethe-alertmanager
namespace: vynil-monitor
labels:
app: kube-prometheus-stack-alertmanager
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: alertmanager
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "alertmanager"
heritage: "Helm"
data:
alertmanager.yaml: "Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KaW5oaWJpdF9ydWxlczoKLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIC0gYWxlcnRuYW1lCiAgc291cmNlX21hdGNoZXJzOgogIC0gc2V2ZXJpdHkgPSBjcml0aWNhbAogIHRhcmdldF9tYXRjaGVyczoKICAtIHNldmVyaXR5ID1+IHdhcm5pbmd8aW5mbwotIGVxdWFsOgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBzb3VyY2VfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IHdhcm5pbmcKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIHNvdXJjZV9tYXRjaGVyczoKICAtIGFsZXJ0bmFtZSA9IEluZm9JbmhpYml0b3IKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBhbGVydG5hbWUgPSBJbmZvSW5oaWJpdG9yCnJlY2VpdmVyczoKLSBuYW1lOiAibnVsbCIKcm91dGU6CiAgZ3JvdXBfYnk6CiAgLSBuYW1lc3BhY2UKICBncm91cF9pbnRlcnZhbDogNW0KICBncm91cF93YWl0OiAzMHMKICByZWNlaXZlcjogIm51bGwiCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByb3V0ZXM6CiAgLSBtYXRjaGVyczoKICAgIC0gYWxlcnRuYW1lID0gIldhdGNoZG9nIgogICAgcmVjZWl2ZXI6ICJudWxsIgp0ZW1wbGF0ZXM6Ci0gL2V0Yy9hbGVydG1hbmFnZXIvY29uZmlnLyoudG1wbA=="


@@ -1,20 +0,0 @@
---
# Source: kube-prometheus-stack/templates/alertmanager/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: alertmanager-kube-promethe-alertmanager
namespace: vynil-monitor
labels:
app: kube-prometheus-stack-alertmanager
app.kubernetes.io/name: kube-prometheus-stack-alertmanager
app.kubernetes.io/component: alertmanager
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: alertmanager
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "alertmanager"
heritage: "Helm"
automountServiceAccountToken: true


@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}


@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-containers
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null


@@ -1,28 +0,0 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-cpu-usage-seconds
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
rules:
- expr: |-
sum by (cluster, namespace, pod, container) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate


@@ -1,26 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-memory-cache
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_memory_cache
rules:
- expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache


@@ -1,26 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-memory-rss
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_memory_rss
rules:
- expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss


@@ -1,26 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-memory-swap
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_memory_swap
rules:
- expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap


@@ -1,26 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-memory-working-se
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_memory_working_set_bytes
rules:
- expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes


@@ -1,88 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.container-resource
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.container_resource
rules:
- expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
- expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
group_left() max by (namespace, pod, cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- expr: |-
sum by (namespace, cluster) (
sum by (namespace, pod, cluster) (
max by (namespace, pod, container, cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum


@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}


@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: alerts-core
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null


@@ -1,32 +0,0 @@
---
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-config-reloaders
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: config-reloaders
rules:
- alert: ConfigReloaderSidecarErrors
annotations:
description: 'Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
summary: config-reloader sidecar has not had a successful reload for 10m
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
for: 10m
labels:
severity: warning


@@ -1,67 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-general.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none


@@ -1,67 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-k8s.rules.pod-owner
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: k8s.rules.pod_owner
rules:
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on (replicaset, namespace) group_left(owner_name) topk by (replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |-
max by (cluster, namespace, workload, pod) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel


@@ -1,24 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kube-prometheus-general.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0


@@ -1,32 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kube-prometheus-node-recording.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio


@@ -1,258 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-apps
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
for: 15m
labels:
severity: warning
- alert: KubePodNotReady
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
sum by (namespace, pod, cluster) (
max by (namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
labels:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container }} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0
for: 15m
labels:
severity: warning
- alert: KubeJobNotCompleted
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
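The workload rules above mostly share one shape: a state comparison gated by a "no recent progress" check. A rough Python sketch of the KubeDeploymentReplicasMismatch condition (illustrative only; the function and sample values are invented, and changes(...[10m]) == 0 is approximated as "the value never changed in the window"):

def replicas_mismatch(spec_replicas: int, available_replicas: int,
                      updated_replica_samples: list) -> bool:
    short_on_replicas = spec_replicas > available_replicas
    no_rollout_progress = len(set(updated_replica_samples)) <= 1
    return short_on_replicas and no_rollout_progress

print(replicas_mismatch(3, 2, [2, 2, 2, 2]))  # True -> alert pending, fires after "for: 15m"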

View File

@@ -1,122 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-resources
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
> ( 25 / 100 )
for: 15m
labels:
severity: info
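The quota and throttling rules above are plain ratios. As one worked example, the CPUThrottlingHigh expression compares throttled CFS periods to total CFS periods over the window (a minimal sketch, sample values invented):

def cpu_throttling_ratio(throttled_periods: float, total_periods: float) -> float:
    return throttled_periods / total_periods if total_periods else 0.0

ratio = cpu_throttling_ratio(400, 1500)                        # 400 of 1500 periods throttled over 5m
print(f"{ratio:.1%} throttled -> alert: {ratio > 25 / 100}")   # 26.7% throttled -> alert: True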

View File

@@ -1,113 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-storage
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} only has {{ $value | humanizePercentage }} free inodes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{ $labels.persistentvolume }} on Cluster {{ $labels.cluster }} has status {{ $labels.phase }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical
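Both "filling up" variants above pair a low-free-space check with predict_linear(), which fits a line to the recent samples and extrapolates four days ahead. Roughly, in Python (a sketch; the sample data is invented):

def predict_linear(samples, horizon_s):
    # samples: list of (unix_ts, value); simple least-squares fit, then extrapolate
    n = len(samples)
    mean_t = sum(t for t, _ in samples) / n
    mean_v = sum(v for _, v in samples) / n
    slope = (sum((t - mean_t) * (v - mean_v) for t, v in samples)
             / sum((t - mean_t) ** 2 for t, _ in samples))
    intercept = mean_v - slope * mean_t
    now = samples[-1][0]
    return slope * (now + horizon_s) + intercept

free_bytes = [(h * 3600.0, 50e9 - h * 1e9) for h in range(7)]   # losing ~1 GB/hour over 6h
print(predict_linear(free_bytes, 4 * 24 * 3600) < 0)            # True -> would trip the 4-day rule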

View File

@@ -1,64 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-system-apiserver
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-system-apiserver
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
for: 5m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
for: 5m
labels:
severity: critical
- alert: KubeAggregatedAPIErrors
annotations:
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
expr: sum by (name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
- alert: KubeAPITerminatedRequests
annotations:
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
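The KubeAPITerminatedRequests expression above measures terminations against all incoming requests, served plus terminated. A minimal sketch with invented rates:

def terminated_share(terminations_per_s: float, served_per_s: float) -> float:
    return terminations_per_s / (served_per_s + terminations_per_s)

print(terminated_share(30, 100) > 0.20)  # ~23% of requests terminated -> True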

View File

@@ -1,140 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-system-kubelet
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-system-kubelet
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
summary: Node is not ready.
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
labels:
severity: warning
- alert: KubeNodeUnreachable
annotations:
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |-
count by (cluster, node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by (cluster, node) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
for: 15m
labels:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
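KubeletTooManyPods above divides the running pods counted per node by the node's reported pod capacity. A minimal sketch (the 110-pod figure is only the common kubelet default, not something set in this chart):

def pod_capacity_used(running_pods: int, pod_capacity: int) -> float:
    return running_pods / pod_capacity

usage = pod_capacity_used(105, 110)
print(f"{usage:.1%} of pod capacity -> alert: {usage > 0.95}")  # 95.5% -> True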

View File

@@ -1,42 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-kubernetes-system
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: kubernetes-system
rules:
- alert: KubeVersionMismatch
annotations:
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |-
(sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
/
sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
> 0.01
for: 15m
labels:
severity: warning
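KubeVersionMismatch above uses label_replace() to truncate git_version to the minor release before counting distinct values, so patch-level skew alone never alerts. The same regex applied in Python (version strings invented):

import re

pattern = re.compile(r"(v[0-9]*.[0-9]*).*")          # same (unescaped-dot) pattern as the rule
versions = ["v1.28.5", "v1.28.6", "v1.27.9"]
minors = {pattern.sub(r"\1", v) for v in versions}
print(sorted(minors), "-> alert:", len(minors) > 1)  # ['v1.27', 'v1.28'] -> alert: True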

View File

@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}
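The for-expression above turns every *.yaml in the module directory except index.yaml into a kustomize resource. The equivalent filter, sketched in Python against a hypothetical directory:

from pathlib import Path

def overlay_resources(module_dir: str) -> list:
    return sorted(p.name for p in Path(module_dir).glob("*.yaml") if p.name != "index.yaml")

# e.g. overlay_resources("dashboards-cluster") -> every dashboard ConfigMap YAML, minus index.yaml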

View File

@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: dashboards-cluster
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}

View File

@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: dashboards-minimal
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}

View File

@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: dashboards-namespace
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,16 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}

View File

@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: dashboards-workload
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,298 +0,0 @@
# Source: grafana/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
spec:
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
strategy:
type: RollingUpdate
template:
metadata:
labels:
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
annotations:
checksum/config: 0e9cbd0ea8e24e32f7dfca5bab17a2ba05652642f0a09a4882833ae88e4cc4a3
checksum/sc-dashboard-provider-config: 593c0a8778b83f11fe80ccb21dfb20bc46705e2be3178df1dc4c89d164c8cd9c
kubectl.kubernetes.io/default-container: grafana
spec:
serviceAccountName: grafana
automountServiceAccountToken: true
securityContext:
fsGroup: 472
runAsGroup: 472
runAsNonRoot: true
runAsUser: 472
initContainers:
- name: init-chown-data
image: "docker.io/library/busybox:1.31.1"
imagePullPolicy: IfNotPresent
securityContext:
capabilities:
add:
- CHOWN
runAsNonRoot: false
runAsUser: 0
seccompProfile:
type: RuntimeDefault
command:
- chown
- -R
- 472:472
- /var/lib/grafana
volumeMounts:
- name: storage
mountPath: "/var/lib/grafana"
enableServiceLinks: true
containers:
- name: grafana-sc-dashboard
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
imagePullPolicy: IfNotPresent
env:
- name: METHOD
value: WATCH
- name: LABEL
value: "grafana_dashboard"
- name: FOLDER
value: "/tmp/dashboards"
- name: RESOURCE
value: "both"
- name: REQ_USERNAME
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: username
- name: REQ_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: password
- name: REQ_URL
value: http://localhost:3000/api/admin/provisioning/dashboards/reload
- name: REQ_METHOD
value: POST
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
volumeMounts:
- name: sc-dashboard-volume
mountPath: "/tmp/dashboards"
- name: grafana-sc-datasources
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
imagePullPolicy: IfNotPresent
env:
- name: METHOD
value: WATCH
- name: LABEL
value: "grafana_datasource"
- name: FOLDER
value: "/etc/grafana/provisioning/datasources"
- name: RESOURCE
value: "both"
- name: REQ_USERNAME
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: username
- name: REQ_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: password
- name: REQ_URL
value: http://localhost:3000/api/admin/provisioning/datasources/reload
- name: REQ_METHOD
value: POST
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
volumeMounts:
- name: sc-datasources-volume
mountPath: "/etc/grafana/provisioning/datasources"
- name: grafana-sc-notifiers
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
imagePullPolicy: IfNotPresent
env:
- name: METHOD
value: WATCH
- name: LABEL
value: "grafana_notifier"
- name: FOLDER
value: "/etc/grafana/provisioning/notifiers"
- name: RESOURCE
value: "both"
- name: REQ_USERNAME
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: username
- name: REQ_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: password
- name: REQ_URL
value: http://localhost:3000/api/admin/provisioning/notifications/reload
- name: REQ_METHOD
value: POST
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
volumeMounts:
- name: sc-notifiers-volume
mountPath: "/etc/grafana/provisioning/notifiers"
- name: grafana-sc-plugins
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
imagePullPolicy: IfNotPresent
env:
- name: METHOD
value: WATCH
- name: LABEL
value: "grafana_plugin"
- name: FOLDER
value: "/etc/grafana/provisioning/plugins"
- name: RESOURCE
value: "both"
- name: REQ_USERNAME
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: username
- name: REQ_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: password
- name: REQ_URL
value: http://localhost:3000/api/admin/provisioning/plugins/reload
- name: REQ_METHOD
value: POST
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
volumeMounts:
- name: sc-plugins-volume
mountPath: "/etc/grafana/provisioning/plugins"
- name: grafana
image: "docker.io/grafana/grafana:10.2.3"
imagePullPolicy: IfNotPresent
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
seccompProfile:
type: RuntimeDefault
volumeMounts:
- name: config
mountPath: "/etc/grafana/grafana.ini"
subPath: grafana.ini
- name: storage
mountPath: "/var/lib/grafana"
- name: sc-dashboard-volume
mountPath: "/tmp/dashboards"
- name: sc-dashboard-provider
mountPath: "/etc/grafana/provisioning/dashboards/sc-dashboardproviders.yaml"
subPath: provider.yaml
- name: sc-datasources-volume
mountPath: "/etc/grafana/provisioning/datasources"
- name: sc-plugins-volume
mountPath: "/etc/grafana/provisioning/plugins"
- name: sc-notifiers-volume
mountPath: "/etc/grafana/provisioning/notifiers"
ports:
- name: grafana
containerPort: 3000
protocol: TCP
- name: gossip-tcp
containerPort: 9094
protocol: TCP
- name: gossip-udp
containerPort: 9094
protocol: UDP
env:
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: GF_SECURITY_ADMIN_USER
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: username
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-admin-user
key: password
- name: GF_PATHS_DATA
value: /var/lib/grafana/
- name: GF_PATHS_LOGS
value: /var/log/grafana
- name: GF_PATHS_PLUGINS
value: /var/lib/grafana/plugins
- name: GF_PATHS_PROVISIONING
value: /etc/grafana/provisioning
livenessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 60
timeoutSeconds: 30
readinessProbe:
httpGet:
path: /api/health
port: 3000
volumes:
- name: config
configMap:
name: grafana
- name: storage
persistentVolumeClaim:
claimName: grafana
- name: sc-dashboard-volume
emptyDir:
{}
- name: sc-dashboard-provider
configMap:
name: grafana-config-dashboards
- name: sc-datasources-volume
emptyDir:
{}
- name: sc-plugins-volume
emptyDir:
{}
- name: sc-notifiers-volume
emptyDir:
{}
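Each k8s-sidecar container above watches for objects carrying its LABEL, writes them into FOLDER, then calls REQ_URL with REQ_METHOD and the grafana-admin-user credentials. A rough sketch of that reload call (illustrative only; the real behaviour lives in quay.io/kiwigrid/k8s-sidecar):

import requests

def reload_provisioning(kind: str, username: str, password: str) -> int:
    # kind is "dashboards", "datasources", "notifications" or "plugins", per the REQ_URL values above
    url = f"http://localhost:3000/api/admin/provisioning/{kind}/reload"
    resp = requests.post(url, auth=(username, password), timeout=10)
    return resp.status_code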

View File

@@ -1,43 +0,0 @@
resource "kubernetes_config_map_v1" "config" {
metadata {
name = "grafana"
namespace = var.namespace
labels = local.common-labels
}
data = {
"grafana.ini" = <<-EOF
[analytics]
check_for_updates = true
[grafana_net]
url = https://grafana.net
[log]
mode = console
[paths]
data = /var/lib/grafana/
logs = /var/log/grafana
plugins = /var/lib/grafana/plugins
provisioning = /etc/grafana/provisioning
[server]
domain = ''
root_url = 'https://${local.dns_name}/'
[users]
auto_assign_org = true
auto_assign_org_id = 1
[auth]
oauth_allow_insecure_email_lookup = true
signout_redirect_url = '${module.oauth2.sso_signout_url}'
oauth_auto_login = true
[auth.generic_oauth]
enabled = true
name = vynil
scopes = openid profile email
${var.issuer=="letsencrypt-prod"?";":""}tls_client_ca = /etc/local-certs/ca.crt
client_id = '${module.oauth2.client_id}'
client_secret = '${module.oauth2.client_secret}'
auth_url = '${module.oauth2.sso_authorize_url}'
api_url = '${module.oauth2.sso_userinfo_url}'
token_url = '${module.oauth2.sso_token_url}'
role_attribute_path = contains(groups, '${module.application.main_group}-admin') && 'Admin' || contains(groups, '${module.application.main_group}') && 'Editor' || 'Viewer'
EOF
}
}
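The role_attribute_path above is a JMESPath expression that Grafana evaluates against the OAuth userinfo; it reduces to a simple group lookup (main_group comes from the application module, the group name below is hypothetical):

def grafana_role(groups: list, main_group: str) -> str:
    if f"{main_group}-admin" in groups:
        return "Admin"
    if main_group in groups:
        return "Editor"
    return "Viewer"

print(grafana_role(["monitor"], "monitor"))  # Editor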

View File

@@ -1,112 +0,0 @@
locals {
authentik_url = "http://authentik.${var.domain}-auth.svc"
authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
pvc_spec = merge({
"accessModes" = [var.storage.volume.accessMode]
"volumeMode" = var.storage.volume.type
"resources" = {
"requests" = {
"storage" = "${var.storage.volume.size}"
}
}
}, var.storage.volume.class != "" ?{
"storageClassName" = var.storage.volume.class
}:{})
}
data "kubernetes_secret_v1" "authentik" {
metadata {
name = "authentik"
namespace = "${var.domain}-auth"
}
}
data "kubernetes_ingress_v1" "authentik" {
metadata {
name = "authentik"
namespace = "${var.domain}-auth"
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
images {
name = "docker.io/grafana/grafana"
new_name = "${var.images.grafana.registry}/${var.images.grafana.repository}"
new_tag = "${var.images.grafana.tag}"
}
images {
name = "docker.io/library/busybox"
new_name = "${var.images.busybox.registry}/${var.images.busybox.repository}"
new_tag = "${var.images.busybox.tag}"
}
images {
name = "quay.io/kiwigrid/k8s-sidecar"
new_name = "${var.images.sidecar.registry}/${var.images.sidecar.repository}"
new_tag = "${var.images.sidecar.tag}"
}
patches {
target {
kind = "PersistentVolumeClaim"
name = "grafana"
}
patch = <<-EOF
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: grafana
annotations:
k8up.io/backup: "true"
spec: ${jsonencode(local.pvc_spec)}
EOF
}
patches {
target {
kind = "ServiceMonitor"
name = "grafana"
}
patch = <<-EOF
- op: replace
path: /spec/namespaceSelector/matchNames/0
value: "${var.namespace}"
EOF
}
patches {
target {
kind = "Deployment"
name = "grafana"
}
patch = <<-EOF
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
annotations:
configmap.reloader.stakater.com/reload: "grafana"
spec:
template:
spec:
containers:
- name: grafana
volumeMounts:
- name: local-certs
mountPath: "/etc/local-certs"
volumes:
- name: local-certs
secret:
secretName: "${var.instance}-cert"
defaultMode: 0444
EOF
}
}
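The pvc_spec local above uses merge() so that storageClassName only appears when a class is configured, leaving the cluster default otherwise. The same shape in Python (argument values are examples):

def pvc_spec(access_mode: str, volume_mode: str, size: str, storage_class: str = "") -> dict:
    spec = {
        "accessModes": [access_mode],
        "volumeMode": volume_mode,
        "resources": {"requests": {"storage": size}},
    }
    if storage_class:                       # mirrors: var.storage.volume.class != "" ? {...} : {}
        spec["storageClassName"] = storage_class
    return spec

print("storageClassName" in pvc_spec("ReadWriteOnce", "Filesystem", "10Gi"))  # False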

View File

@@ -1,28 +0,0 @@
# Source: grafana/templates/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
spec:
endpoints:
- port: service
interval: 30s
scrapeTimeout: 30s
honorLabels: true
path: /metrics
scheme: http
jobLabel: "grafana"
selector:
matchLabels:
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
namespaceSelector:
matchNames:
- vynil-monitor

View File

@@ -1,72 +0,0 @@
locals {
dns_name = "${var.sub_domain}.${var.domain_name}"
dns_names = [local.dns_name]
app_name = var.component == var.instance ? var.instance : format("%s-%s", var.component, var.instance)
icon = "public/img/grafana_icon.svg"
request_headers = {
"Content-Type" = "application/json"
Authorization = "Bearer ${data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]}"
}
service = {
"name" = "grafana"
"port" = {
"number" = 80
}
}
}
module "ingress" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//ingress"
component = ""
instance = var.instance
namespace = var.namespace
issuer = var.issuer
ingress_class = var.ingress_class
labels = local.common-labels
dns_names = local.dns_names
middlewares = []
services = [local.service]
providers = {
kubectl = kubectl
}
}
module "application" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//application"
component = var.component
instance = var.instance
app_group = var.app_group
dns_name = local.dns_name
icon = local.icon
sub_groups = ["admin"]
protocol_provider = module.oauth2.provider-id
providers = {
authentik = authentik
}
}
module "oauth2" {
source = "git::https://git.solidite.fr/vynil/kydah-modules.git//oauth2"
component = var.component
instance = var.instance
namespace = var.namespace
domain = var.domain
labels = local.common-labels
dns_name = local.dns_name
redirect_path = "login/generic_oauth"
providers = {
kubernetes = kubernetes
kubectl = kubectl
authentik = authentik
}
}
provider "restapi" {
uri = "http://authentik.${var.domain}-auth.svc/api/v3/"
headers = local.request_headers
create_method = "PATCH"
update_method = "PATCH"
destroy_method = "PATCH"
write_returns_object = true
id_attribute = "name"
}

View File

@@ -1,20 +0,0 @@
# Source: grafana/templates/rolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: grafana
subjects:
- kind: ServiceAccount
name: grafana
namespace: vynil-monitor

View File

@@ -1,16 +0,0 @@
# Source: grafana/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
rules:
- apiGroups: [""] # "" indicates the core API group
resources: ["configmaps", "secrets"]
verbs: ["get", "watch", "list"]

View File

@@ -1,19 +0,0 @@
resource "kubectl_manifest" "grafana_secret" {
ignore_fields = ["metadata.annotations"]
yaml_body = <<-EOF
apiVersion: "secretgenerator.mittwald.de/v1alpha1"
kind: "StringSecret"
metadata:
name: "grafana-admin-user"
namespace: "${var.namespace}"
labels: ${jsonencode(local.common-labels)}
spec:
forceRegenerate: false
data:
username: "${var.admin_name}"
fields:
- fieldName: "password"
length: "32"
EOF
}

View File

@@ -1,26 +0,0 @@
# Source: grafana/templates/configmap-dashboard-provider.yaml
apiVersion: v1
kind: ConfigMap
metadata:
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
name: grafana-config-dashboards
namespace: vynil-monitor
data:
provider.yaml: |-
apiVersion: 1
providers:
- name: 'sidecarProvider'
orgId: 1
folder: ''
type: file
disableDeletion: false
allowUiUpdates: false
updateIntervalSeconds: 30
options:
foldersFromFilesStructure: false
path: /tmp/dashboards

View File

@@ -1,20 +0,0 @@
# Source: grafana/templates/pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
finalizers:
- kubernetes.io/pvc-protection
spec:
accessModes:
- "ReadWriteOnce"
resources:
requests:
storage: "10Gi"

View File

@@ -1,13 +0,0 @@
---
# Source: grafana/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
name: grafana
namespace: vynil-monitor

View File

@@ -1,22 +0,0 @@
# Source: grafana/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: vynil-monitor
labels:
helm.sh/chart: grafana-7.2.4
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana
app.kubernetes.io/version: "10.2.3"
app.kubernetes.io/managed-by: Helm
spec:
type: ClusterIP
ports:
- name: service
port: 80
protocol: TCP
targetPort: 3000
selector:
app.kubernetes.io/name: grafana
app.kubernetes.io/instance: grafana

View File

@@ -1,82 +0,0 @@
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: vynil-monitor
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
spec:
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
replicas: 1
strategy:
type: RollingUpdate
revisionHistoryLimit: 10
template:
metadata:
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
spec:
hostNetwork: false
serviceAccountName: kube-state-metrics
securityContext:
fsGroup: 65534
runAsGroup: 65534
runAsNonRoot: true
runAsUser: 65534
seccompProfile:
type: RuntimeDefault
containers:
- name: kube-state-metrics
args:
- --port=8080
- --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments
imagePullPolicy: IfNotPresent
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.1
ports:
- containerPort: 8080
name: "http"
livenessProbe:
failureThreshold: 3
httpGet:
httpHeaders:
path: /healthz
port: 8080
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
readinessProbe:
failureThreshold: 3
httpGet:
httpHeaders:
path: /
port: 8080
scheme: HTTP
initialDelaySeconds: 5
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL

View File

@@ -1,49 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
rb-patch = <<-EOF
- op: replace
path: /subjects/0/namespace
value: "${var.namespace}"
EOF
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
images {
name = "registry.k8s.io/kube-state-metrics/kube-state-metrics"
new_name = "${var.images.kube-state-metrics.registry}/${var.images.kube-state-metrics.repository}"
new_tag = "${var.images.kube-state-metrics.tag}"
}
patches {
target {
kind = "ServiceMonitor"
name = "kube-state-metrics"
}
patch = <<-EOF
- op: replace
path: /spec/selector/matchLabels/app.kubernetes.io~1instance
value: "${var.instance}"
EOF
}
}
data "kustomization_overlay" "data_no_ns" {
common_labels = local.common-labels
resources = [for file in fileset(path.module, "*.yaml"): file if length(regexall("ClusterRole",file))>0]
patches {
target {
kind = "ClusterRoleBinding"
name = "kube-state-metrics"
}
patch = local.rb-patch
}
}
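The rb-patch local above is a JSON patch that repoints the ClusterRoleBinding subject at the install namespace. What it does, sketched in Python (the target namespace below is hypothetical):

def apply_rb_patch(binding: dict, namespace: str) -> dict:
    binding["subjects"][0]["namespace"] = namespace   # op: replace, path: /subjects/0/namespace
    return binding

crb = {"subjects": [{"kind": "ServiceAccount", "name": "kube-state-metrics", "namespace": "vynil-monitor"}]}
print(apply_rb_patch(crb, "my-monitor")["subjects"][0]["namespace"])  # my-monitor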

View File

@@ -1,57 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: kube-state-metrics
description: null
options:
images:
default:
kube-state-metrics:
pullPolicy: IfNotPresent
registry: registry.k8s.io
repository: kube-state-metrics/kube-state-metrics
tag: v2.10.1
examples:
- kube-state-metrics:
pullPolicy: IfNotPresent
registry: registry.k8s.io
repository: kube-state-metrics/kube-state-metrics
tag: v2.10.1
properties:
kube-state-metrics:
default:
pullPolicy: IfNotPresent
registry: registry.k8s.io
repository: kube-state-metrics/kube-state-metrics
tag: v2.10.1
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: registry.k8s.io
type: string
repository:
default: kube-state-metrics/kube-state-metrics
type: string
tag:
default: v2.10.1
type: string
type: object
type: object
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,68 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: kube-state-metrics-kube-pr-kube-state-metrics
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "kube-state-metrics"
heritage: "Helm"
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
-
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
labels:
severity: critical
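
Note on the KubeStateMetricsShardsMissing expression above: it treats shard presence as a bitmask. With N total shards the expected value is 2^N - 1 and every reporting shard contributes 2^shard_ordinal, so with three shards and ordinals 0, 1 and 2 all present the sum is 1 + 2 + 4 = 7 = 2^3 - 1 and the difference is zero; if the shard with ordinal 1 stops reporting, the sum drops to 5 and the non-zero difference (2) makes the alert fire.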


@@ -1,24 +0,0 @@
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-state-metrics
namespace: vynil-monitor
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
spec:
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
endpoints:
- port: http
honorLabels: true


@@ -1,22 +0,0 @@
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: vynil-monitor


@@ -1,155 +0,0 @@
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
name: kube-state-metrics
rules:
- apiGroups: ["certificates.k8s.io"]
resources:
- certificatesigningrequests
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
resources:
- daemonsets
verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
resources:
- deployments
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
- apiGroups: ["extensions", "networking.k8s.io"]
resources:
- ingresses
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- jobs
verbs: ["list", "watch"]
- apiGroups: ["coordination.k8s.io"]
resources:
- leases
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- limitranges
verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- mutatingwebhookconfigurations
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- namespaces
verbs: ["list", "watch"]
- apiGroups: ["networking.k8s.io"]
resources:
- networkpolicies
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- nodes
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- persistentvolumeclaims
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- persistentvolumes
verbs: ["list", "watch"]
- apiGroups: ["policy"]
resources:
- poddisruptionbudgets
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- pods
verbs: ["list", "watch"]
- apiGroups: ["extensions", "apps"]
resources:
- replicasets
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- replicationcontrollers
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- resourcequotas
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- secrets
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- services
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources:
- storageclasses
verbs: ["list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
resources:
- validatingwebhookconfigurations
verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
resources:
- volumeattachments
verbs: ["list", "watch"]


@@ -1,45 +0,0 @@
# first loop through resources in ids_prio[0]
resource "kustomization_resource" "pre_no_ns" {
for_each = data.kustomization_overlay.data_no_ns.ids_prio[0]
manifest = (
contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
: data.kustomization_overlay.data_no_ns.manifests[each.value]
)
}
# then loop through resources in ids_prio[1]
# and set an explicit depends_on on kustomization_resource.pre_no_ns
# wait up to 5 minutes for any deployment or daemonset to become ready
resource "kustomization_resource" "main_no_ns" {
for_each = data.kustomization_overlay.data_no_ns.ids_prio[1]
manifest = (
contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
: data.kustomization_overlay.data_no_ns.manifests[each.value]
)
wait = true
timeouts {
create = "5m"
update = "5m"
}
depends_on = [kustomization_resource.pre_no_ns]
}
# finally, loop through resources in ids_prio[2]
# and set an explicit depends_on on kustomization_resource.main_no_ns
resource "kustomization_resource" "post_no_ns" {
for_each = data.kustomization_overlay.data_no_ns.ids_prio[2]
manifest = (
contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
: data.kustomization_overlay.data_no_ns.manifests[each.value]
)
depends_on = [kustomization_resource.main_no_ns]
}
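
A side note on the sensitive() decision used in the three loops above: judging from the check against "_/Secret", the resource ids appear to follow the shape <group>/<Kind>/<namespace>/<name>, with "_" standing in for the core API group, and the regex keeps only the <group>/<Kind> prefix. The locals block below is a minimal standalone sketch of that logic, not part of the package; both ids are hypothetical.

locals {
  # Hypothetical ids, only to show how the group_kind capture behaves.
  example_ids = [
    "_/Secret/vynil-monitor/loki-credentials",
    "apps/StatefulSet/vynil-monitor/loki",
  ]
  # Maps each id to true when the manifest would be wrapped in sensitive():
  # { "_/Secret/..." = true, "apps/StatefulSet/..." = false }
  example_is_sensitive = {
    for id in local.example_ids :
    id => contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", id)["group_kind"])
  }
}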


@@ -1,16 +0,0 @@
---
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
name: kube-state-metrics
namespace: vynil-monitor


@@ -1,28 +0,0 @@
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: vynil-monitor
labels:
helm.sh/chart: kube-state-metrics-5.16.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: kube-state-metrics
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics
app.kubernetes.io/version: "2.10.1"
release: kube-state-metrics
annotations:
prometheus.io/scrape: 'true'
spec:
type: "ClusterIP"
ports:
- name: "http"
protocol: TCP
port: 8080
targetPort: 8080
selector:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/instance: kube-state-metrics


@@ -1,82 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: loki-dashboard
description: null
options:
ingress_class:
default: traefik
examples:
- traefik
type: string
issuer:
default: letsencrypt-prod
examples:
- letsencrypt-prod
type: string
domain:
default: your-company
examples:
- your-company
type: string
images:
default:
operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
examples:
- operator:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
operator:
default:
pullPolicy: IfNotPresent
registry: docker.io
repository: to-be/defined
tag: v1.0.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: docker.io
type: string
repository:
default: to-be/defined
type: string
tag:
default: v1.0.0
type: string
type: object
type: object
sub_domain:
default: to-be-set
examples:
- to-be-set
type: string
domain_name:
default: your_company.com
examples:
- your_company.com
type: string
dependencies: []
providers:
kubernetes: true
authentik: true
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -1,124 +0,0 @@
# Source: loki/templates/single-binary/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: loki
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: single-binary
app.kubernetes.io/part-of: memberlist
spec:
replicas: 1
podManagementPolicy: Parallel
updateStrategy:
rollingUpdate:
partition: 0
serviceName: loki-headless
revisionHistoryLimit: 10
persistentVolumeClaimRetentionPolicy:
whenDeleted: Delete
whenScaled: Delete
selector:
matchLabels:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/component: single-binary
template:
metadata:
annotations:
checksum/config: f0b5fe7288abac6838f61aacfebeba1a01d5cf3d391971062c58797d2f0ea40f
labels:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/component: single-binary
app.kubernetes.io/part-of: memberlist
spec:
serviceAccountName: loki
automountServiceAccountToken: true
enableServiceLinks: true
securityContext:
fsGroup: 10001
runAsGroup: 10001
runAsNonRoot: true
runAsUser: 10001
terminationGracePeriodSeconds: 30
containers:
- name: loki
image: docker.io/grafana/loki:2.9.3
imagePullPolicy: IfNotPresent
args:
- -config.file=/etc/loki/config/config.yaml
- -target=all
ports:
- name: http-metrics
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9095
protocol: TCP
- name: http-memberlist
containerPort: 7946
protocol: TCP
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
readinessProbe:
httpGet:
path: /ready
port: http-metrics
initialDelaySeconds: 30
timeoutSeconds: 1
volumeMounts:
- name: tmp
mountPath: /tmp
- name: config
mountPath: /etc/loki/config
- name: runtime-config
mountPath: /etc/loki/runtime-config
- name: storage
mountPath: /var/loki
resources:
{}
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/component: single-binary
topologyKey: kubernetes.io/hostname
volumes:
- name: tmp
emptyDir: {}
- name: config
configMap:
name: loki
items:
- key: "config.yaml"
path: "config.yaml"
- name: runtime-config
configMap:
name: loki-runtime
volumeClaimTemplates:
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: storage
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: "17Gi"


@@ -1,91 +0,0 @@
resource "kubectl_manifest" "datasource" {
yaml_body = <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: loki-datasource
namespace: "${var.namespace}"
labels: ${jsonencode(merge(local.common-labels, {"grafana_datasource" = "1"}))}
data:
loki-datasource.yaml: |-
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: "http://loki.${var.namespace}.svc:3100"
version: 1
isDefault: false
jsonData:
{}
EOF
}
resource "kubectl_manifest" "config" {
yaml_body = <<-EOF
apiVersion: v1
kind: ConfigMap
metadata:
name: loki
namespace: "${var.namespace}"
labels: ${jsonencode(local.common-labels)}
data:
config.yaml: |
auth_enabled: false
common:
compactor_address: 'loki'
path_prefix: /var/loki
replication_factor: 1
storage:
filesystem:
chunks_directory: /var/loki/chunks
rules_directory: /var/loki/rules
frontend:
scheduler_address: ""
frontend_worker:
scheduler_address: ""
index_gateway:
mode: ring
limits_config:
max_cache_freshness_per_query: 10m
reject_old_samples: true
reject_old_samples_max_age: 168h
split_queries_by_interval: 15m
memberlist:
join_members:
- loki-memberlist
query_range:
align_queries_with_step: true
ruler:
storage:
type: local
local:
directory: /tmp/rules
rule_path: /tmp/scratch
alertmanager_url: http://${var.alertmanager}:9093
ring:
kvstore:
store: inmemory
enable_api: true
runtime_config:
file: /etc/loki/runtime-config/runtime-config.yaml
schema_config:
configs:
- from: "2022-01-11"
index:
period: 24h
prefix: loki_index_
object_store: filesystem
schema: v12
store: boltdb-shipper
server:
grpc_listen_port: 9095
http_listen_port: 3100
storage_config:
hedging:
at: 250ms
max_per_second: 20
up_to: 3
tracing:
enabled: false
EOF
}


@@ -1,77 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
pvc_spec = merge({
"accessModes" = [var.storage.volume.accessMode]
"volumeMode" = var.storage.volume.type
"resources" = {
"requests" = {
"storage" = "${var.storage.volume.size}"
}
}
  }, var.storage.volume.class != "" ? {
    "storageClassName" = var.storage.volume.class
  } : {})
rb-patch = <<-EOF
- op: replace
path: /subjects/0/namespace
value: "${var.namespace}"
EOF
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
images {
name = "docker.io/grafana/loki"
new_name = "${var.images.loki.registry}/${var.images.loki.repository}"
new_tag = "${var.images.loki.tag}"
}
patches {
target {
kind = "ServiceMonitor"
name = "loki"
}
patch = <<-EOF
- op: replace
path: /spec/endpoints/0/relabelings/0/replacement
value: "${var.namespace}/$1"
EOF
}
patches {
target {
kind = "StatefulSet"
name = "loki"
}
patch = <<-EOF
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: loki
spec:
replicas: 1
template:
spec:
containers:
- name: loki
imagePullPolicy: ${var.images.loki.pullPolicy}
volumeClaimTemplates:
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: storage
annotations:
k8up.io/backup: "true"
spec: ${jsonencode(local.pvc_spec)}
EOF
}
}
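
For reference, a sketch of what local.pvc_spec evaluates to when the default storage option declared in the component below is used (accessMode ReadWriteOnce, type Filesystem, size 10Gi, empty class). Because class is empty, storageClassName is left out and the cluster's default StorageClass, if any, applies. This block is illustrative only and not part of the package.

locals {
  # Rendered form of local.pvc_spec with the default storage values;
  # jsonencode() turns this into the PVC spec patched onto the
  # "storage" volumeClaimTemplate of the loki StatefulSet.
  pvc_spec_rendered_example = {
    accessModes = ["ReadWriteOnce"]
    volumeMode  = "Filesystem"
    resources = {
      requests = {
        storage = "10Gi"
      }
    }
    # storageClassName is only merged in when var.storage.volume.class != ""
  }
}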


@@ -1,105 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: loki
description: null
options:
alertmanager:
default: alertmanager-alertmanager
examples:
- alertmanager-alertmanager
type: string
images:
default:
loki:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/loki
tag: 2.9.3
examples:
- loki:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/loki
tag: 2.9.3
properties:
loki:
default:
pullPolicy: IfNotPresent
registry: docker.io
repository: grafana/loki
tag: 2.9.3
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: docker.io
type: string
repository:
default: grafana/loki
type: string
tag:
default: 2.9.3
type: string
type: object
type: object
storage:
default:
volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
description: Configure this app storage
examples:
- volume:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
volume:
default:
accessMode: ReadWriteOnce
class: ''
size: 10Gi
type: Filesystem
properties:
accessMode:
default: ReadWriteOnce
enum:
- ReadWriteOnce
- ReadOnlyMany
- ReadWriteMany
type: string
class:
default: ''
type: string
size:
default: 10Gi
type: string
type:
default: Filesystem
enum:
- Filesystem
- Block
type: string
type: object
type: object
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null


@@ -1,65 +0,0 @@
# Source: loki/templates/monitoring/loki-alerts.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
name: loki-loki-alerts
namespace: vynil-monitor
spec:
groups:
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: 15m
labels:
severity: critical
- alert: LokiTooManyCompactorsRunning
annotations:
message: |
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: 5m
labels:
severity: warning
- name: loki_canaries_alerts
rules:
- alert: LokiCanaryLatency
annotations:
message: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
for: 15m
labels:
severity: warning


@@ -1,98 +0,0 @@
# Source: loki/templates/monitoring/loki-rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
name: loki-loki-rules
namespace: vynil-monitor
spec:
groups:
- name: loki_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
labels:
cluster: loki
record: job:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
labels:
cluster: loki
record: job:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job)
labels:
cluster: loki
record: job:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
labels:
cluster: loki
record: job:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
labels:
cluster: loki
record: job:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
labels:
cluster: loki
record: job:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
labels:
cluster: loki
record: job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
labels:
cluster: loki
record: job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
by (job, route)
labels:
cluster: loki
record: job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
labels:
cluster: loki
record: job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
labels:
cluster: loki
record: job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
labels:
cluster: loki
record: job_route:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
route)
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
labels:
cluster: loki
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate


@@ -1,35 +0,0 @@
# Source: loki/templates/monitoring/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: loki
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
spec:
selector:
matchLabels:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
matchExpressions:
- key: prometheus.io/service-monitor
operator: NotIn
values:
- "false"
endpoints:
- port: http-metrics
path: /metrics
interval: 15s
relabelings:
- sourceLabels: [job]
action: replace
replacement: "vynil-monitor/$1"
targetLabel: job
- action: replace
replacement: "loki"
targetLabel: cluster
scheme: http


@@ -1,15 +0,0 @@
# Source: loki/templates/runtime-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: loki-runtime
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
data:
runtime-config.yaml: |
{}


@@ -1,14 +0,0 @@
---
# Source: loki/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: loki
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
automountServiceAccountToken: true


@@ -1,25 +0,0 @@
# Source: loki/templates/single-binary/service-headless.yaml
apiVersion: v1
kind: Service
metadata:
name: loki-headless
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
variant: headless
prometheus.io/service-monitor: "false"
annotations:
spec:
clusterIP: None
ports:
- name: http-metrics
port: 3100
targetPort: http-metrics
protocol: TCP
selector:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki


@@ -1,24 +0,0 @@
# Source: loki/templates/service-memberlist.yaml
apiVersion: v1
kind: Service
metadata:
name: loki-memberlist
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
spec:
type: ClusterIP
clusterIP: None
ports:
- name: tcp
port: 7946
targetPort: http-memberlist
protocol: TCP
selector:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/part-of: memberlist


@@ -1,28 +0,0 @@
# Source: loki/templates/single-binary/service.yaml
apiVersion: v1
kind: Service
metadata:
name: loki
namespace: vynil-monitor
labels:
helm.sh/chart: loki-5.41.8
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/version: "2.9.3"
app.kubernetes.io/managed-by: Helm
annotations:
spec:
type: ClusterIP
ports:
- name: http-metrics
port: 3100
targetPort: http-metrics
protocol: TCP
- name: grpc
port: 9095
targetPort: grpc
protocol: TCP
selector:
app.kubernetes.io/name: loki
app.kubernetes.io/instance: loki
app.kubernetes.io/component: single-binary


@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: monitor-control-plan
description: null
options:
useless:
default: true
examples:
- true
type: boolean
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null


@@ -1,167 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: prometheus-community-kube-etcd
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: prometheus-community
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "prometheus-community"
heritage: "Helm"
spec:
groups:
- name: etcd
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
summary: etcd cluster members are down.
expr: |-
max without (endpoint) (
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
summary: etcd grpc requests are slow
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary: etcd database size in use is less than 50% of the actual allocated storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning

Some files were not shown because too many files have changed in this diff.