fix
apps/infisical/configs.tf (new file, 21 lines)
@@ -0,0 +1,21 @@
resource "kubectl_manifest" "config" {
  yaml_body = <<-EOF
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: "${var.component}-${var.instance}"
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.common-labels)}
    data:
      INVITE_ONLY_SIGNUP: ""
      REDIS_URL: "${module.redis.url}"
      SITE_URL: "https://${local.dns_name}"
      SMTP_FROM_ADDRESS: ""
      SMTP_FROM_NAME: "Infisical"
      SMTP_HOST: ""
      SMTP_PASSWORD: ""
      SMTP_PORT: "587"
      SMTP_SECURE: ""
      SMTP_USERNAME: ""
  EOF
}
@@ -1,20 +1,26 @@
 locals {
+  authentik_url = "http://authentik.${var.domain}-auth.svc"
+  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
   common-labels = {
     "vynil.solidite.fr/owner-name" = var.instance
     "vynil.solidite.fr/owner-namespace" = var.namespace
     "vynil.solidite.fr/owner-category" = var.category
     "vynil.solidite.fr/owner-component" = var.component
     "app.kubernetes.io/managed-by" = "vynil"
+    "app.kubernetes.io/name" = var.component
     "app.kubernetes.io/instance" = var.instance
   }
 }
 
-data "kustomization_overlay" "data" {
-  common_labels = local.common-labels
-  namespace = var.namespace
-  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("v1_Service_prometheus",file))<1]
+data "kubernetes_secret_v1" "authentik" {
+  metadata {
+    name = "authentik"
+    namespace = "${var.domain}-auth"
+  }
 }
 
-data "kustomization_overlay" "data_no_ns" {
-  resources = [for file in fileset(path.module, "*.yaml"): file if length(regexall("v1_Service_prometheus",file))>0]
+data "kustomization_overlay" "data" {
+  namespace = var.namespace
+  common_labels = local.common-labels
+  resources = []
 }
apps/infisical/deploy.tf (new file, 40 lines)
@@ -0,0 +1,40 @@
resource "kubectl_manifest" "deploy" {
  yaml_body = <<-EOF
    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: "${var.component}-${var.instance}"
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.common-labels)}
      annotations:
        configmap.reloader.stakater.com/reload: "${kubectl_manifest.config.name}"
        secret.reloader.stakater.com/reload: "${kubectl_manifest.secret.name}"
    spec:
      replicas: ${var.replicas}
      selector:
        matchLabels: ${jsonencode(local.common-labels)}
      template:
        metadata:
          labels: ${jsonencode(local.common-labels)}
        spec:
          containers:
          - name: infisical-backend
            image: "${var.images.infiscal.registry}/${var.images.infiscal.repository}:${var.images.infiscal.tag}"
            imagePullPolicy: "${var.images.infiscal.pullPolicy}"
            readinessProbe:
              httpGet:
                path: /api/status
                port: 8080
              initialDelaySeconds: 10
              periodSeconds: 10
            ports:
            - containerPort: 8080
              name: http
              protocol: TCP
            envFrom:
            - configMapRef:
                name: "${kubectl_manifest.config.name}"
            - secretRef:
                name: "${kubectl_manifest.secret.name}"
  EOF
}
@@ -1,70 +1,66 @@
 ---
 apiVersion: vinyl.solidite.fr/v1beta1
 kind: Component
-category: monitor
+category: apps
 metadata:
-  name: grafana
+  name: infisical
 description: null
 options:
-  domain:
-    default: your-company
+  language:
+    default: fr_FR
     examples:
-    - your-company
+    - fr_FR
     type: string
+  sub_domain:
+    default: to-be-set
+    examples:
+    - to-be-set
+    type: string
+  replicas:
+    default: 1
+    examples:
+    - 1
+    type: integer
   images:
     default:
-      busybox:
-        registry: docker.io
-        repository: library/busybox
-        tag: 1.31.1
-      grafana:
+      infisical:
         pullPolicy: IfNotPresent
         registry: docker.io
-        repository: grafana/grafana
-        tag: 10.2.3
-      sidecar:
+        repository: infisical/infisical
+        tag: v1.0.0
+      redis:
         pullPolicy: IfNotPresent
         registry: quay.io
-        repository: kiwigrid/k8s-sidecar
-        tag: 1.25.2
+        repository: opstree/redis
+        tag: v7.0.12
+      redis_exporter:
+        pullPolicy: IfNotPresent
+        registry: quay.io
+        repository: opstree/redis-exporter
+        tag: v1.44.0
     examples:
-    - busybox:
-        registry: docker.io
-        repository: library/busybox
-        tag: 1.31.1
-      grafana:
+    - infisical:
         pullPolicy: IfNotPresent
         registry: docker.io
-        repository: grafana/grafana
-        tag: 10.2.3
-      sidecar:
+        repository: infisical/infisical
+        tag: v1.0.0
+      redis:
         pullPolicy: IfNotPresent
         registry: quay.io
-        repository: kiwigrid/k8s-sidecar
-        tag: 1.25.2
+        repository: opstree/redis
+        tag: v7.0.12
+      redis_exporter:
+        pullPolicy: IfNotPresent
+        registry: quay.io
+        repository: opstree/redis-exporter
+        tag: v1.44.0
     properties:
-      busybox:
-        default:
-          registry: docker.io
-          repository: library/busybox
-          tag: 1.31.1
-        properties:
-          registry:
-            default: docker.io
-            type: string
-          repository:
-            default: library/busybox
-            type: string
-          tag:
-            default: 1.31.1
-            type: string
-        type: object
-      grafana:
+      infisical:
         default:
           pullPolicy: IfNotPresent
           registry: docker.io
-          repository: grafana/grafana
-          tag: 10.2.3
+          repository: infisical/infisical
+          tag: v1.0.0
         properties:
           pullPolicy:
             default: IfNotPresent
@@ -77,18 +73,18 @@ options:
             default: docker.io
             type: string
           repository:
-            default: grafana/grafana
+            default: infisical/infisical
             type: string
           tag:
-            default: 10.2.3
+            default: v1.0.0
             type: string
         type: object
-      sidecar:
+      redis:
         default:
           pullPolicy: IfNotPresent
           registry: quay.io
-          repository: kiwigrid/k8s-sidecar
-          tag: 1.25.2
+          repository: opstree/redis
+          tag: v7.0.12
         properties:
           pullPolicy:
             default: IfNotPresent
@@ -101,92 +97,88 @@ options:
             default: quay.io
             type: string
           repository:
-            default: kiwigrid/k8s-sidecar
+            default: opstree/redis
             type: string
           tag:
-            default: 1.25.2
+            default: v7.0.12
+            type: string
+        type: object
+      redis_exporter:
+        default:
+          pullPolicy: IfNotPresent
+          registry: quay.io
+          repository: opstree/redis-exporter
+          tag: v1.44.0
+        properties:
+          pullPolicy:
+            default: IfNotPresent
+            enum:
+            - Always
+            - Never
+            - IfNotPresent
+            type: string
+          registry:
+            default: quay.io
+            type: string
+          repository:
+            default: opstree/redis-exporter
+            type: string
+          tag:
+            default: v1.44.0
             type: string
         type: object
     type: object
-  ingress_class:
-    default: traefik
+  sso_vynil:
+    default: true
     examples:
-    - traefik
+    - true
+    type: boolean
+  timezone:
+    default: Europe/Paris
+    examples:
+    - Europe/Paris
     type: string
   domain_name:
-    default: your_company.com
+    default: your-company.com
     examples:
-    - your_company.com
+    - your-company.com
+    type: string
+  domain:
+    default: your-company
+    examples:
+    - your-company
     type: string
   issuer:
     default: letsencrypt-prod
     examples:
     - letsencrypt-prod
     type: string
-  admin_name:
-    default: grafana_admin
+  ingress_class:
+    default: traefik
     examples:
-    - grafana_admin
-    type: string
-  sub_domain:
-    default: grafana
-    examples:
-    - grafana
+    - traefik
     type: string
   app_group:
-    default: monitor
+    default: apps
     examples:
-    - monitor
+    - apps
     type: string
-  storage:
-    default:
-      volume:
-        accessMode: ReadWriteOnce
-        class: ''
-        size: 10Gi
-        type: Filesystem
-    description: Configure this app storage
-    examples:
-    - volume:
-        accessMode: ReadWriteOnce
-        class: ''
-        size: 10Gi
-        type: Filesystem
-    properties:
-      volume:
-        default:
-          accessMode: ReadWriteOnce
-          class: ''
-          size: 10Gi
-          type: Filesystem
-        properties:
-          accessMode:
-            default: ReadWriteOnce
-            enum:
-            - ReadWriteOnce
-            - ReadOnlyMany
-            - ReadWriteMany
-            type: string
-          class:
-            default: ''
-            type: string
-          size:
-            default: 10Gi
-            type: string
-          type:
-            default: Filesystem
-            enum:
-            - Filesystem
-            - Block
-            type: string
-        type: object
-    type: object
-dependencies: []
+dependencies:
+- dist: null
+  category: core
+  component: secret-generator
+- dist: null
+  category: dbo
+  component: mongo
+- dist: null
+  category: dbo
+  component: redis
 providers:
   kubernetes: true
   authentik: true
   kubectl: true
   postgresql: null
+  mysql: null
   restapi: true
   http: true
   gitea: null
@@ -1,17 +1,22 @@
 locals {
   dns_name = "${var.sub_domain}.${var.domain_name}"
   dns_names = [local.dns_name]
-  app_name = var.component == var.instance ? var.instance : format("%s-%s", var.component, var.instance)
-  icon = "favicon.ico"
+  icon = "icon.svg"
   request_headers = {
     "Content-Type" = "application/json"
     Authorization = "Bearer ${data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]}"
   }
-  service = {
-    "name" = "alertmanager-operated"
-    "port" = {
-      "number" = 9093
-    }
-  }
-}
+}
+
+module "service" {
+  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//service"
+  component = var.component
+  instance = var.instance
+  namespace = var.namespace
+  labels = local.common-labels
+  targets = ["http"]
+  providers = {
+    kubectl = kubectl
+  }
+}
@@ -25,7 +30,7 @@ module "ingress" {
   labels = local.common-labels
   dns_names = local.dns_names
   middlewares = [module.forward.middleware]
-  services = [local.service]
+  services = [module.service.default_definition]
   providers = {
     kubectl = kubectl
   }
@@ -63,7 +68,7 @@ module "forward" {
   ingress_class = var.ingress_class
   labels = local.common-labels
   dns_names = local.dns_names
-  service = local.service
+  service = module.service.default_definition
   icon = local.icon
   request_headers = local.request_headers
   providers = {
apps/infisical/secret.tf (new file, 31 lines)
@@ -0,0 +1,31 @@
resource "kubectl_manifest" "secret" {
  ignore_fields = ["metadata.annotations"]
  yaml_body = <<-EOF
    apiVersion: "secretgenerator.mittwald.de/v1alpha1"
    kind: "StringSecret"
    metadata:
      name: "${var.component}-${var.instance}"
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      forceRegenerate: false
      data:
        MONGO_URL: "${module.mongo.url}"
      fields:
      - fieldName: "JWT_SIGNUP_SECRET"
        length: "32"
      - fieldName: "JWT_SERVICE_SECRET"
        length: "32"
      - fieldName: "JWT_REFRESH_SECRET"
        length: "32"
      - fieldName: "JWT_PROVIDER_AUTH_SECRET"
        length: "32"
      - fieldName: "JWT_MFA_SECRET"
        length: "32"
      - fieldName: "JWT_AUTH_SECRET"
        length: "32"
      - fieldName: "ENCRYPTION_KEY"
        length: "32"
  EOF
}
apps/infisical/storage.tf (new file, 21 lines)
@@ -0,0 +1,21 @@
module "redis" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//redis"
  component = var.component
  instance = var.instance
  namespace = var.namespace
  labels = local.common-labels
  images = var.images
  providers = {
    kubectl = kubectl
  }
}
module "mongo" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//mongo"
  component = var.component
  instance = var.instance
  namespace = var.namespace
  labels = local.common-labels
  providers = {
    kubectl = kubectl
  }
}
@@ -6,21 +6,175 @@ metadata:
|
|||||||
name: nextcloud
|
name: nextcloud
|
||||||
description: null
|
description: null
|
||||||
options:
|
options:
|
||||||
ingress_class:
|
issuer:
|
||||||
default: traefik
|
default: letsencrypt-prod
|
||||||
examples:
|
examples:
|
||||||
- traefik
|
- letsencrypt-prod
|
||||||
type: string
|
type: string
|
||||||
app_group:
|
postgres:
|
||||||
|
default:
|
||||||
|
replicas: 1
|
||||||
|
examples:
|
||||||
|
- replicas: 1
|
||||||
|
properties:
|
||||||
|
replicas:
|
||||||
|
default: 1
|
||||||
|
type: integer
|
||||||
|
type: object
|
||||||
|
apps:
|
||||||
|
default:
|
||||||
|
audioplayer: false
|
||||||
|
bookmarks: false
|
||||||
|
bpm: false
|
||||||
|
calendar: false
|
||||||
|
collabora: false
|
||||||
|
contacts: false
|
||||||
|
deck: false
|
||||||
|
groupfolders: true
|
||||||
|
mindmap: false
|
||||||
|
music: false
|
||||||
|
notes: false
|
||||||
|
onlyoffice: false
|
||||||
|
passman: false
|
||||||
|
spreed: false
|
||||||
|
tables: false
|
||||||
|
tasks: false
|
||||||
|
texteditor: true
|
||||||
|
examples:
|
||||||
|
- audioplayer: false
|
||||||
|
bookmarks: false
|
||||||
|
bpm: false
|
||||||
|
calendar: false
|
||||||
|
collabora: false
|
||||||
|
contacts: false
|
||||||
|
deck: false
|
||||||
|
groupfolders: true
|
||||||
|
mindmap: false
|
||||||
|
music: false
|
||||||
|
notes: false
|
||||||
|
onlyoffice: false
|
||||||
|
passman: false
|
||||||
|
spreed: false
|
||||||
|
tables: false
|
||||||
|
tasks: false
|
||||||
|
texteditor: true
|
||||||
|
properties:
|
||||||
|
audioplayer:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
bookmarks:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
bpm:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
calendar:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
collabora:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
contacts:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
deck:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
groupfolders:
|
||||||
|
default: true
|
||||||
|
type: boolean
|
||||||
|
mindmap:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
music:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
notes:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
onlyoffice:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
passman:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
spreed:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
tables:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
tasks:
|
||||||
|
default: false
|
||||||
|
type: boolean
|
||||||
|
texteditor:
|
||||||
|
default: true
|
||||||
|
type: boolean
|
||||||
|
type: object
|
||||||
|
storage:
|
||||||
|
default:
|
||||||
|
postgres:
|
||||||
|
size: 5Gi
|
||||||
|
redis:
|
||||||
|
size: 2Gi
|
||||||
|
volume:
|
||||||
|
accessMode: ReadWriteOnce
|
||||||
|
class: ''
|
||||||
|
size: 10Gi
|
||||||
|
type: Filesystem
|
||||||
|
description: Configure this app storage
|
||||||
|
examples:
|
||||||
|
- postgres:
|
||||||
|
size: 5Gi
|
||||||
|
redis:
|
||||||
|
size: 2Gi
|
||||||
|
volume:
|
||||||
|
accessMode: ReadWriteOnce
|
||||||
|
class: ''
|
||||||
|
size: 10Gi
|
||||||
|
type: Filesystem
|
||||||
|
properties:
|
||||||
|
postgres:
|
||||||
|
default:
|
||||||
|
size: 5Gi
|
||||||
|
properties:
|
||||||
|
size:
|
||||||
|
default: 5Gi
|
||||||
|
type: string
|
||||||
|
type: object
|
||||||
|
redis:
|
||||||
|
default:
|
||||||
|
size: 2Gi
|
||||||
|
properties:
|
||||||
|
size:
|
||||||
|
default: 2Gi
|
||||||
|
type: string
|
||||||
|
type: object
|
||||||
|
volume:
|
||||||
|
default:
|
||||||
|
accessMode: ReadWriteOnce
|
||||||
|
class: ''
|
||||||
|
size: 10Gi
|
||||||
|
type: Filesystem
|
||||||
|
properties:
|
||||||
|
accessMode:
|
||||||
|
default: ReadWriteOnce
|
||||||
|
enum:
|
||||||
|
- ReadWriteOnce
|
||||||
|
- ReadOnlyMany
|
||||||
|
- ReadWriteMany
|
||||||
|
type: string
|
||||||
|
class:
|
||||||
default: ''
|
default: ''
|
||||||
examples:
|
|
||||||
- ''
|
|
||||||
type: string
|
type: string
|
||||||
openid-name:
|
size:
|
||||||
default: vynil
|
default: 10Gi
|
||||||
examples:
|
|
||||||
- vynil
|
|
||||||
type: string
|
type: string
|
||||||
|
type:
|
||||||
|
default: Filesystem
|
||||||
|
type: string
|
||||||
|
type: object
|
||||||
|
type: object
|
||||||
backups:
|
backups:
|
||||||
default:
|
default:
|
||||||
enable: false
|
enable: false
|
||||||
@@ -127,182 +281,6 @@ options:
|
|||||||
default: false
|
default: false
|
||||||
type: boolean
|
type: boolean
|
||||||
type: object
|
type: object
|
||||||
issuer:
|
|
||||||
default: letsencrypt-prod
|
|
||||||
examples:
|
|
||||||
- letsencrypt-prod
|
|
||||||
type: string
|
|
||||||
storage:
|
|
||||||
default:
|
|
||||||
postgres:
|
|
||||||
size: 5Gi
|
|
||||||
redis:
|
|
||||||
size: 2Gi
|
|
||||||
volume:
|
|
||||||
accessMode: ReadWriteOnce
|
|
||||||
class: ''
|
|
||||||
size: 10Gi
|
|
||||||
type: Filesystem
|
|
||||||
description: Configure this app storage
|
|
||||||
examples:
|
|
||||||
- postgres:
|
|
||||||
size: 5Gi
|
|
||||||
redis:
|
|
||||||
size: 2Gi
|
|
||||||
volume:
|
|
||||||
accessMode: ReadWriteOnce
|
|
||||||
class: ''
|
|
||||||
size: 10Gi
|
|
||||||
type: Filesystem
|
|
||||||
properties:
|
|
||||||
postgres:
|
|
||||||
default:
|
|
||||||
size: 5Gi
|
|
||||||
properties:
|
|
||||||
size:
|
|
||||||
default: 5Gi
|
|
||||||
type: string
|
|
||||||
type: object
|
|
||||||
redis:
|
|
||||||
default:
|
|
||||||
size: 2Gi
|
|
||||||
properties:
|
|
||||||
size:
|
|
||||||
default: 2Gi
|
|
||||||
type: string
|
|
||||||
type: object
|
|
||||||
volume:
|
|
||||||
default:
|
|
||||||
accessMode: ReadWriteOnce
|
|
||||||
class: ''
|
|
||||||
size: 10Gi
|
|
||||||
type: Filesystem
|
|
||||||
properties:
|
|
||||||
accessMode:
|
|
||||||
default: ReadWriteOnce
|
|
||||||
enum:
|
|
||||||
- ReadWriteOnce
|
|
||||||
- ReadOnlyMany
|
|
||||||
- ReadWriteMany
|
|
||||||
type: string
|
|
||||||
class:
|
|
||||||
default: ''
|
|
||||||
type: string
|
|
||||||
size:
|
|
||||||
default: 10Gi
|
|
||||||
type: string
|
|
||||||
type:
|
|
||||||
default: Filesystem
|
|
||||||
type: string
|
|
||||||
type: object
|
|
||||||
type: object
|
|
||||||
apps:
|
|
||||||
default:
|
|
||||||
audioplayer: false
|
|
||||||
bookmarks: false
|
|
||||||
bpm: false
|
|
||||||
calendar: false
|
|
||||||
collabora: false
|
|
||||||
contacts: false
|
|
||||||
deck: false
|
|
||||||
groupfolders: true
|
|
||||||
mindmap: false
|
|
||||||
music: false
|
|
||||||
notes: false
|
|
||||||
onlyoffice: false
|
|
||||||
passman: false
|
|
||||||
spreed: false
|
|
||||||
tables: false
|
|
||||||
tasks: false
|
|
||||||
texteditor: true
|
|
||||||
examples:
|
|
||||||
- audioplayer: false
|
|
||||||
bookmarks: false
|
|
||||||
bpm: false
|
|
||||||
calendar: false
|
|
||||||
collabora: false
|
|
||||||
contacts: false
|
|
||||||
deck: false
|
|
||||||
groupfolders: true
|
|
||||||
mindmap: false
|
|
||||||
music: false
|
|
||||||
notes: false
|
|
||||||
onlyoffice: false
|
|
||||||
passman: false
|
|
||||||
spreed: false
|
|
||||||
tables: false
|
|
||||||
tasks: false
|
|
||||||
texteditor: true
|
|
||||||
properties:
|
|
||||||
audioplayer:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
bookmarks:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
bpm:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
calendar:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
collabora:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
contacts:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
deck:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
groupfolders:
|
|
||||||
default: true
|
|
||||||
type: boolean
|
|
||||||
mindmap:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
music:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
notes:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
onlyoffice:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
passman:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
spreed:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
tables:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
tasks:
|
|
||||||
default: false
|
|
||||||
type: boolean
|
|
||||||
texteditor:
|
|
||||||
default: true
|
|
||||||
type: boolean
|
|
||||||
type: object
|
|
||||||
redis:
|
|
||||||
default:
|
|
||||||
exporter:
|
|
||||||
enabled: true
|
|
||||||
examples:
|
|
||||||
- exporter:
|
|
||||||
enabled: true
|
|
||||||
properties:
|
|
||||||
exporter:
|
|
||||||
default:
|
|
||||||
enabled: true
|
|
||||||
properties:
|
|
||||||
enabled:
|
|
||||||
default: true
|
|
||||||
type: boolean
|
|
||||||
type: object
|
|
||||||
type: object
|
|
||||||
images:
|
images:
|
||||||
default:
|
default:
|
||||||
collabora:
|
collabora:
|
||||||
@@ -558,6 +536,38 @@ options:
             type: string
         type: object
     type: object
+  redis:
+    default:
+      exporter:
+        enabled: true
+    examples:
+    - exporter:
+        enabled: true
+    properties:
+      exporter:
+        default:
+          enabled: true
+        properties:
+          enabled:
+            default: true
+            type: boolean
+        type: object
+    type: object
+  admin:
+    default:
+      name: nextcloud_admin
+    examples:
+    - name: nextcloud_admin
+    properties:
+      name:
+        default: nextcloud_admin
+        type: string
+    type: object
+  app_group:
+    default: ''
+    examples:
+    - ''
+    type: string
   domain_name:
     default: your_company.com
     examples:
@@ -583,31 +593,21 @@ options:
         default: 1
         type: integer
     type: object
-  admin:
-    default:
-      name: nextcloud_admin
+  ingress_class:
+    default: traefik
     examples:
-    - name: nextcloud_admin
-    properties:
-      name:
-        default: nextcloud_admin
+    - traefik
     type: string
-    type: object
-  postgres:
-    default:
-      replicas: 1
-    examples:
-    - replicas: 1
-    properties:
-      replicas:
-        default: 1
-        type: integer
-    type: object
   domain:
     default: your-company
     examples:
     - your-company
     type: string
+  openid-name:
+    default: vynil
+    examples:
+    - vynil
+    type: string
   sub_domain:
     default: files
     examples:
@@ -628,7 +628,8 @@ providers:
   authentik: true
   kubectl: true
   postgresql: null
+  mysql: null
   restapi: null
   http: null
   gitea: null
-  tfaddtype: null
+  tfaddtype: true
|
|||||||
@@ -1,167 +0,0 @@
|
|||||||
locals {
|
|
||||||
authentik_url = "http://authentik.${var.domain}-auth.svc"
|
|
||||||
authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
|
|
||||||
common-labels = {
|
|
||||||
"vynil.solidite.fr/owner-name" = var.instance
|
|
||||||
"vynil.solidite.fr/owner-namespace" = var.namespace
|
|
||||||
"vynil.solidite.fr/owner-category" = var.category
|
|
||||||
"vynil.solidite.fr/owner-component" = var.component
|
|
||||||
"app.kubernetes.io/managed-by" = "vynil"
|
|
||||||
"app.kubernetes.io/instance" = var.instance
|
|
||||||
}
|
|
||||||
rb-patch = <<-EOF
|
|
||||||
- op: replace
|
|
||||||
path: /subjects/0/namespace
|
|
||||||
value: "${var.namespace}"
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
|
|
||||||
data "kubernetes_secret_v1" "authentik" {
|
|
||||||
metadata {
|
|
||||||
name = "authentik"
|
|
||||||
namespace = "${var.domain}-auth"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data "kustomization_overlay" "data" {
|
|
||||||
common_labels = local.common-labels
|
|
||||||
namespace = var.namespace
|
|
||||||
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
|
|
||||||
patches {
|
|
||||||
target {
|
|
||||||
kind = "Alertmanager"
|
|
||||||
name = "alertmanager-kube-promethe-alertmanager"
|
|
||||||
}
|
|
||||||
patch = <<-EOF
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: Alertmanager
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager
|
|
||||||
spec:
|
|
||||||
image: "${var.images.alertmanager.registry}/${var.images.alertmanager.repository}:${var.images.alertmanager.tag}"
|
|
||||||
version: ${var.images.alertmanager.tag}
|
|
||||||
externalUrl: http://${var.component}-${var.instance}.${var.namespace}:9093
|
|
||||||
replicas: ${var.replicas}
|
|
||||||
listenLocal: ${var.listenLocal}
|
|
||||||
logLevel: "${var.logLevel}"
|
|
||||||
retention: "${var.retention}"
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
patches {
|
|
||||||
target {
|
|
||||||
kind = "ConfigMap"
|
|
||||||
name = "alertmanager-kube-grafana-datasource"
|
|
||||||
}
|
|
||||||
patch = <<-EOF
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-operated
|
|
||||||
data:
|
|
||||||
datasource.yaml: |-
|
|
||||||
apiVersion: 1
|
|
||||||
datasources:
|
|
||||||
- name: Alertmanager
|
|
||||||
type: alertmanager
|
|
||||||
uid: alertmanager
|
|
||||||
url: http://${var.component}-${var.instance}.${var.namespace}:9093/
|
|
||||||
access: proxy
|
|
||||||
jsonData:
|
|
||||||
handleGrafanaManagedAlerts: false
|
|
||||||
implementation: prometheus
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
patches {
|
|
||||||
target {
|
|
||||||
kind = "ServiceMonitor"
|
|
||||||
name = "alertmanager-kube-promethe-alertmanager"
|
|
||||||
}
|
|
||||||
patch = <<-EOF
|
|
||||||
- op: replace
|
|
||||||
path: /spec/namespaceSelector/matchNames/0
|
|
||||||
value: "${var.namespace}"
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
patches {
|
|
||||||
target {
|
|
||||||
kind = "PrometheusRule"
|
|
||||||
name = "alertmanager-kube-promethe-alertmanager.rules"
|
|
||||||
}
|
|
||||||
patch = <<-EOF
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager.rules
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: alertmanager.rules
|
|
||||||
rules:
|
|
||||||
- alert: AlertmanagerFailedReload
|
|
||||||
expr: |-
|
|
||||||
# Without max_over_time, failed scrapes could create false negatives, see
|
|
||||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
||||||
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) == 0
|
|
||||||
- alert: AlertmanagerMembersInconsistent
|
|
||||||
expr: |-
|
|
||||||
# Without max_over_time, failed scrapes could create false negatives, see
|
|
||||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
||||||
max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
|
||||||
< on (namespace,service,cluster) group_left
|
|
||||||
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]))
|
|
||||||
- alert: AlertmanagerFailedToSendAlerts
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
|
||||||
expr: |-
|
|
||||||
min by (namespace,service, integration) (
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration=~`.*`}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
|
||||||
expr: |-
|
|
||||||
min by (namespace,service, integration) (
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}", integration!~`.*`}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
- alert: AlertmanagerConfigInconsistent
|
|
||||||
expr: |-
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"})
|
|
||||||
)
|
|
||||||
!= 1
|
|
||||||
- alert: AlertmanagerClusterDown
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[5m]) < 0.5
|
|
||||||
)
|
|
||||||
/
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
>= 0.5
|
|
||||||
- alert: AlertmanagerClusterCrashlooping
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}[10m]) > 4
|
|
||||||
)
|
|
||||||
/
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="${var.namespace}"}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
>= 0.5
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,110 +0,0 @@
|
|||||||
---
|
|
||||||
apiVersion: vinyl.solidite.fr/v1beta1
|
|
||||||
kind: Component
|
|
||||||
category: monitor
|
|
||||||
metadata:
|
|
||||||
name: alertmanager
|
|
||||||
description: null
|
|
||||||
options:
|
|
||||||
replicas:
|
|
||||||
default: 1
|
|
||||||
examples:
|
|
||||||
- 1
|
|
||||||
type: integer
|
|
||||||
app_group:
|
|
||||||
default: monitor
|
|
||||||
examples:
|
|
||||||
- monitor
|
|
||||||
type: string
|
|
||||||
listenLocal:
|
|
||||||
default: false
|
|
||||||
examples:
|
|
||||||
- false
|
|
||||||
type: boolean
|
|
||||||
domain_name:
|
|
||||||
default: your_company.com
|
|
||||||
examples:
|
|
||||||
- your_company.com
|
|
||||||
type: string
|
|
||||||
domain:
|
|
||||||
default: your-company
|
|
||||||
examples:
|
|
||||||
- your-company
|
|
||||||
type: string
|
|
||||||
images:
|
|
||||||
default:
|
|
||||||
alertmanager:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: quay.io
|
|
||||||
repository: prometheus/alertmanager
|
|
||||||
tag: v0.26.0
|
|
||||||
examples:
|
|
||||||
- alertmanager:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: quay.io
|
|
||||||
repository: prometheus/alertmanager
|
|
||||||
tag: v0.26.0
|
|
||||||
properties:
|
|
||||||
alertmanager:
|
|
||||||
default:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: quay.io
|
|
||||||
repository: prometheus/alertmanager
|
|
||||||
tag: v0.26.0
|
|
||||||
properties:
|
|
||||||
pullPolicy:
|
|
||||||
default: IfNotPresent
|
|
||||||
enum:
|
|
||||||
- Always
|
|
||||||
- Never
|
|
||||||
- IfNotPresent
|
|
||||||
type: string
|
|
||||||
registry:
|
|
||||||
default: quay.io
|
|
||||||
type: string
|
|
||||||
repository:
|
|
||||||
default: prometheus/alertmanager
|
|
||||||
type: string
|
|
||||||
tag:
|
|
||||||
default: v0.26.0
|
|
||||||
type: string
|
|
||||||
type: object
|
|
||||||
type: object
|
|
||||||
logLevel:
|
|
||||||
default: info
|
|
||||||
examples:
|
|
||||||
- info
|
|
||||||
type: string
|
|
||||||
ingress_class:
|
|
||||||
default: traefik
|
|
||||||
examples:
|
|
||||||
- traefik
|
|
||||||
type: string
|
|
||||||
sub_domain:
|
|
||||||
default: alertmanager
|
|
||||||
examples:
|
|
||||||
- alertmanager
|
|
||||||
type: string
|
|
||||||
retention:
|
|
||||||
default: 120h
|
|
||||||
examples:
|
|
||||||
- 120h
|
|
||||||
type: string
|
|
||||||
issuer:
|
|
||||||
default: letsencrypt-prod
|
|
||||||
examples:
|
|
||||||
- letsencrypt-prod
|
|
||||||
type: string
|
|
||||||
dependencies:
|
|
||||||
- dist: null
|
|
||||||
category: share
|
|
||||||
component: authentik-forward
|
|
||||||
providers:
|
|
||||||
kubernetes: true
|
|
||||||
authentik: true
|
|
||||||
kubectl: true
|
|
||||||
postgresql: null
|
|
||||||
restapi: true
|
|
||||||
http: true
|
|
||||||
gitea: null
|
|
||||||
tfaddtype: null
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/alertmanager/alertmanager.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: Alertmanager
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack-alertmanager
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: alertmanager
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "alertmanager"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
image: "quay.io/prometheus/alertmanager:v0.26.0"
|
|
||||||
version: v0.26.0
|
|
||||||
replicas: 1
|
|
||||||
listenLocal: false
|
|
||||||
serviceAccountName: alertmanager-kube-promethe-alertmanager
|
|
||||||
externalUrl: http://alertmanager-kube-promethe-alertmanager.vynil-monitor:9093
|
|
||||||
paused: false
|
|
||||||
logFormat: "logfmt"
|
|
||||||
logLevel: "info"
|
|
||||||
retention: "120h"
|
|
||||||
alertmanagerConfigSelector: {}
|
|
||||||
alertmanagerConfigNamespaceSelector: {}
|
|
||||||
routePrefix: "/"
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 2000
|
|
||||||
runAsGroup: 2000
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 1000
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
portName: http-web
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/alertmanager.rules.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager.rules
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: alertmanager
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "alertmanager"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: alertmanager.rules
|
|
||||||
rules:
|
|
||||||
- alert: AlertmanagerFailedReload
|
|
||||||
annotations:
|
|
||||||
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
|
|
||||||
summary: Reloading an Alertmanager configuration has failed.
|
|
||||||
expr: |-
|
|
||||||
# Without max_over_time, failed scrapes could create false negatives, see
|
|
||||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
||||||
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) == 0
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: AlertmanagerMembersInconsistent
|
|
||||||
annotations:
|
|
||||||
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
|
|
||||||
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
|
||||||
expr: |-
|
|
||||||
# Without max_over_time, failed scrapes could create false negatives, see
|
|
||||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
|
||||||
max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
|
|
||||||
< on (namespace,service,cluster) group_left
|
|
||||||
count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]))
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: AlertmanagerFailedToSendAlerts
|
|
||||||
annotations:
|
|
||||||
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
|
|
||||||
summary: An Alertmanager instance failed to send notifications.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
|
||||||
annotations:
|
|
||||||
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
|
||||||
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
|
||||||
expr: |-
|
|
||||||
min by (namespace,service, integration) (
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration=~`.*`}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
|
||||||
annotations:
|
|
||||||
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
|
||||||
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
|
||||||
expr: |-
|
|
||||||
min by (namespace,service, integration) (
|
|
||||||
rate(alertmanager_notifications_failed_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
|
|
||||||
/
|
|
||||||
ignoring (reason) group_left rate(alertmanager_notifications_total{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor", integration!~`.*`}[5m])
|
|
||||||
)
|
|
||||||
> 0.01
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: AlertmanagerConfigInconsistent
|
|
||||||
annotations:
|
|
||||||
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
|
|
||||||
summary: Alertmanager instances within the same cluster have different configurations.
|
|
||||||
expr: |-
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"})
|
|
||||||
)
|
|
||||||
!= 1
|
|
||||||
for: 20m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: AlertmanagerClusterDown
|
|
||||||
annotations:
|
|
||||||
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
|
|
||||||
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
avg_over_time(up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[5m]) < 0.5
|
|
||||||
)
|
|
||||||
/
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
>= 0.5
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: AlertmanagerClusterCrashlooping
|
|
||||||
annotations:
|
|
||||||
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
|
|
||||||
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
changes(process_start_time_seconds{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}[10m]) > 4
|
|
||||||
)
|
|
||||||
/
|
|
||||||
count by (namespace,service,cluster) (
|
|
||||||
up{job="alertmanager-kube-promethe-alertmanager",namespace="vynil-monitor"}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
>= 0.5
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/alertmanager/servicemonitor.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: ServiceMonitor
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack-alertmanager
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: alertmanager
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "alertmanager"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app: kube-prometheus-stack-alertmanager
|
|
||||||
release: "alertmanager"
|
|
||||||
self-monitor: "true"
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- "vynil-monitor"
|
|
||||||
endpoints:
|
|
||||||
- port: http-web
|
|
||||||
enableHttp2: true
|
|
||||||
path: "/metrics"
|
|
||||||
- port: reloader-web
|
|
||||||
scheme: http
|
|
||||||
path: "/metrics"
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
locals {
|
|
||||||
svc-label = merge(local.common-labels, {
|
|
||||||
"app" = "kube-prometheus-stack-prometheus"
|
|
||||||
"release" = "prometheus"
|
|
||||||
"self-monitor" = "true"
|
|
||||||
|
|
||||||
})
|
|
||||||
}
|
|
||||||
resource "kubectl_manifest" "svc" {
|
|
||||||
yaml_body = <<-EOF
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: "${var.component}-${var.instance}"
|
|
||||||
namespace: "${var.namespace}"
|
|
||||||
labels: ${jsonencode(local.svc-label)}
|
|
||||||
spec:
|
|
||||||
ports:
|
|
||||||
- name: http-web
|
|
||||||
port: 9093
|
|
||||||
targetPort: 9093
|
|
||||||
protocol: TCP
|
|
||||||
- name: reloader-web
|
|
||||||
appProtocol: http
|
|
||||||
port: 8080
|
|
||||||
targetPort: reloader-web
|
|
||||||
selector:
|
|
||||||
app.kubernetes.io/name: alertmanager
|
|
||||||
alertmanager: alertmanager-kube-promethe-alertmanager
|
|
||||||
sessionAffinity: None
|
|
||||||
type: "ClusterIP"
|
|
||||||
EOF
|
|
||||||
}
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-grafana-datasource
|
|
||||||
labels:
|
|
||||||
grafana_datasource: "1"
|
|
||||||
app: alertmanager
|
|
||||||
data:
|
|
||||||
datasource.yaml: |-
|
|
||||||
apiVersion: 1
|
|
||||||
datasources:
|
|
||||||
- name: Alertmanager
|
|
||||||
type: alertmanager
|
|
||||||
uid: alertmanager
|
|
||||||
url: http://alertmanager-operated.vynil-monitor:9093/
|
|
||||||
access: proxy
|
|
||||||
jsonData:
|
|
||||||
handleGrafanaManagedAlerts: false
|
|
||||||
implementation: prometheus
|
|
||||||
File diff suppressed because one or more lines are too long
@@ -1,18 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/alertmanager/secret.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Secret
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-alertmanager-kube-promethe-alertmanager
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack-alertmanager
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: alertmanager
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "alertmanager"
|
|
||||||
heritage: "Helm"
|
|
||||||
data:
|
|
||||||
alertmanager.yaml: "Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0KaW5oaWJpdF9ydWxlczoKLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIC0gYWxlcnRuYW1lCiAgc291cmNlX21hdGNoZXJzOgogIC0gc2V2ZXJpdHkgPSBjcml0aWNhbAogIHRhcmdldF9tYXRjaGVyczoKICAtIHNldmVyaXR5ID1+IHdhcm5pbmd8aW5mbwotIGVxdWFsOgogIC0gbmFtZXNwYWNlCiAgLSBhbGVydG5hbWUKICBzb3VyY2VfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IHdhcm5pbmcKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSBlcXVhbDoKICAtIG5hbWVzcGFjZQogIHNvdXJjZV9tYXRjaGVyczoKICAtIGFsZXJ0bmFtZSA9IEluZm9JbmhpYml0b3IKICB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBzZXZlcml0eSA9IGluZm8KLSB0YXJnZXRfbWF0Y2hlcnM6CiAgLSBhbGVydG5hbWUgPSBJbmZvSW5oaWJpdG9yCnJlY2VpdmVyczoKLSBuYW1lOiAibnVsbCIKcm91dGU6CiAgZ3JvdXBfYnk6CiAgLSBuYW1lc3BhY2UKICBncm91cF9pbnRlcnZhbDogNW0KICBncm91cF93YWl0OiAzMHMKICByZWNlaXZlcjogIm51bGwiCiAgcmVwZWF0X2ludGVydmFsOiAxMmgKICByb3V0ZXM6CiAgLSBtYXRjaGVyczoKICAgIC0gYWxlcnRuYW1lID0gIldhdGNoZG9nIgogICAgcmVjZWl2ZXI6ICJudWxsIgp0ZW1wbGF0ZXM6Ci0gL2V0Yy9hbGVydG1hbmFnZXIvY29uZmlnLyoudG1wbA=="
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
---
|
|
||||||
# Source: kube-prometheus-stack/templates/alertmanager/serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: alertmanager-kube-promethe-alertmanager
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack-alertmanager
|
|
||||||
app.kubernetes.io/name: kube-prometheus-stack-alertmanager
|
|
||||||
app.kubernetes.io/component: alertmanager
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: alertmanager
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "alertmanager"
|
|
||||||
heritage: "Helm"
|
|
||||||
automountServiceAccountToken: true
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
locals {
|
|
||||||
common-labels = {
|
|
||||||
"vynil.solidite.fr/owner-name" = var.instance
|
|
||||||
"vynil.solidite.fr/owner-namespace" = var.namespace
|
|
||||||
"vynil.solidite.fr/owner-category" = var.category
|
|
||||||
"vynil.solidite.fr/owner-component" = var.component
|
|
||||||
"app.kubernetes.io/managed-by" = "vynil"
|
|
||||||
"app.kubernetes.io/instance" = var.instance
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data "kustomization_overlay" "data" {
|
|
||||||
common_labels = local.common-labels
|
|
||||||
namespace = var.namespace
|
|
||||||
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
---
|
|
||||||
apiVersion: vinyl.solidite.fr/v1beta1
|
|
||||||
kind: Component
|
|
||||||
category: monitor
|
|
||||||
metadata:
|
|
||||||
name: alerts-containers
|
|
||||||
description: null
|
|
||||||
options:
|
|
||||||
useless:
|
|
||||||
default: true
|
|
||||||
examples:
|
|
||||||
- true
|
|
||||||
type: boolean
|
|
||||||
dependencies: []
|
|
||||||
providers:
|
|
||||||
kubernetes: true
|
|
||||||
authentik: null
|
|
||||||
kubectl: true
|
|
||||||
postgresql: null
|
|
||||||
restapi: null
|
|
||||||
http: null
|
|
||||||
gitea: null
|
|
||||||
tfaddtype: null
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
---
|
|
||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_cpu_usage_seconds_total.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-cpu-usage-seconds
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_cpu_usage_seconds_total
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
sum by (cluster, namespace, pod, container) (
|
|
||||||
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
|
||||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
|
||||||
1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
|
||||||
)
|
|
||||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_cache.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-memory-cache
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_memory_cache
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
|
||||||
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
|
|
||||||
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
|
||||||
)
|
|
||||||
record: node_namespace_pod_container:container_memory_cache
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_rss.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-memory-rss
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_memory_rss
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
|
||||||
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
|
|
||||||
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
|
||||||
)
|
|
||||||
record: node_namespace_pod_container:container_memory_rss
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_swap.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-memory-swap
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_memory_swap
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
|
||||||
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
|
|
||||||
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
|
||||||
)
|
|
||||||
record: node_namespace_pod_container:container_memory_swap
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_memory_working_set_bytes.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-memory-working-se
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_memory_working_set_bytes
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
|
||||||
* on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
|
|
||||||
max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
|
||||||
)
|
|
||||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.container_resource.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.container-resource
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.container_resource
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
|
||||||
group_left() max by (namespace, pod, cluster) (
|
|
||||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
|
||||||
)
|
|
||||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
|
|
||||||
- expr: |-
|
|
||||||
sum by (namespace, cluster) (
|
|
||||||
sum by (namespace, pod, cluster) (
|
|
||||||
max by (namespace, pod, container, cluster) (
|
|
||||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
|
|
||||||
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
|
||||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
record: namespace_memory:kube_pod_container_resource_requests:sum
|
|
||||||
- expr: |-
|
|
||||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
|
||||||
group_left() max by (namespace, pod, cluster) (
|
|
||||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
|
||||||
)
|
|
||||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
|
|
||||||
- expr: |-
|
|
||||||
sum by (namespace, cluster) (
|
|
||||||
sum by (namespace, pod, cluster) (
|
|
||||||
max by (namespace, pod, container, cluster) (
|
|
||||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
|
|
||||||
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
|
||||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
record: namespace_cpu:kube_pod_container_resource_requests:sum
|
|
||||||
- expr: |-
|
|
||||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
|
||||||
group_left() max by (namespace, pod, cluster) (
|
|
||||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
|
||||||
)
|
|
||||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
|
|
||||||
- expr: |-
|
|
||||||
sum by (namespace, cluster) (
|
|
||||||
sum by (namespace, pod, cluster) (
|
|
||||||
max by (namespace, pod, container, cluster) (
|
|
||||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
|
|
||||||
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
|
||||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
record: namespace_memory:kube_pod_container_resource_limits:sum
|
|
||||||
- expr: |-
|
|
||||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)
|
|
||||||
group_left() max by (namespace, pod, cluster) (
|
|
||||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
|
||||||
)
|
|
||||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
|
|
||||||
- expr: |-
|
|
||||||
sum by (namespace, cluster) (
|
|
||||||
sum by (namespace, pod, cluster) (
|
|
||||||
max by (namespace, pod, container, cluster) (
|
|
||||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
|
|
||||||
) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
|
|
||||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
locals {
|
|
||||||
common-labels = {
|
|
||||||
"vynil.solidite.fr/owner-name" = var.instance
|
|
||||||
"vynil.solidite.fr/owner-namespace" = var.namespace
|
|
||||||
"vynil.solidite.fr/owner-category" = var.category
|
|
||||||
"vynil.solidite.fr/owner-component" = var.component
|
|
||||||
"app.kubernetes.io/managed-by" = "vynil"
|
|
||||||
"app.kubernetes.io/instance" = var.instance
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data "kustomization_overlay" "data" {
|
|
||||||
common_labels = local.common-labels
|
|
||||||
namespace = var.namespace
|
|
||||||
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
|
|
||||||
}
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
---
|
|
||||||
apiVersion: vinyl.solidite.fr/v1beta1
|
|
||||||
kind: Component
|
|
||||||
category: monitor
|
|
||||||
metadata:
|
|
||||||
name: alerts-core
|
|
||||||
description: null
|
|
||||||
options:
|
|
||||||
useless:
|
|
||||||
default: true
|
|
||||||
examples:
|
|
||||||
- true
|
|
||||||
type: boolean
|
|
||||||
dependencies: []
|
|
||||||
providers:
|
|
||||||
kubernetes: true
|
|
||||||
authentik: null
|
|
||||||
kubectl: true
|
|
||||||
postgresql: null
|
|
||||||
restapi: null
|
|
||||||
http: null
|
|
||||||
gitea: null
|
|
||||||
tfaddtype: null
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
---
|
|
||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/config-reloaders.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-config-reloaders
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: config-reloaders
|
|
||||||
rules:
|
|
||||||
- alert: ConfigReloaderSidecarErrors
|
|
||||||
annotations:
|
|
||||||
description: 'Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
|
|
||||||
|
|
||||||
As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
|
|
||||||
summary: config-reloader sidecar has not had a successful reload for 10m
|
|
||||||
expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-general.rules
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: general.rules
|
|
||||||
rules:
|
|
||||||
- alert: TargetDown
|
|
||||||
annotations:
|
|
||||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
|
||||||
summary: One or more targets are unreachable.
|
|
||||||
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: Watchdog
|
|
||||||
annotations:
|
|
||||||
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
|
||||||
|
|
||||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
|
||||||
|
|
||||||
and always fire against a receiver. There are integrations with various notification
|
|
||||||
|
|
||||||
mechanisms that send a notification when this alert is not firing. For example the
|
|
||||||
|
|
||||||
"DeadMansSnitch" integration in PagerDuty.
|
|
||||||
|
|
||||||
'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
|
||||||
summary: An alert that should always be firing to certify that Alertmanager is working properly.
|
|
||||||
expr: vector(1)
|
|
||||||
labels:
|
|
||||||
severity: none
|
|
||||||
- alert: InfoInhibitor
|
|
||||||
annotations:
|
|
||||||
description: 'This is an alert that is used to inhibit info alerts.
|
|
||||||
|
|
||||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
|
||||||
|
|
||||||
other alerts.
|
|
||||||
|
|
||||||
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
|
|
||||||
|
|
||||||
severity of ''warning'' or ''critical'' starts firing on the same namespace.
|
|
||||||
|
|
||||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
|
||||||
|
|
||||||
'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
|
||||||
summary: Info-level alert inhibition.
|
|
||||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
|
||||||
labels:
|
|
||||||
severity: none
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/k8s.rules.pod_owner.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-k8s.rules.pod-owner
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: k8s.rules.pod_owner
|
|
||||||
rules:
|
|
||||||
- expr: |-
|
|
||||||
max by (cluster, namespace, workload, pod) (
|
|
||||||
label_replace(
|
|
||||||
label_replace(
|
|
||||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
|
||||||
"replicaset", "$1", "owner_name", "(.*)"
|
|
||||||
) * on (replicaset, namespace) group_left(owner_name) topk by (replicaset, namespace) (
|
|
||||||
1, max by (replicaset, namespace, owner_name) (
|
|
||||||
kube_replicaset_owner{job="kube-state-metrics"}
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"workload", "$1", "owner_name", "(.*)"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
workload_type: deployment
|
|
||||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
|
||||||
- expr: |-
|
|
||||||
max by (cluster, namespace, workload, pod) (
|
|
||||||
label_replace(
|
|
||||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
|
||||||
"workload", "$1", "owner_name", "(.*)"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
workload_type: daemonset
|
|
||||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
|
||||||
- expr: |-
|
|
||||||
max by (cluster, namespace, workload, pod) (
|
|
||||||
label_replace(
|
|
||||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
|
||||||
"workload", "$1", "owner_name", "(.*)"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
workload_type: statefulset
|
|
||||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
|
||||||
- expr: |-
|
|
||||||
max by (cluster, namespace, workload, pod) (
|
|
||||||
label_replace(
|
|
||||||
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
|
|
||||||
"workload", "$1", "owner_name", "(.*)"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
labels:
|
|
||||||
workload_type: job
|
|
||||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-general.rules.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kube-prometheus-general.rules
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kube-prometheus-general.rules
|
|
||||||
rules:
|
|
||||||
- expr: count without(instance, pod, node) (up == 1)
|
|
||||||
record: count:up1
|
|
||||||
- expr: count without(instance, pod, node) (up == 0)
|
|
||||||
record: count:up0
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-prometheus-node-recording.rules.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kube-prometheus-node-recording.rules
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kube-prometheus-node-recording.rules
|
|
||||||
rules:
|
|
||||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
|
||||||
record: instance:node_cpu:rate:sum
|
|
||||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
|
||||||
record: instance:node_network_receive_bytes:rate:sum
|
|
||||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
|
||||||
record: instance:node_network_transmit_bytes:rate:sum
|
|
||||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
|
||||||
record: instance:node_cpu:ratio
|
|
||||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
|
||||||
record: cluster:node_cpu:sum_rate5m
|
|
||||||
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
|
||||||
record: cluster:node_cpu:ratio
|
|
||||||
@@ -1,258 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kubernetes-apps
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kubernetes-apps
|
|
||||||
rules:
|
|
||||||
- alert: KubePodCrashLooping
|
|
||||||
annotations:
|
|
||||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
|
||||||
summary: Pod is crash looping.
|
|
||||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubePodNotReady
|
|
||||||
annotations:
|
|
||||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
|
||||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
|
||||||
expr: |-
|
|
||||||
sum by (namespace, pod, cluster) (
|
|
||||||
max by (namespace, pod, cluster) (
|
|
||||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
|
||||||
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
|
||||||
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
|
||||||
)
|
|
||||||
) > 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
|
||||||
annotations:
|
|
||||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
|
||||||
summary: Deployment generation mismatch due to possible roll-back
|
|
||||||
expr: |-
|
|
||||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
|
||||||
summary: Deployment has not matched the expected number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
>
|
|
||||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) and (
|
|
||||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDeploymentRolloutStuck
|
|
||||||
annotations:
|
|
||||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
|
||||||
summary: Deployment rollout is not progressing.
|
|
||||||
expr: |-
|
|
||||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!= 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
|
||||||
summary: StatefulSet has not matched the expected number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) and (
|
|
||||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetGenerationMismatch
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
|
||||||
summary: StatefulSet generation mismatch due to possible roll-back
|
|
||||||
expr: |-
|
|
||||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
|
||||||
summary: StatefulSet update has not been rolled out.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
max without (revision) (
|
|
||||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
unless
|
|
||||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
)
|
|
||||||
*
|
|
||||||
(
|
|
||||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
)
|
|
||||||
) and (
|
|
||||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDaemonSetRolloutStuck
|
|
||||||
annotations:
|
|
||||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
|
||||||
summary: DaemonSet rollout is stuck.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
(
|
|
||||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) or (
|
|
||||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
0
|
|
||||||
) or (
|
|
||||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) or (
|
|
||||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
)
|
|
||||||
) and (
|
|
||||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeContainerWaiting
|
|
||||||
annotations:
|
|
||||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
|
||||||
summary: Pod container waiting longer than 1 hour
|
|
||||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
|
|
||||||
for: 1h
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDaemonSetNotScheduled
|
|
||||||
annotations:
|
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
|
||||||
summary: DaemonSet pods are not scheduled.
|
|
||||||
expr: |-
|
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
-
|
|
||||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDaemonSetMisScheduled
|
|
||||||
annotations:
|
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
|
||||||
summary: DaemonSet pods are misscheduled.
|
|
||||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeJobNotCompleted
|
|
||||||
annotations:
|
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
|
||||||
summary: Job did not complete in time
|
|
||||||
expr: |-
|
|
||||||
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
and
|
|
||||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeJobFailed
|
|
||||||
annotations:
|
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
|
||||||
summary: Job failed to complete.
|
|
||||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeHpaReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
|
||||||
summary: HPA has not matched desired number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
>
|
|
||||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
<
|
|
||||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeHpaMaxedOut
|
|
||||||
annotations:
|
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
|
||||||
summary: HPA is running at max replicas
|
|
||||||
expr: |-
|
|
||||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
==
|
|
||||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
@@ -1,122 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kubernetes-resources
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kubernetes-resources
|
|
||||||
rules:
|
|
||||||
- alert: KubeCPUOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
|
||||||
expr: |-
|
|
||||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
|
||||||
and
|
|
||||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeMemoryOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
|
||||||
expr: |-
|
|
||||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
|
||||||
and
|
|
||||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
|
||||||
for: 10m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeCPUQuotaOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
|
||||||
expr: |-
|
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
|
||||||
/
|
|
||||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
|
||||||
> 1.5
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeMemoryQuotaOvercommit
|
|
||||||
annotations:
|
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
|
||||||
expr: |-
|
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
|
||||||
/
|
|
||||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
|
||||||
> 1.5
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeQuotaAlmostFull
|
|
||||||
annotations:
|
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
|
||||||
summary: Namespace quota is going to be full.
|
|
||||||
expr: |-
|
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
|
||||||
/ ignoring(instance, job, type)
|
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
|
||||||
> 0.9 < 1
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: info
|
|
||||||
- alert: KubeQuotaFullyUsed
|
|
||||||
annotations:
|
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
|
||||||
summary: Namespace quota is fully used.
|
|
||||||
expr: |-
|
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
|
||||||
/ ignoring(instance, job, type)
|
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
|
||||||
== 1
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: info
|
|
||||||
- alert: KubeQuotaExceeded
|
|
||||||
annotations:
|
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
|
||||||
summary: Namespace quota has exceeded the limits.
|
|
||||||
expr: |-
|
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
|
||||||
/ ignoring(instance, job, type)
|
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
|
||||||
> 1
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: CPUThrottlingHigh
|
|
||||||
annotations:
|
|
||||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
|
||||||
summary: Processes experience elevated CPU throttling.
|
|
||||||
expr: |-
|
|
||||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
|
|
||||||
/
|
|
||||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
|
|
||||||
> ( 25 / 100 )
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: info
|
|
||||||
@@ -1,113 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-storage.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kubernetes-storage
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kubernetes-storage
|
|
||||||
rules:
|
|
||||||
- alert: KubePersistentVolumeFillingUp
|
|
||||||
annotations:
|
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is only {{ $value | humanizePercentage }} free.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
|
||||||
summary: PersistentVolume is filling up.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
/
|
|
||||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
) < 0.03
|
|
||||||
and
|
|
||||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubePersistentVolumeFillingUp
|
|
||||||
annotations:
|
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
|
||||||
summary: PersistentVolume is filling up.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
/
|
|
||||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
) < 0.15
|
|
||||||
and
|
|
||||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
|
||||||
and
|
|
||||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
|
||||||
for: 1h
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
|
||||||
annotations:
|
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} only has {{ $value | humanizePercentage }} free inodes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
|
||||||
summary: PersistentVolumeInodes are filling up.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
/
|
|
||||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
) < 0.03
|
|
||||||
and
|
|
||||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
|
||||||
annotations:
|
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} on Cluster {{ $labels.cluster }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
|
||||||
summary: PersistentVolumeInodes are filling up.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
/
|
|
||||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
|
||||||
) < 0.15
|
|
||||||
and
|
|
||||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
|
||||||
and
|
|
||||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
|
||||||
for: 1h
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubePersistentVolumeErrors
|
|
||||||
annotations:
|
|
||||||
description: The persistent volume {{ $labels.persistentvolume }} on Cluster {{ $labels.cluster }} has status {{ $labels.phase }}.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
|
||||||
summary: PersistentVolume is having issues with provisioning.
|
|
||||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-apiserver.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kubernetes-system-apiserver
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kubernetes-system-apiserver
|
|
||||||
rules:
|
|
||||||
- alert: KubeClientCertificateExpiration
|
|
||||||
annotations:
|
|
||||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
|
||||||
summary: Client certificate is about to expire.
|
|
||||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeClientCertificateExpiration
|
|
||||||
annotations:
|
|
||||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
|
||||||
summary: Client certificate is about to expire.
|
|
||||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubeAggregatedAPIErrors
|
|
||||||
annotations:
|
|
||||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
|
|
||||||
summary: Kubernetes aggregated API has reported errors.
|
|
||||||
expr: sum by (name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeAggregatedAPIDown
|
|
||||||
annotations:
|
|
||||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
|
|
||||||
summary: Kubernetes aggregated API is down.
|
|
||||||
expr: (1 - max by (name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeAPITerminatedRequests
|
|
||||||
annotations:
|
|
||||||
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
|
|
||||||
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
|
|
||||||
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system-kubelet.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: prometheus-community-kube-kubernetes-system-kubelet
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: prometheus-community
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "prometheus-community"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kubernetes-system-kubelet
|
|
||||||
rules:
|
|
||||||
- alert: KubeNodeNotReady
|
|
||||||
annotations:
|
|
||||||
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
|
|
||||||
summary: Node is not ready.
|
|
||||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeNodeUnreachable
|
|
||||||
annotations:
|
|
||||||
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
|
|
||||||
summary: Node is unreachable.
|
|
||||||
      expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
        summary: Kubelet is running at capacity.
      expr: |-
        count by (cluster, node) (
          (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
        )
        /
        max by (cluster, node) (
          kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
        ) > 0.95
      for: 15m
      labels:
        severity: info
    - alert: KubeNodeReadinessFlapping
      annotations:
        description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
        summary: Node readiness status is flapping.
      expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
      for: 15m
      labels:
        severity: warning
    - alert: KubeletPlegDurationHigh
      annotations:
        description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
        summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
      expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
      for: 5m
      labels:
        severity: warning
    - alert: KubeletPodStartUpLatencyHigh
      annotations:
        description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
        summary: Kubelet Pod startup latency is too high.
      expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on (cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
      for: 15m
      labels:
        severity: warning
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 604800
      labels:
        severity: warning
    - alert: KubeletClientCertificateExpiration
      annotations:
        description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
        summary: Kubelet client certificate is about to expire.
      expr: kubelet_certificate_manager_client_ttl_seconds < 86400
      labels:
        severity: critical
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 604800
      labels:
        severity: warning
    - alert: KubeletServerCertificateExpiration
      annotations:
        description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
        summary: Kubelet server certificate is about to expire.
      expr: kubelet_certificate_manager_server_ttl_seconds < 86400
      labels:
        severity: critical
    - alert: KubeletClientCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes).
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
        summary: Kubelet has failed to renew its client certificate.
      expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeletServerCertificateRenewalErrors
      annotations:
        description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes).
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
        summary: Kubelet has failed to renew its server certificate.
      expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
      for: 15m
      labels:
        severity: warning
    - alert: KubeletDown
      annotations:
        description: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
      for: 15m
      labels:
        severity: critical
@@ -1,42 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-system.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-kubernetes-system
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack

    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.1.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.1.0
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
  - name: kubernetes-system
    rules:
    - alert: KubeVersionMismatch
      annotations:
        description: There are {{ $value }} different semantic versions of Kubernetes components running.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
        summary: Different semantic versions of Kubernetes components running.
      expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
        summary: Kubernetes API server client is experiencing errors.
      expr: |-
        (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
        /
        sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
        > 0.01
      for: 15m
      labels:
        severity: warning
@@ -1,16 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}
@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: dashboards-cluster
description: null
options:
  useless:
    default: true
    examples:
    - true
    type: boolean
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,16 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}
@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: dashboards-minimal
description: null
options:
  useless:
    default: true
    examples:
    - true
    type: boolean
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,16 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}
@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: dashboards-namespace
description: null
options:
  useless:
    default: true
    examples:
    - true
    type: boolean
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,16 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
}
@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: dashboards-workload
description: null
options:
  useless:
    default: true
    examples:
    - true
    type: boolean
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,298 +0,0 @@
|
|||||||
# Source: grafana/templates/deployment.yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
revisionHistoryLimit: 10
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
strategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
annotations:
|
|
||||||
checksum/config: 0e9cbd0ea8e24e32f7dfca5bab17a2ba05652642f0a09a4882833ae88e4cc4a3
|
|
||||||
checksum/sc-dashboard-provider-config: 593c0a8778b83f11fe80ccb21dfb20bc46705e2be3178df1dc4c89d164c8cd9c
|
|
||||||
kubectl.kubernetes.io/default-container: grafana
|
|
||||||
spec:
|
|
||||||
|
|
||||||
serviceAccountName: grafana
|
|
||||||
automountServiceAccountToken: true
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 472
|
|
||||||
runAsGroup: 472
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 472
|
|
||||||
initContainers:
|
|
||||||
- name: init-chown-data
|
|
||||||
image: "docker.io/library/busybox:1.31.1"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
securityContext:
|
|
||||||
capabilities:
|
|
||||||
add:
|
|
||||||
- CHOWN
|
|
||||||
runAsNonRoot: false
|
|
||||||
runAsUser: 0
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
command:
|
|
||||||
- chown
|
|
||||||
- -R
|
|
||||||
- 472:472
|
|
||||||
- /var/lib/grafana
|
|
||||||
volumeMounts:
|
|
||||||
- name: storage
|
|
||||||
mountPath: "/var/lib/grafana"
|
|
||||||
enableServiceLinks: true
|
|
||||||
containers:
|
|
||||||
- name: grafana-sc-dashboard
|
|
||||||
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
env:
|
|
||||||
- name: METHOD
|
|
||||||
value: WATCH
|
|
||||||
- name: LABEL
|
|
||||||
value: "grafana_dashboard"
|
|
||||||
- name: FOLDER
|
|
||||||
value: "/tmp/dashboards"
|
|
||||||
- name: RESOURCE
|
|
||||||
value: "both"
|
|
||||||
- name: REQ_USERNAME
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: username
|
|
||||||
- name: REQ_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: password
|
|
||||||
- name: REQ_URL
|
|
||||||
value: http://localhost:3000/api/admin/provisioning/dashboards/reload
|
|
||||||
- name: REQ_METHOD
|
|
||||||
value: POST
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
volumeMounts:
|
|
||||||
- name: sc-dashboard-volume
|
|
||||||
mountPath: "/tmp/dashboards"
|
|
||||||
- name: grafana-sc-datasources
|
|
||||||
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
env:
|
|
||||||
- name: METHOD
|
|
||||||
value: WATCH
|
|
||||||
- name: LABEL
|
|
||||||
value: "grafana_datasource"
|
|
||||||
- name: FOLDER
|
|
||||||
value: "/etc/grafana/provisioning/datasources"
|
|
||||||
- name: RESOURCE
|
|
||||||
value: "both"
|
|
||||||
- name: REQ_USERNAME
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: username
|
|
||||||
- name: REQ_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: password
|
|
||||||
- name: REQ_URL
|
|
||||||
value: http://localhost:3000/api/admin/provisioning/datasources/reload
|
|
||||||
- name: REQ_METHOD
|
|
||||||
value: POST
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
volumeMounts:
|
|
||||||
- name: sc-datasources-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/datasources"
|
|
||||||
- name: grafana-sc-notifiers
|
|
||||||
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
env:
|
|
||||||
- name: METHOD
|
|
||||||
value: WATCH
|
|
||||||
- name: LABEL
|
|
||||||
value: "grafana_notifier"
|
|
||||||
- name: FOLDER
|
|
||||||
value: "/etc/grafana/provisioning/notifiers"
|
|
||||||
- name: RESOURCE
|
|
||||||
value: "both"
|
|
||||||
- name: REQ_USERNAME
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: username
|
|
||||||
- name: REQ_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: password
|
|
||||||
- name: REQ_URL
|
|
||||||
value: http://localhost:3000/api/admin/provisioning/notifications/reload
|
|
||||||
- name: REQ_METHOD
|
|
||||||
value: POST
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
volumeMounts:
|
|
||||||
- name: sc-notifiers-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/notifiers"
|
|
||||||
- name: grafana-sc-plugins
|
|
||||||
image: "quay.io/kiwigrid/k8s-sidecar:1.25.2"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
env:
|
|
||||||
- name: METHOD
|
|
||||||
value: WATCH
|
|
||||||
- name: LABEL
|
|
||||||
value: "grafana_plugin"
|
|
||||||
- name: FOLDER
|
|
||||||
value: "/etc/grafana/provisioning/plugins"
|
|
||||||
- name: RESOURCE
|
|
||||||
value: "both"
|
|
||||||
- name: REQ_USERNAME
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: username
|
|
||||||
- name: REQ_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: password
|
|
||||||
- name: REQ_URL
|
|
||||||
value: http://localhost:3000/api/admin/provisioning/plugins/reload
|
|
||||||
- name: REQ_METHOD
|
|
||||||
value: POST
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
volumeMounts:
|
|
||||||
- name: sc-plugins-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/plugins"
|
|
||||||
- name: grafana
|
|
||||||
image: "docker.io/grafana/grafana:10.2.3"
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: "/etc/grafana/grafana.ini"
|
|
||||||
subPath: grafana.ini
|
|
||||||
- name: storage
|
|
||||||
mountPath: "/var/lib/grafana"
|
|
||||||
- name: sc-dashboard-volume
|
|
||||||
mountPath: "/tmp/dashboards"
|
|
||||||
- name: sc-dashboard-provider
|
|
||||||
mountPath: "/etc/grafana/provisioning/dashboards/sc-dashboardproviders.yaml"
|
|
||||||
subPath: provider.yaml
|
|
||||||
- name: sc-datasources-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/datasources"
|
|
||||||
- name: sc-plugins-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/plugins"
|
|
||||||
- name: sc-notifiers-volume
|
|
||||||
mountPath: "/etc/grafana/provisioning/notifiers"
|
|
||||||
ports:
|
|
||||||
- name: grafana
|
|
||||||
containerPort: 3000
|
|
||||||
protocol: TCP
|
|
||||||
- name: gossip-tcp
|
|
||||||
containerPort: 9094
|
|
||||||
protocol: TCP
|
|
||||||
- name: gossip-udp
|
|
||||||
containerPort: 9094
|
|
||||||
protocol: UDP
|
|
||||||
env:
|
|
||||||
- name: POD_IP
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: status.podIP
|
|
||||||
- name: GF_SECURITY_ADMIN_USER
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: username
|
|
||||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: grafana-admin-user
|
|
||||||
key: password
|
|
||||||
- name: GF_PATHS_DATA
|
|
||||||
value: /var/lib/grafana/
|
|
||||||
- name: GF_PATHS_LOGS
|
|
||||||
value: /var/log/grafana
|
|
||||||
- name: GF_PATHS_PLUGINS
|
|
||||||
value: /var/lib/grafana/plugins
|
|
||||||
- name: GF_PATHS_PROVISIONING
|
|
||||||
value: /etc/grafana/provisioning
|
|
||||||
livenessProbe:
|
|
||||||
failureThreshold: 10
|
|
||||||
httpGet:
|
|
||||||
path: /api/health
|
|
||||||
port: 3000
|
|
||||||
initialDelaySeconds: 60
|
|
||||||
timeoutSeconds: 30
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /api/health
|
|
||||||
port: 3000
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: grafana
|
|
||||||
- name: storage
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: grafana
|
|
||||||
- name: sc-dashboard-volume
|
|
||||||
emptyDir:
|
|
||||||
{}
|
|
||||||
- name: sc-dashboard-provider
|
|
||||||
configMap:
|
|
||||||
name: grafana-config-dashboards
|
|
||||||
- name: sc-datasources-volume
|
|
||||||
emptyDir:
|
|
||||||
{}
|
|
||||||
- name: sc-plugins-volume
|
|
||||||
emptyDir:
|
|
||||||
{}
|
|
||||||
- name: sc-notifiers-volume
|
|
||||||
emptyDir:
|
|
||||||
{}
|
|
||||||
@@ -1,43 +0,0 @@
resource "kubernetes_config_map_v1" "config" {
  metadata {
    name = "grafana"
    namespace = var.namespace
    labels = local.common-labels
  }
  data = {
    "grafana.ini" = <<-EOF
      [analytics]
      check_for_updates = true
      [grafana_net]
      url = https://grafana.net
      [log]
      mode = console
      [paths]
      data = /var/lib/grafana/
      logs = /var/log/grafana
      plugins = /var/lib/grafana/plugins
      provisioning = /etc/grafana/provisioning
      [server]
      domain = ''
      root_url = 'https://${local.dns_name}/'
      [users]
      auto_assign_org = true
      auto_assign_org_id = 1
      [auth]
      oauth_allow_insecure_email_lookup = true
      signout_redirect_url = '${module.oauth2.sso_signout_url}'
      oauth_auto_login = true
      [auth.generic_oauth]
      enabled = true
      name = vynil
      scopes = openid profile email
      ${var.issuer=="letsencrypt-prod"?";":""}tls_client_ca = /etc/local-certs/ca.crt
      client_id = '${module.oauth2.client_id}'
      client_secret = '${module.oauth2.client_secret}'
      auth_url = '${module.oauth2.sso_authorize_url}'
      api_url = '${module.oauth2.sso_userinfo_url}'
      token_url = '${module.oauth2.sso_token_url}'
      role_attribute_path = contains(groups, '${module.application.main_group}-admin') && 'Admin' || contains(groups, '${module.application.main_group}') && 'Editor' || 'Viewer'
    EOF
  }
}
@@ -1,112 +0,0 @@
locals {
  authentik_url = "http://authentik.${var.domain}-auth.svc"
  authentik_token = data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
  pvc_spec = merge({
    "accessModes" = [var.storage.volume.accessMode]
    "volumeMode" = var.storage.volume.type
    "resources" = {
      "requests" = {
        "storage" = "${var.storage.volume.size}"
      }
    }
  }, var.storage.volume.class != "" ?{
    "storageClassName" = var.storage.volume.class
  }:{})
}


data "kubernetes_secret_v1" "authentik" {
  metadata {
    name = "authentik"
    namespace = "${var.domain}-auth"
  }
}

data "kubernetes_ingress_v1" "authentik" {
  metadata {
    name = "authentik"
    namespace = "${var.domain}-auth"
  }
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
  images {
    name = "docker.io/grafana/grafana"
    new_name = "${var.images.grafana.registry}/${var.images.grafana.repository}"
    new_tag = "${var.images.grafana.tag}"
  }
  images {
    name = "docker.io/library/busybox"
    new_name = "${var.images.busybox.registry}/${var.images.busybox.repository}"
    new_tag = "${var.images.busybox.tag}"
  }
  images {
    name = "quay.io/kiwigrid/k8s-sidecar"
    new_name = "${var.images.sidecar.registry}/${var.images.sidecar.repository}"
    new_tag = "${var.images.sidecar.tag}"
  }
  patches {
    target {
      kind = "PersistentVolumeClaim"
      name = "grafana"
    }
    patch = <<-EOF
      kind: PersistentVolumeClaim
      apiVersion: v1
      metadata:
        name: grafana
        annotations:
          k8up.io/backup: "true"
      spec: ${jsonencode(local.pvc_spec)}
    EOF
  }
  patches {
    target {
      kind = "ServiceMonitor"
      name = "grafana"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/namespaceSelector/matchNames/0
        value: "${var.namespace}"
    EOF
  }
  patches {
    target {
      kind = "Deployment"
      name = "grafana"
    }
    patch = <<-EOF
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: grafana
        annotations:
          configmap.reloader.stakater.com/reload: "grafana"
      spec:
        template:
          spec:
            containers:
            - name: grafana
              volumeMounts:
              - name: local-certs
                mountPath: "/etc/local-certs"
            volumes:
            - name: local-certs
              secret:
                secretName: "${var.instance}-cert"
                defaultMode: 0444
    EOF
  }
}
@@ -1,28 +0,0 @@
|
|||||||
# Source: grafana/templates/servicemonitor.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: ServiceMonitor
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
spec:
|
|
||||||
endpoints:
|
|
||||||
- port: service
|
|
||||||
interval: 30s
|
|
||||||
scrapeTimeout: 30s
|
|
||||||
honorLabels: true
|
|
||||||
path: /metrics
|
|
||||||
scheme: http
|
|
||||||
jobLabel: "grafana"
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
namespaceSelector:
|
|
||||||
matchNames:
|
|
||||||
- vynil-monitor
|
|
||||||
@@ -1,72 +0,0 @@
locals {
  dns_name = "${var.sub_domain}.${var.domain_name}"
  dns_names = [local.dns_name]
  app_name = var.component == var.instance ? var.instance : format("%s-%s", var.component, var.instance)
  icon = "public/img/grafana_icon.svg"
  request_headers = {
    "Content-Type" = "application/json"
    Authorization = "Bearer ${data.kubernetes_secret_v1.authentik.data["AUTHENTIK_BOOTSTRAP_TOKEN"]}"
  }
  service = {
    "name" = "grafana"
    "port" = {
      "number" = 80
    }
  }
}

module "ingress" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//ingress"
  component = ""
  instance = var.instance
  namespace = var.namespace
  issuer = var.issuer
  ingress_class = var.ingress_class
  labels = local.common-labels
  dns_names = local.dns_names
  middlewares = []
  services = [local.service]
  providers = {
    kubectl = kubectl
  }
}

module "application" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//application"
  component = var.component
  instance = var.instance
  app_group = var.app_group
  dns_name = local.dns_name
  icon = local.icon
  sub_groups = ["admin"]
  protocol_provider = module.oauth2.provider-id
  providers = {
    authentik = authentik
  }
}

module "oauth2" {
  source = "git::https://git.solidite.fr/vynil/kydah-modules.git//oauth2"
  component = var.component
  instance = var.instance
  namespace = var.namespace
  domain = var.domain
  labels = local.common-labels
  dns_name = local.dns_name
  redirect_path = "login/generic_oauth"
  providers = {
    kubernetes = kubernetes
    kubectl = kubectl
    authentik = authentik
  }
}

provider "restapi" {
  uri = "http://authentik.${var.domain}-auth.svc/api/v3/"
  headers = local.request_headers
  create_method = "PATCH"
  update_method = "PATCH"
  destroy_method = "PATCH"
  write_returns_object = true
  id_attribute = "name"
}
@@ -1,20 +0,0 @@
|
|||||||
# Source: grafana/templates/rolebinding.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: RoleBinding
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: Role
|
|
||||||
name: grafana
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
# Source: grafana/templates/role.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: Role
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""] # "" indicates the core API group
|
|
||||||
resources: ["configmaps", "secrets"]
|
|
||||||
verbs: ["get", "watch", "list"]
|
|
||||||
@@ -1,19 +0,0 @@

resource "kubectl_manifest" "grafana_secret" {
  ignore_fields = ["metadata.annotations"]
  yaml_body = <<-EOF
    apiVersion: "secretgenerator.mittwald.de/v1alpha1"
    kind: "StringSecret"
    metadata:
      name: "grafana-admin-user"
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.common-labels)}
    spec:
      forceRegenerate: false
      data:
        username: "${var.admin_name}"
      fields:
      - fieldName: "password"
        length: "32"
  EOF
}
@@ -1,26 +0,0 @@
|
|||||||
# Source: grafana/templates/configmap-dashboard-provider.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
name: grafana-config-dashboards
|
|
||||||
namespace: vynil-monitor
|
|
||||||
data:
|
|
||||||
provider.yaml: |-
|
|
||||||
apiVersion: 1
|
|
||||||
providers:
|
|
||||||
- name: 'sidecarProvider'
|
|
||||||
orgId: 1
|
|
||||||
folder: ''
|
|
||||||
type: file
|
|
||||||
disableDeletion: false
|
|
||||||
allowUiUpdates: false
|
|
||||||
updateIntervalSeconds: 30
|
|
||||||
options:
|
|
||||||
foldersFromFilesStructure: false
|
|
||||||
path: /tmp/dashboards
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
# Source: grafana/templates/pvc.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
finalizers:
|
|
||||||
- kubernetes.io/pvc-protection
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- "ReadWriteOnce"
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: "10Gi"
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
---
|
|
||||||
# Source: grafana/templates/serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
# Source: grafana/templates/service.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: grafana
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: grafana-7.2.4
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
app.kubernetes.io/version: "10.2.3"
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- name: service
|
|
||||||
port: 80
|
|
||||||
protocol: TCP
|
|
||||||
targetPort: 3000
|
|
||||||
selector:
|
|
||||||
app.kubernetes.io/name: grafana
|
|
||||||
app.kubernetes.io/instance: grafana
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/deployment.yaml
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
replicas: 1
|
|
||||||
strategy:
|
|
||||||
type: RollingUpdate
|
|
||||||
revisionHistoryLimit: 10
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
spec:
|
|
||||||
hostNetwork: false
|
|
||||||
serviceAccountName: kube-state-metrics
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 65534
|
|
||||||
runAsGroup: 65534
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 65534
|
|
||||||
seccompProfile:
|
|
||||||
type: RuntimeDefault
|
|
||||||
containers:
|
|
||||||
- name: kube-state-metrics
|
|
||||||
args:
|
|
||||||
- --port=8080
|
|
||||||
- --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments
|
|
||||||
imagePullPolicy: IfNotPresent
|
|
||||||
image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.1
|
|
||||||
ports:
|
|
||||||
- containerPort: 8080
|
|
||||||
name: "http"
|
|
||||||
livenessProbe:
|
|
||||||
failureThreshold: 3
|
|
||||||
httpGet:
|
|
||||||
httpHeaders:
|
|
||||||
path: /healthz
|
|
||||||
port: 8080
|
|
||||||
scheme: HTTP
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
successThreshold: 1
|
|
||||||
timeoutSeconds: 5
|
|
||||||
readinessProbe:
|
|
||||||
failureThreshold: 3
|
|
||||||
httpGet:
|
|
||||||
httpHeaders:
|
|
||||||
path: /
|
|
||||||
port: 8080
|
|
||||||
scheme: HTTP
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
successThreshold: 1
|
|
||||||
timeoutSeconds: 5
|
|
||||||
securityContext:
|
|
||||||
allowPrivilegeEscalation: false
|
|
||||||
capabilities:
|
|
||||||
drop:
|
|
||||||
- ALL
|
|
||||||
@@ -1,49 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
  images {
    name = "registry.k8s.io/kube-state-metrics/kube-state-metrics"
    new_name = "${var.images.kube-state-metrics.registry}/${var.images.kube-state-metrics.repository}"
    new_tag = "${var.images.kube-state-metrics.tag}"
  }
  patches {
    target {
      kind = "ServiceMonitor"
      name = "kube-state-metrics"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/selector/matchLabels/app.kubernetes.io~1instance
        value: "${var.instance}"
    EOF
  }
}

data "kustomization_overlay" "data_no_ns" {
  common_labels = local.common-labels
  resources = [for file in fileset(path.module, "*.yaml"): file if length(regexall("ClusterRole",file))>0]
  patches {
    target {
      kind = "ClusterRoleBinding"
      name = "kube-state-metrics"
    }
    patch = local.rb-patch
  }
}
@@ -1,57 +0,0 @@
|
|||||||
---
|
|
||||||
apiVersion: vinyl.solidite.fr/v1beta1
|
|
||||||
kind: Component
|
|
||||||
category: monitor
|
|
||||||
metadata:
|
|
||||||
name: kube-state-metrics
|
|
||||||
description: null
|
|
||||||
options:
|
|
||||||
images:
|
|
||||||
default:
|
|
||||||
kube-state-metrics:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: registry.k8s.io
|
|
||||||
repository: kube-state-metrics/kube-state-metrics
|
|
||||||
tag: v2.10.1
|
|
||||||
examples:
|
|
||||||
- kube-state-metrics:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: registry.k8s.io
|
|
||||||
repository: kube-state-metrics/kube-state-metrics
|
|
||||||
tag: v2.10.1
|
|
||||||
properties:
|
|
||||||
kube-state-metrics:
|
|
||||||
default:
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
registry: registry.k8s.io
|
|
||||||
repository: kube-state-metrics/kube-state-metrics
|
|
||||||
tag: v2.10.1
|
|
||||||
properties:
|
|
||||||
pullPolicy:
|
|
||||||
default: IfNotPresent
|
|
||||||
enum:
|
|
||||||
- Always
|
|
||||||
- Never
|
|
||||||
- IfNotPresent
|
|
||||||
type: string
|
|
||||||
registry:
|
|
||||||
default: registry.k8s.io
|
|
||||||
type: string
|
|
||||||
repository:
|
|
||||||
default: kube-state-metrics/kube-state-metrics
|
|
||||||
type: string
|
|
||||||
tag:
|
|
||||||
default: v2.10.1
|
|
||||||
type: string
|
|
||||||
type: object
|
|
||||||
type: object
|
|
||||||
dependencies: []
|
|
||||||
providers:
|
|
||||||
kubernetes: true
|
|
||||||
authentik: null
|
|
||||||
kubectl: true
|
|
||||||
postgresql: null
|
|
||||||
restapi: null
|
|
||||||
http: null
|
|
||||||
gitea: null
|
|
||||||
tfaddtype: null
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/kube-state-metrics.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: PrometheusRule
|
|
||||||
metadata:
|
|
||||||
name: kube-state-metrics-kube-pr-kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
app: kube-prometheus-stack
|
|
||||||
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "56.1.0"
|
|
||||||
app.kubernetes.io/part-of: kube-prometheus-stack
|
|
||||||
chart: kube-prometheus-stack-56.1.0
|
|
||||||
release: "kube-state-metrics"
|
|
||||||
heritage: "Helm"
|
|
||||||
spec:
|
|
||||||
groups:
|
|
||||||
- name: kube-state-metrics
|
|
||||||
rules:
|
|
||||||
- alert: KubeStateMetricsListErrors
|
|
||||||
annotations:
|
|
||||||
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
|
|
||||||
summary: kube-state-metrics is experiencing errors in list operations.
|
|
||||||
expr: |-
|
|
||||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
|
||||||
/
|
|
||||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
|
|
||||||
> 0.01
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubeStateMetricsWatchErrors
|
|
||||||
annotations:
|
|
||||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
|
|
||||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
|
||||||
expr: |-
|
|
||||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
|
||||||
/
|
|
||||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
|
|
||||||
> 0.01
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubeStateMetricsShardingMismatch
|
|
||||||
annotations:
|
|
||||||
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
|
|
||||||
summary: kube-state-metrics sharding is misconfigured.
|
|
||||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
- alert: KubeStateMetricsShardsMissing
|
|
||||||
annotations:
|
|
||||||
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
|
||||||
summary: kube-state-metrics shards are missing.
|
|
||||||
expr: |-
|
|
||||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
|
|
||||||
-
|
|
||||||
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
|
|
||||||
!= 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/servicemonitor.yaml
|
|
||||||
apiVersion: monitoring.coreos.com/v1
|
|
||||||
kind: ServiceMonitor
|
|
||||||
metadata:
|
|
||||||
name: kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
spec:
|
|
||||||
jobLabel: app.kubernetes.io/name
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
endpoints:
|
|
||||||
- port: http
|
|
||||||
honorLabels: true
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/clusterrolebinding.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
name: kube-state-metrics
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: ClusterRole
|
|
||||||
name: kube-state-metrics
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
@@ -1,155 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/role.yaml
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
name: kube-state-metrics
|
|
||||||
rules:
|
|
||||||
|
|
||||||
- apiGroups: ["certificates.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- certificatesigningrequests
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- configmaps
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["batch"]
|
|
||||||
resources:
|
|
||||||
- cronjobs
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["extensions", "apps"]
|
|
||||||
resources:
|
|
||||||
- daemonsets
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["extensions", "apps"]
|
|
||||||
resources:
|
|
||||||
- deployments
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- endpoints
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["autoscaling"]
|
|
||||||
resources:
|
|
||||||
- horizontalpodautoscalers
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["extensions", "networking.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- ingresses
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["batch"]
|
|
||||||
resources:
|
|
||||||
- jobs
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["coordination.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- leases
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- limitranges
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["admissionregistration.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- mutatingwebhookconfigurations
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- namespaces
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["networking.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- networkpolicies
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- nodes
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- persistentvolumeclaims
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- persistentvolumes
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["policy"]
|
|
||||||
resources:
|
|
||||||
- poddisruptionbudgets
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- pods
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["extensions", "apps"]
|
|
||||||
resources:
|
|
||||||
- replicasets
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- replicationcontrollers
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- resourcequotas
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- secrets
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources:
|
|
||||||
- services
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["apps"]
|
|
||||||
resources:
|
|
||||||
- statefulsets
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["storage.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- storageclasses
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["admissionregistration.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- validatingwebhookconfigurations
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
|
|
||||||
- apiGroups: ["storage.k8s.io"]
|
|
||||||
resources:
|
|
||||||
- volumeattachments
|
|
||||||
verbs: ["list", "watch"]
|
|
||||||
@@ -1,45 +0,0 @@

# first loop through resources in ids_prio[0]
resource "kustomization_resource" "pre_no_ns" {
  for_each = data.kustomization_overlay.data_no_ns.ids_prio[0]

  manifest = (
    contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
    ? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
    : data.kustomization_overlay.data_no_ns.manifests[each.value]
  )
}

# then loop through resources in ids_prio[1]
# and set an explicit depends_on on kustomization_resource.pre
# wait 5 minutes for any deployment or daemonset to become ready
resource "kustomization_resource" "main_no_ns" {
  for_each = data.kustomization_overlay.data_no_ns.ids_prio[1]

  manifest = (
    contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
    ? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
    : data.kustomization_overlay.data_no_ns.manifests[each.value]
  )
  wait = true
  timeouts {
    create = "5m"
    update = "5m"
  }

  depends_on = [kustomization_resource.pre_no_ns]
}

# finally, loop through resources in ids_prio[2]
# and set an explicit depends_on on kustomization_resource.main
resource "kustomization_resource" "post_no_ns" {
  for_each = data.kustomization_overlay.data_no_ns.ids_prio[2]

  manifest = (
    contains(["_/Secret"], regex("(?P<group_kind>.*/.*)/.*/.*", each.value)["group_kind"])
    ? sensitive(data.kustomization_overlay.data_no_ns.manifests[each.value])
    : data.kustomization_overlay.data_no_ns.manifests[each.value]
  )

  depends_on = [kustomization_resource.main_no_ns]
}
@@ -1,16 +0,0 @@
|
|||||||
---
|
|
||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/serviceaccount.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
name: kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
# Source: kube-prometheus-stack/charts/kube-state-metrics/templates/service.yaml
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: kube-state-metrics
|
|
||||||
namespace: vynil-monitor
|
|
||||||
labels:
|
|
||||||
helm.sh/chart: kube-state-metrics-5.16.0
|
|
||||||
app.kubernetes.io/managed-by: Helm
|
|
||||||
app.kubernetes.io/component: metrics
|
|
||||||
app.kubernetes.io/part-of: kube-state-metrics
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
app.kubernetes.io/version: "2.10.1"
|
|
||||||
release: kube-state-metrics
|
|
||||||
annotations:
|
|
||||||
prometheus.io/scrape: 'true'
|
|
||||||
spec:
|
|
||||||
type: "ClusterIP"
|
|
||||||
ports:
|
|
||||||
- name: "http"
|
|
||||||
protocol: TCP
|
|
||||||
port: 8080
|
|
||||||
targetPort: 8080
|
|
||||||
|
|
||||||
selector:
|
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
|
||||||
app.kubernetes.io/instance: kube-state-metrics
|
|
||||||
@@ -1,82 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: loki-dashboard
  description: null
options:
  ingress_class:
    default: traefik
    examples:
    - traefik
    type: string
  issuer:
    default: letsencrypt-prod
    examples:
    - letsencrypt-prod
    type: string
  domain:
    default: your-company
    examples:
    - your-company
    type: string
  images:
    default:
      operator:
        pullPolicy: IfNotPresent
        registry: docker.io
        repository: to-be/defined
        tag: v1.0.0
    examples:
    - operator:
        pullPolicy: IfNotPresent
        registry: docker.io
        repository: to-be/defined
        tag: v1.0.0
    properties:
      operator:
        default:
          pullPolicy: IfNotPresent
          registry: docker.io
          repository: to-be/defined
          tag: v1.0.0
        properties:
          pullPolicy:
            default: IfNotPresent
            enum:
            - Always
            - Never
            - IfNotPresent
            type: string
          registry:
            default: docker.io
            type: string
          repository:
            default: to-be/defined
            type: string
          tag:
            default: v1.0.0
            type: string
        type: object
    type: object
  sub_domain:
    default: to-be-set
    examples:
    - to-be-set
    type: string
  domain_name:
    default: your_company.com
    examples:
    - your_company.com
    type: string
dependencies: []
providers:
  kubernetes: true
  authentik: true
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,124 +0,0 @@
# Source: loki/templates/single-binary/statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: single-binary
    app.kubernetes.io/part-of: memberlist
spec:
  replicas: 1
  podManagementPolicy: Parallel
  updateStrategy:
    rollingUpdate:
      partition: 0
  serviceName: loki-headless
  revisionHistoryLimit: 10

  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Delete
    whenScaled: Delete
  selector:
    matchLabels:
      app.kubernetes.io/name: loki
      app.kubernetes.io/instance: loki
      app.kubernetes.io/component: single-binary
  template:
    metadata:
      annotations:
        checksum/config: f0b5fe7288abac6838f61aacfebeba1a01d5cf3d391971062c58797d2f0ea40f
      labels:
        app.kubernetes.io/name: loki
        app.kubernetes.io/instance: loki
        app.kubernetes.io/component: single-binary
        app.kubernetes.io/part-of: memberlist
    spec:
      serviceAccountName: loki
      automountServiceAccountToken: true
      enableServiceLinks: true

      securityContext:
        fsGroup: 10001
        runAsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      terminationGracePeriodSeconds: 30
      containers:
        - name: loki
          image: docker.io/grafana/loki:2.9.3
          imagePullPolicy: IfNotPresent
          args:
            - -config.file=/etc/loki/config/config.yaml
            - -target=all
          ports:
            - name: http-metrics
              containerPort: 3100
              protocol: TCP
            - name: grpc
              containerPort: 9095
              protocol: TCP
            - name: http-memberlist
              containerPort: 7946
              protocol: TCP
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
          readinessProbe:
            httpGet:
              path: /ready
              port: http-metrics
            initialDelaySeconds: 30
            timeoutSeconds: 1
          volumeMounts:
            - name: tmp
              mountPath: /tmp
            - name: config
              mountPath: /etc/loki/config
            - name: runtime-config
              mountPath: /etc/loki/runtime-config
            - name: storage
              mountPath: /var/loki
          resources:
            {}
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchLabels:
                  app.kubernetes.io/name: loki
                  app.kubernetes.io/instance: loki
                  app.kubernetes.io/component: single-binary
              topologyKey: kubernetes.io/hostname

      volumes:
        - name: tmp
          emptyDir: {}
        - name: config
          configMap:
            name: loki
            items:
              - key: "config.yaml"
                path: "config.yaml"
        - name: runtime-config
          configMap:
            name: loki-runtime
  volumeClaimTemplates:
    - apiVersion: v1
      kind: PersistentVolumeClaim
      metadata:
        name: storage
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: "17Gi"
@@ -1,91 +0,0 @@
resource "kubectl_manifest" "datasource" {
  yaml_body = <<-EOF
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: loki-datasource
      namespace: "${var.namespace}"
      labels: ${jsonencode(merge(local.common-labels, {"grafana_datasource" = "1"}))}
    data:
      loki-datasource.yaml: |-
        apiVersion: 1
        datasources:
        - name: Loki
          type: loki
          access: proxy
          url: "http://loki.${var.namespace}.svc:3100"
          version: 1
          isDefault: false
          jsonData:
            {}
  EOF
}
resource "kubectl_manifest" "config" {
  yaml_body = <<-EOF
    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: loki
      namespace: "${var.namespace}"
      labels: ${jsonencode(local.common-labels)}
    data:
      config.yaml: |
        auth_enabled: false
        common:
          compactor_address: 'loki'
          path_prefix: /var/loki
          replication_factor: 1
          storage:
            filesystem:
              chunks_directory: /var/loki/chunks
              rules_directory: /var/loki/rules
        frontend:
          scheduler_address: ""
        frontend_worker:
          scheduler_address: ""
        index_gateway:
          mode: ring
        limits_config:
          max_cache_freshness_per_query: 10m
          reject_old_samples: true
          reject_old_samples_max_age: 168h
          split_queries_by_interval: 15m
        memberlist:
          join_members:
          - loki-memberlist
        query_range:
          align_queries_with_step: true
        ruler:
          storage:
            type: local
            local:
              directory: /tmp/rules
          rule_path: /tmp/scratch
          alertmanager_url: http://${var.alertmanager}:9093
          ring:
            kvstore:
              store: inmemory
          enable_api: true
        runtime_config:
          file: /etc/loki/runtime-config/runtime-config.yaml
        schema_config:
          configs:
          - from: "2022-01-11"
            index:
              period: 24h
              prefix: loki_index_
            object_store: filesystem
            schema: v12
            store: boltdb-shipper
        server:
          grpc_listen_port: 9095
          http_listen_port: 3100
        storage_config:
          hedging:
            at: 250ms
            max_per_second: 20
            up_to: 3
        tracing:
          enabled: false
  EOF
}
@@ -1,77 +0,0 @@
locals {
  common-labels = {
    "vynil.solidite.fr/owner-name" = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category" = var.category
    "vynil.solidite.fr/owner-component" = var.component
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance" = var.instance
  }
  pvc_spec = merge({
    "accessModes" = [var.storage.volume.accessMode]
    "volumeMode" = var.storage.volume.type
    "resources" = {
      "requests" = {
        "storage" = "${var.storage.volume.size}"
      }
    }
  }, var.storage.volume.class != "" ? {
    "storageClassName" = var.storage.volume.class
  } : {})
  rb-patch = <<-EOF
    - op: replace
      path: /subjects/0/namespace
      value: "${var.namespace}"
  EOF
}

data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace = var.namespace
  resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1]
  images {
    name = "docker.io/grafana/loki"
    new_name = "${var.images.loki.registry}/${var.images.loki.repository}"
    new_tag = "${var.images.loki.tag}"
  }
  patches {
    target {
      kind = "ServiceMonitor"
      name = "loki"
    }
    patch = <<-EOF
      - op: replace
        path: /spec/endpoints/0/relabelings/0/replacement
        value: "${var.namespace}/$1"
    EOF
  }
  patches {
    target {
      kind = "StatefulSet"
      name = "loki"
    }
    patch = <<-EOF
      apiVersion: apps/v1
      kind: StatefulSet
      metadata:
        name: loki
      spec:
        replicas: 1
        template:
          spec:
            containers:
            - name: loki
              imagePullPolicy: ${var.images.loki.pullPolicy}
        volumeClaimTemplates:
        - apiVersion: v1
          kind: PersistentVolumeClaim
          metadata:
            name: storage
            annotations:
              k8up.io/backup: "true"
          spec: ${jsonencode(local.pvc_spec)}
    EOF
  }
}

@@ -1,105 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: loki
  description: null
options:
  alertmanager:
    default: alertmanager-alertmanager
    examples:
    - alertmanager-alertmanager
    type: string
  images:
    default:
      loki:
        pullPolicy: IfNotPresent
        registry: docker.io
        repository: grafana/loki
        tag: 2.9.3
    examples:
    - loki:
        pullPolicy: IfNotPresent
        registry: docker.io
        repository: grafana/loki
        tag: 2.9.3
    properties:
      loki:
        default:
          pullPolicy: IfNotPresent
          registry: docker.io
          repository: grafana/loki
          tag: 2.9.3
        properties:
          pullPolicy:
            default: IfNotPresent
            enum:
            - Always
            - Never
            - IfNotPresent
            type: string
          registry:
            default: docker.io
            type: string
          repository:
            default: grafana/loki
            type: string
          tag:
            default: 2.9.3
            type: string
        type: object
    type: object
  storage:
    default:
      volume:
        accessMode: ReadWriteOnce
        class: ''
        size: 10Gi
        type: Filesystem
    description: Configure this app storage
    examples:
    - volume:
        accessMode: ReadWriteOnce
        class: ''
        size: 10Gi
        type: Filesystem
    properties:
      volume:
        default:
          accessMode: ReadWriteOnce
          class: ''
          size: 10Gi
          type: Filesystem
        properties:
          accessMode:
            default: ReadWriteOnce
            enum:
            - ReadWriteOnce
            - ReadOnlyMany
            - ReadWriteMany
            type: string
          class:
            default: ''
            type: string
          size:
            default: 10Gi
            type: string
          type:
            default: Filesystem
            enum:
            - Filesystem
            - Block
            type: string
        type: object
    type: object
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
@@ -1,65 +0,0 @@
# Source: loki/templates/monitoring/loki-alerts.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
  name: loki-loki-alerts
  namespace: vynil-monitor
spec:
  groups:
  - name: loki_alerts
    rules:
    - alert: LokiRequestErrors
      annotations:
        message: |
          {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
      expr: |
        100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
          /
        sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
          > 10
      for: 15m
      labels:
        severity: critical
    - alert: LokiRequestPanics
      annotations:
        message: |
          {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
      expr: |
        sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
      labels:
        severity: critical
    - alert: LokiRequestLatency
      annotations:
        message: |
          {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
      expr: |
        namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
      for: 15m
      labels:
        severity: critical
    - alert: LokiTooManyCompactorsRunning
      annotations:
        message: |
          {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
      expr: |
        sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
      for: 5m
      labels:
        severity: warning
  - name: loki_canaries_alerts
    rules:
    - alert: LokiCanaryLatency
      annotations:
        message: |
          {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
      expr: |
        histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
      for: 15m
      labels:
        severity: warning
@@ -1,98 +0,0 @@
# Source: loki/templates/monitoring/loki-rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
  name: loki-loki-rules
  namespace: vynil-monitor
spec:
  groups:
  - name: loki_rules
    rules:
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, job))
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, job))
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m]))
        by (job)
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job)
      labels:
        cluster: loki
      record: job:loki_request_duration_seconds_count:sum_rate
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, job, route))
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, job, route))
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m]))
        by (job, route)
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
      labels:
        cluster: loki
      record: job_route:loki_request_duration_seconds_count:sum_rate
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, namespace, job, route))
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, namespace, job, route))
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
        / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job,
        route)
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
      labels:
        cluster: loki
      record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
@@ -1,35 +0,0 @@
# Source: loki/templates/monitoring/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: loki
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: loki
      app.kubernetes.io/instance: loki
    matchExpressions:
      - key: prometheus.io/service-monitor
        operator: NotIn
        values:
          - "false"
  endpoints:
    - port: http-metrics
      path: /metrics
      interval: 15s
      relabelings:
        - sourceLabels: [job]
          action: replace
          replacement: "vynil-monitor/$1"
          targetLabel: job
        - action: replace
          replacement: "loki"
          targetLabel: cluster
      scheme: http
@@ -1,15 +0,0 @@
# Source: loki/templates/runtime-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-runtime
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
data:
  runtime-config.yaml: |
    {}
@@ -1,14 +0,0 @@
---
# Source: loki/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: loki
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
automountServiceAccountToken: true
@@ -1,25 +0,0 @@
# Source: loki/templates/single-binary/service-headless.yaml
apiVersion: v1
kind: Service
metadata:
  name: loki-headless
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
    variant: headless
    prometheus.io/service-monitor: "false"
  annotations:
spec:
  clusterIP: None
  ports:
    - name: http-metrics
      port: 3100
      targetPort: http-metrics
      protocol: TCP
  selector:
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
@@ -1,24 +0,0 @@
# Source: loki/templates/service-memberlist.yaml
apiVersion: v1
kind: Service
metadata:
  name: loki-memberlist
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
spec:
  type: ClusterIP
  clusterIP: None
  ports:
    - name: tcp
      port: 7946
      targetPort: http-memberlist
      protocol: TCP
  selector:
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/part-of: memberlist
@@ -1,28 +0,0 @@
# Source: loki/templates/single-binary/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: vynil-monitor
  labels:
    helm.sh/chart: loki-5.41.8
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/version: "2.9.3"
    app.kubernetes.io/managed-by: Helm
  annotations:
spec:
  type: ClusterIP
  ports:
    - name: http-metrics
      port: 3100
      targetPort: http-metrics
      protocol: TCP
    - name: grpc
      port: 9095
      targetPort: grpc
      protocol: TCP
  selector:
    app.kubernetes.io/name: loki
    app.kubernetes.io/instance: loki
    app.kubernetes.io/component: single-binary
@@ -1,23 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: monitor-control-plan
  description: null
options:
  useless:
    default: true
    examples:
    - true
    type: boolean
dependencies: []
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
  tfaddtype: null
@@ -1,167 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/etcd.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-etcd
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack

    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.1.0"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.1.0
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
  - name: etcd
    rules:
    - alert: etcdMembersDown
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
        summary: etcd cluster members are down.
      expr: |-
        max without (endpoint) (
          sum without (instance) (up{job=~".*etcd.*"} == bool 0)
        or
          count without (To) (
            sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
          )
        )
        > 0
      for: 10m
      labels:
        severity: critical
    - alert: etcdInsufficientMembers
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
        summary: etcd cluster has insufficient number of members.
      expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
      for: 3m
      labels:
        severity: critical
    - alert: etcdNoLeader
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
        summary: etcd cluster has no leader.
      expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
      for: 1m
      labels:
        severity: critical
    - alert: etcdHighNumberOfLeaderChanges
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
        summary: etcd cluster has high number of leader changes.
      expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
      for: 5m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster has high number of failed grpc requests.
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
        /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
        > 1
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedGRPCRequests
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster has high number of failed grpc requests.
      expr: |-
        100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
        /
        sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
        > 5
      for: 5m
      labels:
        severity: critical
    - alert: etcdGRPCRequestsSlow
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
        summary: etcd grpc requests are slow
      expr: |-
        histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
        > 0.15
      for: 10m
      labels:
        severity: critical
    - alert: etcdMemberCommunicationSlow
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster member communication is slow.
      expr: |-
        histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.15
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighNumberOfFailedProposals
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster has high number of proposal failures.
      expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
      for: 15m
      labels:
        severity: warning
    - alert: etcdHighFsyncDurations
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster 99th percentile fsync durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.5
      for: 10m
      labels:
        severity: warning
    - alert: etcdHighFsyncDurations
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster 99th percentile fsync durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 1
      for: 10m
      labels:
        severity: critical
    - alert: etcdHighCommitDurations
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
        summary: etcd cluster 99th percentile commit durations are too high.
      expr: |-
        histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
        > 0.25
      for: 10m
      labels:
        severity: warning
    - alert: etcdDatabaseQuotaLowSpace
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
        summary: etcd cluster database is running full.
      expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
      for: 10m
      labels:
        severity: critical
    - alert: etcdExcessiveDatabaseGrowth
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
        summary: etcd cluster database growing very fast.
      expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
      for: 10m
      labels:
        severity: warning
    - alert: etcdDatabaseHighFragmentationRatio
      annotations:
        description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
        runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
        summary: etcd database size in use is less than 50% of the actual allocated storage.
      expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
      for: 10m
      labels:
        severity: warning
Some files were not shown because too many files have changed in this diff