Files
domain-incoming/share/authentik/authentik_monitoring.tf
2024-05-23 13:21:16 +02:00

172 lines
9.1 KiB
HCL

resource "kubectl_manifest" "PrometheusRule_authentik" {
count = var.conditions.have_prometheusrules?1:0
yaml_body = <<-EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: authentik
namespace: ${var.namespace}
labels: ${jsonencode(local.metrics_all_labels)}
spec:
groups:
- name: authentik Aggregate request counters
rules:
- record: job:django_http_requests_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_before_middlewares_total[30s])) by (job)
- record: job:django_http_requests_unknown_latency_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_total[30s])) by (job)
- record: job:django_http_ajax_requests_total:sum_rate30s
expr: sum(rate(django_http_ajax_requests_total[30s])) by (job)
- record: job:django_http_responses_before_middlewares_total:sum_rate30s
expr: sum(rate(django_http_responses_before_middlewares_total[30s])) by (job)
- record: job:django_http_requests_unknown_latency_including_middlewares_total:sum_rate30s
expr: sum(rate(django_http_requests_unknown_latency_including_middlewares_total[30s])) by (job)
- record: job:django_http_requests_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_requests_body_total_bytes[30s])) by (job)
- record: job:django_http_responses_streaming_total:sum_rate30s
expr: sum(rate(django_http_responses_streaming_total[30s])) by (job)
- record: job:django_http_responses_body_total_bytes:sum_rate30s
expr: sum(rate(django_http_responses_body_total_bytes[30s])) by (job)
- record: job:django_http_requests_total:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job)
- record: job:django_http_requests_total_by_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_method[30s])) by (job,method)
- record: job:django_http_requests_total_by_transport:sum_rate30s
expr: sum(rate(django_http_requests_total_by_transport[30s])) by (job,transport)
- record: job:django_http_requests_total_by_view:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view)
- record: job:django_http_requests_total_by_view_transport_method:sum_rate30s
expr: sum(rate(django_http_requests_total_by_view_transport_method[30s])) by (job,view,transport,method)
- record: job:django_http_responses_total_by_templatename:sum_rate30s
expr: sum(rate(django_http_responses_total_by_templatename[30s])) by (job,templatename)
- record: job:django_http_responses_total_by_status:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status[30s])) by (job,status)
- record: job:django_http_responses_total_by_status_name_method:sum_rate30s
expr: sum(rate(django_http_responses_total_by_status_name_method[30s])) by (job,status,name,method)
- record: job:django_http_responses_total_by_charset:sum_rate30s
expr: sum(rate(django_http_responses_total_by_charset[30s])) by (job,charset)
- record: job:django_http_exceptions_total_by_type:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_type[30s])) by (job,type)
- record: job:django_http_exceptions_total_by_view:sum_rate30s
expr: sum(rate(django_http_exceptions_total_by_view[30s])) by (job,view)
- name: authentik Aggregate latency histograms
rules:
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: '50'
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: '95'
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: '99'
- record: job:django_http_requests_latency_including_middlewares_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_including_middlewares_seconds_bucket[30s])) by (job, le))
labels:
quantile: '99.9'
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.50, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: '50'
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.95, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: '95'
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.99, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: '99'
- record: job:django_http_requests_latency_seconds:quantile_rate30s
expr: histogram_quantile(0.999, sum(rate(django_http_requests_latency_seconds_bucket[30s])) by (job, le))
labels:
quantile: '99.9'
- name: authentik Aggregate model operations
rules:
- record: job:django_model_inserts_total:sum_rate1m
expr: sum(rate(django_model_inserts_total[1m])) by (job, model)
- record: job:django_model_updates_total:sum_rate1m
expr: sum(rate(django_model_updates_total[1m])) by (job, model)
- record: job:django_model_deletes_total:sum_rate1m
expr: sum(rate(django_model_deletes_total[1m])) by (job, model)
- name: authentik Aggregate database operations
rules:
- record: job:django_db_new_connections_total:sum_rate30s
expr: sum(rate(django_db_new_connections_total[30s])) by (alias, vendor)
- record: job:django_db_new_connection_errors_total:sum_rate30s
expr: sum(rate(django_db_new_connection_errors_total[30s])) by (alias, vendor)
- record: job:django_db_execute_total:sum_rate30s
expr: sum(rate(django_db_execute_total[30s])) by (alias, vendor)
- record: job:django_db_execute_many_total:sum_rate30s
expr: sum(rate(django_db_execute_many_total[30s])) by (alias, vendor)
- record: job:django_db_errors_total:sum_rate30s
expr: sum(rate(django_db_errors_total[30s])) by (alias, vendor, type)
- name: authentik Aggregate migrations
rules:
- record: job:django_migrations_applied_total:max
expr: max(django_migrations_applied_total) by (job, connection)
- record: job:django_migrations_unapplied_total:max
expr: max(django_migrations_unapplied_total) by (job, connection)
- name: authentik Alerts
rules:
- alert: NoWorkersConnected
labels:
severity: critical
expr: max without (pid) (authentik_admin_workers) < 1
for: 10m
annotations:
summary: No workers connected
message: authentik instance {{ $labels.instance }}'s worker are either not running or not connected.
- alert: PendingMigrations
labels:
severity: critical
expr: max without (pid) (django_migrations_unapplied_total) > 0
for: 10m
annotations:
summary: Pending database migrations
message: authentik instance {{ $labels.instance }} has pending database migrations
- alert: FailedSystemTasks
labels:
severity: critical
expr: sum(increase(authentik_system_tasks{status="error"}[2h])) > 0
for: 2h
annotations:
summary: Failed system tasks
message: System task {{ $labels.task_name }} has failed
- alert: DisconnectedOutposts
labels:
severity: critical
expr: sum by (outpost) (max without (pid) (authentik_outposts_connected{uid!~"specific.*"})) < 1
for: 30m
annotations:
summary: Disconnected outpost
message: Outpost {{ $labels.outpost }} has at least 1 disconnected instance
EOF
}
resource "kubectl_manifest" "ServiceMonitor_authentik-server" {
count = var.conditions.have_servicemonitors?1:0
yaml_body = <<-EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: authentik-server
namespace: ${var.namespace}
labels: ${jsonencode(local.metrics_all_labels)}
spec:
endpoints:
- port: metrics
interval: 30s
scrapeTimeout: 3s
path: /metrics
namespaceSelector:
matchNames:
- ${var.namespace}
selector:
matchLabels: ${jsonencode(local.metrics_labels)}
EOF
}