diff --git a/monitor/prometheus/datas.tf b/monitor/prometheus/datas.tf index b75c5e7..8e9f7a1 100644 --- a/monitor/prometheus/datas.tf +++ b/monitor/prometheus/datas.tf @@ -25,45 +25,16 @@ data "kustomization_overlay" "data" { common_labels = local.common-labels namespace = var.namespace resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml" && length(regexall("ClusterRole",file))<1 && length(regexall("Service_prometheus",file))<1] - patches { - target { - kind = "Prometheus" - name = "prometheus-community-kube-prometheus" - } - patch = <<-EOF - apiVersion: monitoring.coreos.com/v1 - kind: Prometheus - metadata: - name: prometheus-community-kube-prometheus - spec: - image: "${var.images.prometheus.registry}/${var.images.prometheus.repository}:${var.images.prometheus.tag}" - version: ${var.images.prometheus.tag} - externalUrl: http://prometheus-community-kube-prometheus.${var.namespace}:9090 - replicas: ${var.replicas} - shards: ${var.shards} - logLevel: ${var.logLevel} - listenLocal: ${var.listenLocal} - enableAdminAPI: ${var.enableAdminAPI} - retention: "${var.retention}" - alerting: - alertmanagers: - - namespace: ${var.namespace} - name: prometheus-community-kube-alertmanager - port: http-web - pathPrefix: "/" - apiVersion: v2 - EOF - } patches { target { kind = "ConfigMap" - name = "prometheus-community-kube-grafana-datasource" + name = "prometheus-kube-prometheus-grafana-datasource" } patch = <<-EOF apiVersion: v1 kind: ConfigMap metadata: - name: prometheus-community-kube-grafana-datasource + name: prometheus-kube-prometheus-grafana-datasource data: datasource.yaml: |- apiVersion: 1 @@ -82,7 +53,7 @@ data "kustomization_overlay" "data" { patches { target { kind = "ServiceMonitor" - name = "prometheus-community-kube-prometheus" + name = "prometheus-kube-prometheus-prometheus" } patch = <<-EOF - op: replace @@ -94,13 +65,13 @@ data "kustomization_overlay" "data" { patches { target { kind = "PrometheusRule" - name = "prometheus-community-kube-prometheus" + name = "prometheus-kube-prometheus-prometheus" } patch = <<-EOF apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - name: prometheus-community-kube-prometheus + name: prometheus-kube-prometheus-prometheus spec: groups: - name: prometheus @@ -109,24 +80,24 @@ data "kustomization_overlay" "data" { expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) == 0 + max_over_time(prometheus_config_last_reload_successful{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) == 0 - alert: PrometheusSDRefreshFailure - expr: increase(prometheus_sd_refresh_failures_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[10m]) > 0 + expr: increase(prometheus_sd_refresh_failures_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[10m]) > 0 - alert: PrometheusNotificationQueueRunningFull expr: |- # Without min_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m], 60 * 30) + predict_linear(prometheus_notifications_queue_length{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m], 60 * 30) > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) ) - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers expr: |- ( - rate(prometheus_notifications_errors_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + rate(prometheus_notifications_errors_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) / - rate(prometheus_notifications_sent_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + rate(prometheus_notifications_sent_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) ) * 100 > 1 @@ -134,35 +105,35 @@ data "kustomization_overlay" "data" { expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) < 1 + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) < 1 - alert: PrometheusTSDBReloadsFailing - expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[3h]) > 0 + expr: increase(prometheus_tsdb_reloads_failures_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[3h]) > 0 - alert: PrometheusTSDBCompactionsFailing - expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[3h]) > 0 + expr: increase(prometheus_tsdb_compactions_failed_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[3h]) > 0 - alert: PrometheusNotIngestingSamples expr: |- ( - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) <= 0 + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) <= 0 and ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}) > 0 + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}) > 0 or - sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}) > 0 + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}) > 0 ) ) - alert: PrometheusDuplicateTimestamps - expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusOutOfOrderTimestamps - expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusRemoteStorageFailures expr: |- ( - (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m])) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m])) / ( - (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m])) + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m])) + - (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m])) ) ) * 100 @@ -172,9 +143,9 @@ data "kustomization_overlay" "data" { # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) ) > 120 - alert: PrometheusRemoteWriteDesiredShards @@ -182,32 +153,32 @@ data "kustomization_overlay" "data" { # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) ) - alert: PrometheusRuleFailures - expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_rule_evaluation_failures_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusMissingRuleEvaluations - expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_rule_group_iterations_missed_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusTargetLimitHit - expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusLabelLimitHit - expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusScrapeBodySizeLimitHit - expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusScrapeSampleLimitHit - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0 + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0 - alert: PrometheusTargetSyncFailure - expr: increase(prometheus_target_sync_failed_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[30m]) > 0 + expr: increase(prometheus_target_sync_failed_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[30m]) > 0 - alert: PrometheusHighQueryLoad - expr: avg_over_time(prometheus_engine_queries{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-community-kube-prometheus",namespace="${var.namespace}"}[5m]) > 0.8 + expr: avg_over_time(prometheus_engine_queries{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}"}[5m]) > 0.8 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager expr: |- min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}",alertmanager!~``}[5m]) + rate(prometheus_notifications_errors_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}",alertmanager!~``}[5m]) / - rate(prometheus_notifications_sent_total{job="prometheus-community-kube-prometheus",namespace="${var.namespace}",alertmanager!~``}[5m]) + rate(prometheus_notifications_sent_total{job="prometheus-kube-prometheus-prometheus",namespace="${var.namespace}",alertmanager!~``}[5m]) ) * 100 > 3 @@ -233,7 +204,7 @@ data "kustomization_overlay" "data_no_ns" { patches { target { kind = "ClusterRoleBinding" - name = "prometheus-community-kube-prometheus" + name = "prometheus-kube-prometheus-prometheus" } patch = local.rb-patch } diff --git a/monitor/prometheus/index.yaml b/monitor/prometheus/index.yaml index 64c6d72..7c318df 100644 --- a/monitor/prometheus/index.yaml +++ b/monitor/prometheus/index.yaml @@ -6,30 +6,10 @@ metadata: name: prometheus description: null options: - domain: - default: your-company + issuer: + default: letsencrypt-prod examples: - - your-company - type: string - replicas: - default: 1 - examples: - - 1 - type: integer - shards: - default: 1 - examples: - - 1 - type: integer - domain-name: - default: your_company.com - examples: - - your_company.com - type: string - app-group: - default: monitor - examples: - - monitor + - letsencrypt-prod type: string images: default: @@ -70,41 +50,61 @@ options: type: string type: object type: object - issuer: - default: letsencrypt-prod + domain: + default: your-company examples: - - letsencrypt-prod + - your-company type: string logLevel: default: info examples: - info type: string - listenLocal: - default: false - examples: - - false - type: boolean - ingress-class: - default: traefik - examples: - - traefik - type: string retention: default: 10d examples: - 10d type: string + listenLocal: + default: false + examples: + - false + type: boolean enableAdminAPI: default: false examples: - false type: boolean + domain-name: + default: your_company.com + examples: + - your_company.com + type: string + ingress-class: + default: traefik + examples: + - traefik + type: string + replicas: + default: 1 + examples: + - 1 + type: integer + shards: + default: 1 + examples: + - 1 + type: integer sub-domain: default: prometheus examples: - prometheus type: string + app-group: + default: monitor + examples: + - monitor + type: string dependencies: - dist: null category: share