fix
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/daemonset.yaml
# DaemonSet that runs one node-exporter pod per Linux node and exposes host
# metrics on port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-community-prometheus-node-exporter
  namespace: vynil-monitor
  labels:
    helm.sh/chart: prometheus-node-exporter-4.25.0
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "1.7.0"
    jobLabel: node-exporter
    release: prometheus-community
spec:
  # Must stay a subset of the pod template labels below.
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-node-exporter
      app.kubernetes.io/instance: prometheus-community
  revisionHistoryLimit: 10
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      annotations:
        cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
      labels:
        helm.sh/chart: prometheus-node-exporter-4.25.0
        app.kubernetes.io/managed-by: Helm
        app.kubernetes.io/component: metrics
        app.kubernetes.io/part-of: prometheus-node-exporter
        app.kubernetes.io/name: prometheus-node-exporter
        app.kubernetes.io/instance: prometheus-community
        app.kubernetes.io/version: "1.7.0"
        jobLabel: node-exporter
        release: prometheus-community
    spec:
      automountServiceAccountToken: false
      # Pod-wide identity: unprivileged uid/gid 65534, root is refused.
      securityContext:
        fsGroup: 65534
        runAsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: prometheus-community-prometheus-node-exporter
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.7.0
          imagePullPolicy: IfNotPresent
          args:
            # The --path.* flags point at the hostPath mounts declared in
            # volumeMounts below.
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            - --path.udev.data=/host/root/run/udev/data
            # $(HOST_IP) is expanded from the env entry below.
            - --web.listen-address=[$(HOST_IP)]:9100
            - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
            - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
          securityContext:
            readOnlyRootFilesystem: true
          env:
            - name: HOST_IP
              # Quoted so the value is unambiguously a string.
              value: "0.0.0.0"
          ports:
            - name: http-metrics
              containerPort: 9100
              protocol: TCP
          livenessProbe:
            failureThreshold: 3
            httpGet:
              # Explicit empty list instead of a bare key (which parses as null).
              httpHeaders: []
              path: /
              port: 9100
              scheme: HTTP
            initialDelaySeconds: 0
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          readinessProbe:
            failureThreshold: 3
            httpGet:
              # Explicit empty list instead of a bare key (which parses as null).
              httpHeaders: []
              path: /
              port: 9100
              scheme: HTTP
            initialDelaySeconds: 0
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
      # The pod shares the node's network and PID namespaces.
      hostNetwork: true
      hostPID: true
      nodeSelector:
        kubernetes.io/os: linux
      # No key + operator Exists: tolerates every NoSchedule taint.
      tolerations:
        - effect: NoSchedule
          operator: Exists
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
|
||||
21
monitor/node-exporter/datas.tf
Normal file
21
monitor/node-exporter/datas.tf
Normal file
@@ -0,0 +1,21 @@
|
||||
# Labels shared by every rendered resource; the overlay data source in this
# file consumes them through `common_labels`.
locals {
  common-labels = {
    # Ownership markers identifying the vynil install that created the object.
    "vynil.solidite.fr/owner-name"      = var.instance
    "vynil.solidite.fr/owner-namespace" = var.namespace
    "vynil.solidite.fr/owner-category"  = var.category
    "vynil.solidite.fr/owner-component" = var.component

    # Standard Kubernetes recommended labels.
    "app.kubernetes.io/managed-by" = "vynil"
    "app.kubernetes.io/instance"   = var.instance
  }
}
|
||||
|
||||
# Kustomize overlay over the YAML manifests found in this module directory:
# applies the shared labels, forces the target namespace and rewrites the
# node-exporter image reference.
data "kustomization_overlay" "data" {
  common_labels = local.common-labels
  namespace     = var.namespace

  # Every *.yaml file next to this module is a resource, except index.yaml
  # (in this component it holds a `kind: Component` descriptor — presumably
  # not meant to be applied to the cluster; confirm).
  resources = [for file in fileset(path.module, "*.yaml") : file if file != "index.yaml"]

  # Replace the upstream image with the registry/repository/tag from var.images.
  images {
    name     = "quay.io/prometheus/node-exporter"
    new_name = "${var.images.node-exporter.registry}/${var.images.node-exporter.repository}"
    # Bare reference: an interpolation-only "${...}" wrapper is redundant.
    new_tag = var.images.node-exporter.tag
  }
}
|
||||
57
monitor/node-exporter/index.yaml
Normal file
57
monitor/node-exporter/index.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
---
# Component descriptor for the node-exporter package (category: monitor).
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
  name: node-exporter
  description: null
options:
  # Image coordinates for the node-exporter container.
  # NOTE(review): presumably surfaced to Terraform as var.images — confirm.
  images:
    default:
      node-exporter:
        pullPolicy: IfNotPresent
        registry: quay.io
        repository: prometheus/node-exporter
        tag: v1.7.0
    examples:
      - node-exporter:
          pullPolicy: IfNotPresent
          registry: quay.io
          repository: prometheus/node-exporter
          tag: v1.7.0
    # Schema of the option: one object per image, all fields are strings.
    properties:
      node-exporter:
        default:
          pullPolicy: IfNotPresent
          registry: quay.io
          repository: prometheus/node-exporter
          tag: v1.7.0
        properties:
          pullPolicy:
            default: IfNotPresent
            enum:
              - Always
              - Never
              - IfNotPresent
            type: string
          registry:
            default: quay.io
            type: string
          repository:
            default: prometheus/node-exporter
            type: string
          tag:
            default: v1.7.0
            type: string
        type: object
    type: object
dependencies: []
# Only the kubernetes and kubectl providers are enabled for this component.
providers:
  kubernetes: true
  authentik: null
  kubectl: true
  postgresql: null
  restapi: null
  http: null
  gitea: null
tfaddtype: null
|
||||
@@ -0,0 +1,82 @@
|
||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml
# Recording rules that pre-aggregate node-exporter series per instance
# (and per device for the disk rules).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-node-exporter.rules
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: node-exporter.rules
      rules:
        # CPU count per instance (one idle-mode series per CPU).
        - expr: |-
            count without (cpu, mode) (
              node_cpu_seconds_total{job="node-exporter",mode="idle"}
            )
          record: instance:node_num_cpu:sum
        # CPU utilisation: 1 minus the idle/iowait/steal share, averaged over CPUs.
        - expr: |-
            1 - avg without (cpu) (
              sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
            )
          record: instance:node_cpu_utilisation:rate5m
        # 1-minute load divided by the CPU count recorded above.
        - expr: |-
            (
              node_load1{job="node-exporter"}
            /
              instance:node_num_cpu:sum{job="node-exporter"}
            )
          record: instance:node_load1_per_cpu:ratio
        # Memory utilisation; falls back to Buffers+Cached+MemFree+Slab when
        # MemAvailable is not exported.
        - expr: |-
            1 - (
              (
                node_memory_MemAvailable_bytes{job="node-exporter"}
                or
                (
                  node_memory_Buffers_bytes{job="node-exporter"}
                  +
                  node_memory_Cached_bytes{job="node-exporter"}
                  +
                  node_memory_MemFree_bytes{job="node-exporter"}
                  +
                  node_memory_Slab_bytes{job="node-exporter"}
                )
              )
            /
              node_memory_MemTotal_bytes{job="node-exporter"}
            )
          record: instance:node_memory_utilisation:ratio
        - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
          record: instance:node_vmstat_pgmajfault:rate5m
        # Disk rules only consider devices matching the block-device name regex.
        - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
          record: instance_device:node_disk_io_time_seconds:rate5m
        - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
          record: instance_device:node_disk_io_time_weighted_seconds:rate5m
        # Network rules sum over every device except the loopback interface.
        - expr: |-
            sum without (device) (
              rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_receive_bytes_excluding_lo:rate5m
        - expr: |-
            sum without (device) (
              rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_transmit_bytes_excluding_lo:rate5m
        - expr: |-
            sum without (device) (
              rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_receive_drop_excluding_lo:rate5m
        - expr: |-
            sum without (device) (
              rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
            )
          record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
@@ -0,0 +1,328 @@
|
||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml
# Alerting rules evaluated on node-exporter metrics. Several alerts exist
# twice under the same name with different thresholds and severities.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-node-exporter
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: node-exporter
      rules:
        # --- Filesystem space -------------------------------------------------
        - alert: NodeFilesystemSpaceFillingUp
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
            summary: Filesystem is predicted to run out of space within the next 24 hours.
          expr: |-
            (
              node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
            and
              predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: warning
        - alert: NodeFilesystemSpaceFillingUp
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
            summary: Filesystem is predicted to run out of space within the next 4 hours.
          expr: |-
            (
              node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
            and
              predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: critical
        - alert: NodeFilesystemAlmostOutOfSpace
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
            summary: Filesystem has less than 5% space left.
          expr: |-
            (
              node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 30m
          labels:
            severity: warning
        - alert: NodeFilesystemAlmostOutOfSpace
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
            summary: Filesystem has less than 3% space left.
          expr: |-
            (
              node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 30m
          labels:
            severity: critical
        # --- Filesystem inodes ------------------------------------------------
        - alert: NodeFilesystemFilesFillingUp
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
            summary: Filesystem is predicted to run out of inodes within the next 24 hours.
          expr: |-
            (
              node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
            and
              predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: warning
        - alert: NodeFilesystemFilesFillingUp
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
            summary: Filesystem is predicted to run out of inodes within the next 4 hours.
          expr: |-
            (
              node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
            and
              predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: critical
        - alert: NodeFilesystemAlmostOutOfFiles
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
            summary: Filesystem has less than 5% inodes left.
          expr: |-
            (
              node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: warning
        - alert: NodeFilesystemAlmostOutOfFiles
          annotations:
            description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
            summary: Filesystem has less than 3% inodes left.
          expr: |-
            (
              node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
            and
              node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
            )
          for: 1h
          labels:
            severity: critical
        # --- Network ----------------------------------------------------------
        - alert: NodeNetworkReceiveErrs
          annotations:
            description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
            summary: Network interface is reporting many receive errors.
          expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
          for: 1h
          labels:
            severity: warning
        - alert: NodeNetworkTransmitErrs
          annotations:
            description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
            summary: Network interface is reporting many transmit errors.
          expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
          for: 1h
          labels:
            severity: warning
        - alert: NodeHighNumberConntrackEntriesUsed
          annotations:
            description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
            summary: Number of conntrack are getting close to the limit.
          expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
          labels:
            severity: warning
        # --- Exporter health and clock ----------------------------------------
        - alert: NodeTextFileCollectorScrapeError
          annotations:
            description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
            summary: Node Exporter text file collector failed to scrape.
          expr: node_textfile_scrape_error{job="node-exporter"} == 1
          labels:
            severity: warning
        - alert: NodeClockSkewDetected
          annotations:
            description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
            summary: Clock skew detected.
          expr: |-
            (
              node_timex_offset_seconds{job="node-exporter"} > 0.05
            and
              deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
            )
            or
            (
              node_timex_offset_seconds{job="node-exporter"} < -0.05
            and
              deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
            )
          for: 10m
          labels:
            severity: warning
        - alert: NodeClockNotSynchronising
          annotations:
            description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
            summary: Clock not synchronising.
          expr: |-
            min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
            and
            node_timex_maxerror_seconds{job="node-exporter"} >= 16
          for: 10m
          labels:
            severity: warning
        # --- RAID -------------------------------------------------------------
        - alert: NodeRAIDDegraded
          annotations:
            description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
            summary: RAID Array is degraded.
          expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
          for: 15m
          labels:
            severity: critical
        - alert: NodeRAIDDiskFailure
          annotations:
            description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
            summary: Failed device in RAID array.
          expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
          labels:
            severity: warning
        # --- Resource saturation ----------------------------------------------
        - alert: NodeFileDescriptorLimit
          annotations:
            description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
            summary: Kernel is predicted to exhaust file descriptors limit soon.
          expr: |-
            (
              node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
            )
          for: 15m
          labels:
            severity: warning
        - alert: NodeFileDescriptorLimit
          annotations:
            description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
            summary: Kernel is predicted to exhaust file descriptors limit soon.
          expr: |-
            (
              node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
            )
          for: 15m
          labels:
            severity: critical
        - alert: NodeCPUHighUsage
          annotations:
            description: 'CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.

              '
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
            summary: High CPU usage.
          expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
          for: 15m
          labels:
            severity: info
        - alert: NodeSystemSaturation
          annotations:
            description: 'System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.

              This might indicate this instance resources saturation and can cause it becoming unresponsive.

              '
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
            summary: System saturated, load per core is very high.
          expr: |-
            node_load1{job="node-exporter"}
            / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
          for: 15m
          labels:
            severity: warning
        - alert: NodeMemoryMajorPagesFaults
          annotations:
            description: 'Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.

              Please check that there is enough memory available at this instance.

              '
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
            summary: Memory major page faults are occurring at very high rate.
          expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
          for: 15m
          labels:
            severity: warning
        - alert: NodeMemoryHighUtilization
          annotations:
            description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.

              '
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
            summary: Host is running out of memory.
          expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
          for: 15m
          labels:
            severity: warning
        - alert: NodeDiskIOSaturation
          annotations:
            description: 'Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.

              This symptom might indicate disk saturation.

              '
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
            summary: Disk IO queue is high.
          expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
          for: 30m
          labels:
            severity: warning
        # --- Services and bonding ---------------------------------------------
        - alert: NodeSystemdServiceFailed
          annotations:
            description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
            summary: Systemd service has entered failed state.
          expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
          for: 5m
          labels:
            severity: warning
        - alert: NodeBondingDegraded
          annotations:
            description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
            summary: Bonding interface is degraded
          expr: (node_bonding_slaves - node_bonding_active) != 0
          for: 5m
          labels:
            severity: warning
|
||||
@@ -0,0 +1,29 @@
|
||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml
# Single alert on network interfaces whose up status keeps changing.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-node-network
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: node-network
      rules:
        - alert: NodeNetworkInterfaceFlapping
          annotations:
            description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
            summary: Network interface is often changing its status
          # Devices named veth* are excluded; fires on more than 2 changes in 2m.
          expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
          for: 2m
          labels:
            severity: warning
|
||||
@@ -0,0 +1,56 @@
|
||||
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
# Recording rules aggregating node-exporter series per node and per cluster.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-community-kube-node.rules
  namespace: vynil-monitor
  labels:
    app: kube-prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "56.0.2"
    app.kubernetes.io/part-of: kube-prometheus-stack
    chart: kube-prometheus-stack-56.0.2
    release: "prometheus-community"
    heritage: "Helm"
spec:
  groups:
    - name: node.rules
      rules:
        # One kube_pod_info series per (cluster, namespace, pod); the rules
        # below join on it to attach the `node` label.
        - expr: |-
            topk by (cluster, namespace, pod) (1,
              max by (cluster, node, namespace, pod) (
                label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
            ))
          record: 'node_namespace_pod:kube_pod_info:'
        - expr: |-
            count by (cluster, node) (
              node_cpu_seconds_total{mode="idle",job="node-exporter"}
              * on (cluster, namespace, pod) group_left(node)
              topk by (cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
            )
          record: node:node_num_cpu:sum
        # Falls back to Buffers+Cached+MemFree+Slab when MemAvailable is absent.
        - expr: |-
            sum(
              node_memory_MemAvailable_bytes{job="node-exporter"} or
              (
                node_memory_Buffers_bytes{job="node-exporter"} +
                node_memory_Cached_bytes{job="node-exporter"} +
                node_memory_MemFree_bytes{job="node-exporter"} +
                node_memory_Slab_bytes{job="node-exporter"}
              )
            ) by (cluster)
          # Quoted because the record name starts with a colon.
          record: ':node_memory_MemAvailable_bytes:sum'
        - expr: |-
            avg by (cluster, node) (
              sum without (mode) (
                rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
              )
            )
          record: node:node_cpu_utilization:ratio_rate5m
        - expr: |-
            avg by (cluster) (
              node:node_cpu_utilization:ratio_rate5m
            )
          record: cluster:node_cpu:ratio_rate5m
|
||||
@@ -0,0 +1,28 @@
|
||||
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/servicemonitor.yaml
# Scrape configuration selecting the node-exporter Service by its labels.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus-community-prometheus-node-exporter
  namespace: vynil-monitor
  labels:
    helm.sh/chart: prometheus-node-exporter-4.25.0
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "1.7.0"
    jobLabel: node-exporter
    release: prometheus-community
spec:
  # Name of the Service label whose value becomes the Prometheus `job`.
  jobLabel: jobLabel
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus-node-exporter
      app.kubernetes.io/instance: prometheus-community
  attachMetadata:
    node: false
  endpoints:
    # Refers to a Service port by name.
    - port: http-metrics
      scheme: http
|
||||
@@ -0,0 +1,17 @@
|
||||
---
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/serviceaccount.yaml
# Service account carrying the standard node-exporter chart labels.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-community-prometheus-node-exporter
  namespace: vynil-monitor
  labels:
    helm.sh/chart: prometheus-node-exporter-4.25.0
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "1.7.0"
    jobLabel: node-exporter
    release: prometheus-community
|
||||
@@ -0,0 +1,28 @@
|
||||
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/service.yaml
# ClusterIP Service exposing port 9100 of the node-exporter pods.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-community-prometheus-node-exporter
  namespace: vynil-monitor
  labels:
    helm.sh/chart: prometheus-node-exporter-4.25.0
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/component: metrics
    app.kubernetes.io/part-of: prometheus-node-exporter
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: prometheus-community
    app.kubernetes.io/version: "1.7.0"
    jobLabel: node-exporter
    release: prometheus-community
  annotations:
    prometheus.io/scrape: "true"
spec:
  type: ClusterIP
  ports:
    - port: 9100
      targetPort: 9100
      protocol: TCP
      name: http-metrics
  # Routes to pods carrying both of these labels.
  selector:
    app.kubernetes.io/name: prometheus-node-exporter
    app.kubernetes.io/instance: prometheus-community
|
||||
Reference in New Issue
Block a user