This commit is contained in:
2024-01-28 10:00:47 +01:00
parent ccb828c881
commit 0961759af7
173 changed files with 449 additions and 8800 deletions

View File

@@ -1,119 +0,0 @@
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter-prometheus-node-exporter
namespace: vynil-monitor
labels:
helm.sh/chart: prometheus-node-exporter-4.26.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: prometheus-node-exporter
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "1.7.0"
jobLabel: node-exporter
release: node-exporter
spec:
selector:
matchLabels:
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
revisionHistoryLimit: 10
updateStrategy:
rollingUpdate:
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
annotations:
cluster-autoscaler.kubernetes.io/safe-to-evict: "true"
labels:
helm.sh/chart: prometheus-node-exporter-4.26.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: prometheus-node-exporter
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "1.7.0"
jobLabel: node-exporter
release: node-exporter
spec:
automountServiceAccountToken: false
securityContext:
fsGroup: 65534
runAsGroup: 65534
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: node-exporter-prometheus-node-exporter
containers:
- name: node-exporter
image: quay.io/prometheus/node-exporter:v1.7.0
imagePullPolicy: IfNotPresent
args:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --path.udev.data=/host/root/run/udev/data
- --web.listen-address=[$(HOST_IP)]:9100
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
securityContext:
readOnlyRootFilesystem: true
env:
- name: HOST_IP
value: 0.0.0.0
ports:
- name: http-metrics
containerPort: 9100
protocol: TCP
livenessProbe:
failureThreshold: 3
httpGet:
httpHeaders:
path: /
port: 9100
scheme: HTTP
initialDelaySeconds: 0
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
readinessProbe:
failureThreshold: 3
httpGet:
httpHeaders:
path: /
port: 9100
scheme: HTTP
initialDelaySeconds: 0
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
volumeMounts:
- name: proc
mountPath: /host/proc
readOnly: true
- name: sys
mountPath: /host/sys
readOnly: true
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
hostNetwork: true
hostPID: true
nodeSelector:
kubernetes.io/os: linux
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /

View File

@@ -1,34 +0,0 @@
locals {
common-labels = {
"vynil.solidite.fr/owner-name" = var.instance
"vynil.solidite.fr/owner-namespace" = var.namespace
"vynil.solidite.fr/owner-category" = var.category
"vynil.solidite.fr/owner-component" = var.component
"app.kubernetes.io/managed-by" = "vynil"
"app.kubernetes.io/instance" = var.instance
}
}
data "kustomization_overlay" "data" {
common_labels = local.common-labels
namespace = var.namespace
resources = [for file in fileset(path.module, "*.yaml"): file if file != "index.yaml"]
images {
name = "quay.io/prometheus/node-exporter"
new_name = "${var.images.node-exporter.registry}/${var.images.node-exporter.repository}"
new_tag = "${var.images.node-exporter.tag}"
}
patches {
target {
kind = "ServiceMonitor"
name = "node-exporter-prometheus-node-exporter"
}
patch = <<-EOF
- op: replace
path: /spec/selector/matchLabels/app.kubernetes.io~1instance
value: "${var.instance}"
EOF
}
}

View File

@@ -1,57 +0,0 @@
---
apiVersion: vinyl.solidite.fr/v1beta1
kind: Component
category: monitor
metadata:
name: node-exporter
description: null
options:
images:
default:
node-exporter:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/node-exporter
tag: v1.7.0
examples:
- node-exporter:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/node-exporter
tag: v1.7.0
properties:
node-exporter:
default:
pullPolicy: IfNotPresent
registry: quay.io
repository: prometheus/node-exporter
tag: v1.7.0
properties:
pullPolicy:
default: IfNotPresent
enum:
- Always
- Never
- IfNotPresent
type: string
registry:
default: quay.io
type: string
repository:
default: prometheus/node-exporter
type: string
tag:
default: v1.7.0
type: string
type: object
type: object
dependencies: []
providers:
kubernetes: true
authentik: null
kubectl: true
postgresql: null
restapi: null
http: null
gitea: null
tfaddtype: null

View File

@@ -1,82 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter-kube-prometh-node-exporter.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "node-exporter"
heritage: "Helm"
spec:
groups:
- name: node-exporter.rules
rules:
- expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m

View File

@@ -1,328 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-exporter.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter-kube-prometh-node-exporter
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "node-exporter"
heritage: "Helm"
spec:
groups:
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |-
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded.
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array.
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- alert: NodeCPUHighUsage
annotations:
description: 'CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
severity: info
- alert: NodeSystemSaturation
annotations:
description: 'System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
This might indicate this instance resources saturation and can cause it becoming unresponsive.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
summary: System saturated, load per core is very high.
expr: |-
node_load1{job="node-exporter"}
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
for: 15m
labels:
severity: warning
- alert: NodeMemoryMajorPagesFaults
annotations:
description: 'Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
Please check that there is enough memory available at this instance.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
summary: Memory major page faults are occurring at very high rate.
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 15m
labels:
severity: warning
- alert: NodeMemoryHighUtilization
annotations:
description: 'Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Host is running out of memory.
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
for: 15m
labels:
severity: warning
- alert: NodeDiskIOSaturation
annotations:
description: 'Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
This symptom might indicate disk saturation.
'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
summary: Disk IO queue is high.
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
- alert: NodeBondingDegraded
annotations:
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
summary: Bonding interface is degraded
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning

View File

@@ -1,29 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node-network.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter-kube-prometh-node-network
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "node-exporter"
heritage: "Helm"
spec:
groups:
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning

View File

@@ -1,56 +0,0 @@
# Source: kube-prometheus-stack/templates/prometheus/rules-1.14/node.rules.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: node-exporter-kube-prometh-node.rules
namespace: vynil-monitor
labels:
app: kube-prometheus-stack
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "56.1.0"
app.kubernetes.io/part-of: kube-prometheus-stack
chart: kube-prometheus-stack-56.1.0
release: "node-exporter"
heritage: "Helm"
spec:
groups:
- name: node.rules
rules:
- expr: |-
topk by (cluster, namespace, pod) (1,
max by (cluster, node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- expr: |-
count by (cluster, node) (
node_cpu_seconds_total{mode="idle",job="node-exporter"}
* on (cluster, namespace, pod) group_left(node)
topk by (cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
)
record: node:node_num_cpu:sum
- expr: |-
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by (cluster)
record: :node_memory_MemAvailable_bytes:sum
- expr: |-
avg by (cluster, node) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
record: node:node_cpu_utilization:ratio_rate5m
- expr: |-
avg by (cluster) (
node:node_cpu_utilization:ratio_rate5m
)
record: cluster:node_cpu:ratio_rate5m

View File

@@ -1,28 +0,0 @@
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: node-exporter-prometheus-node-exporter
namespace: vynil-monitor
labels:
helm.sh/chart: prometheus-node-exporter-4.26.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: prometheus-node-exporter
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "1.7.0"
jobLabel: node-exporter
release: node-exporter
spec:
jobLabel: jobLabel
selector:
matchLabels:
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
attachMetadata:
node: false
endpoints:
- port: http-metrics
scheme: http

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,17 +0,0 @@
---
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-exporter-prometheus-node-exporter
namespace: vynil-monitor
labels:
helm.sh/chart: prometheus-node-exporter-4.26.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: prometheus-node-exporter
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "1.7.0"
jobLabel: node-exporter
release: node-exporter

View File

@@ -1,28 +0,0 @@
# Source: kube-prometheus-stack/charts/prometheus-node-exporter/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
name: node-exporter-prometheus-node-exporter
namespace: vynil-monitor
labels:
helm.sh/chart: prometheus-node-exporter-4.26.0
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/component: metrics
app.kubernetes.io/part-of: prometheus-node-exporter
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter
app.kubernetes.io/version: "1.7.0"
jobLabel: node-exporter
release: node-exporter
annotations:
prometheus.io/scrape: "true"
spec:
type: ClusterIP
ports:
- port: 9100
targetPort: 9100
protocol: TCP
name: http-metrics
selector:
app.kubernetes.io/name: prometheus-node-exporter
app.kubernetes.io/instance: node-exporter