node-exporter
Overview
The Node Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by the Node Exporter. The mixin creates recording and alerting rules for Prometheus and suitable dashboard descriptions for Grafana.
Jsonnet source code is available at github.com/prometheus/node_exporter
Alerts
The complete list of pregenerated alerts is available here.
node-exporter
NodeFilesystemSpaceFillingUp
# Warning tier: under 40% of the filesystem is available, a linear
# extrapolation of the last 6h of available bytes reaches zero within 24h,
# and the filesystem is not read-only. Must hold for 1h before firing.
alert: NodeFilesystemSpaceFillingUp
expr: |
  (
  node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 40
  and
  predict_linear(node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem is predicted to run out of space within the next 24 hours.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left
    and is filling up.
NodeFilesystemSpaceFillingUp
# Critical tier: under 20% of the filesystem is available, a linear
# extrapolation of the last 6h of available bytes reaches zero within 4h,
# and the filesystem is not read-only. Must hold for 1h before firing.
alert: NodeFilesystemSpaceFillingUp
expr: |
  (
  node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 20
  and
  predict_linear(node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem is predicted to run out of space within the next 4 hours.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left
    and is filling up fast.
NodeFilesystemAlmostOutOfSpace
# Warning tier: available space is below 5% of the filesystem size on a
# filesystem that is not read-only, sustained for 30m.
alert: NodeFilesystemAlmostOutOfSpace
expr: |
  (
  node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 5
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 30m
labels:
  severity: warning
annotations:
  summary: Filesystem has less than 5% space left.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left.
NodeFilesystemAlmostOutOfSpace
# Critical tier: available space is below 3% of the filesystem size on a
# filesystem that is not read-only, sustained for 30m.
alert: NodeFilesystemAlmostOutOfSpace
expr: |
  (
  node_filesystem_avail_bytes{job="node",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node",fstype!="",mountpoint!=""} * 100 < 3
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 30m
labels:
  severity: critical
annotations:
  summary: Filesystem has less than 3% space left.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left.
NodeFilesystemFilesFillingUp
# Warning tier: under 40% of inodes are free, a linear extrapolation of the
# last 6h of free inodes reaches zero within 24h, and the filesystem is not
# read-only. Must hold for 1h before firing.
alert: NodeFilesystemFilesFillingUp
expr: |
  (
  node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 40
  and
  predict_linear(node_filesystem_files_free{job="node",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left
    and is filling up.
NodeFilesystemFilesFillingUp
# Critical tier: under 20% of inodes are free, a linear extrapolation of the
# last 6h of free inodes reaches zero within 4h, and the filesystem is not
# read-only. Must hold for 1h before firing.
alert: NodeFilesystemFilesFillingUp
expr: |
  (
  node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 20
  and
  predict_linear(node_filesystem_files_free{job="node",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left
    and is filling up fast.
NodeFilesystemAlmostOutOfFiles
# Warning tier: free inodes are below 5% of the total on a filesystem that
# is not read-only, sustained for 1h.
alert: NodeFilesystemAlmostOutOfFiles
expr: |
  (
  node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 5
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem has less than 5% inodes left.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left.
NodeFilesystemAlmostOutOfFiles
# Critical tier: free inodes are below 3% of the total on a filesystem that
# is not read-only, sustained for 1h.
alert: NodeFilesystemAlmostOutOfFiles
expr: |
  (
  node_filesystem_files_free{job="node",fstype!="",mountpoint!=""} / node_filesystem_files{job="node",fstype!="",mountpoint!=""} * 100 < 3
  and
  node_filesystem_readonly{job="node",fstype!="",mountpoint!=""} == 0
  )
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem has less than 3% inodes left.
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left.
NodeNetworkReceiveErrs
# Fires when, per interface, the 2m rate of receive errors divided by the 2m
# rate of received packets stays above 0.01 for 1h.
# The alert value is therefore an error *ratio*, not an error count, so the
# description renders it with humanizePercentage; formatting it with "%.0f"
# as a count would print "0" for any ratio below 0.5.
alert: NodeNetworkReceiveErrs
expr: |
  rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning
annotations:
  summary: Network interface is reporting many receive errors.
  description: >-
    {{ $labels.instance }} interface {{ $labels.device }} has a receive
    error ratio of {{ $value | humanizePercentage }} over the last two minutes.
NodeNetworkTransmitErrs
# Fires when, per interface, the 2m rate of transmit errors divided by the 2m
# rate of transmitted packets stays above 0.01 for 1h.
# The alert value is therefore an error *ratio*, not an error count, so the
# description renders it with humanizePercentage; formatting it with "%.0f"
# as a count would print "0" for any ratio below 0.5.
alert: NodeNetworkTransmitErrs
expr: |
  rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning
annotations:
  summary: Network interface is reporting many transmit errors.
  description: >-
    {{ $labels.instance }} interface {{ $labels.device }} has a transmit
    error ratio of {{ $value | humanizePercentage }} over the last two minutes.
NodeHighNumberConntrackEntriesUsed
# Fires when the number of conntrack entries exceeds 75% of the conntrack
# entries limit. There is no `for:` clause, so no hold duration is configured.
# The summary names the thing being counted ("conntrack entries"); the
# description string is unchanged.
alert: NodeHighNumberConntrackEntriesUsed
expr: |
  (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
  severity: warning
annotations:
  summary: Number of conntrack entries is getting close to the limit.
  description: >-
    {{ $value | humanizePercentage }} of conntrack entries are used.
NodeTextFileCollectorScrapeError
# Fires when node_textfile_scrape_error equals 1 for the "node" job.
# There is no `for:` clause, so no hold duration is configured.
alert: NodeTextFileCollectorScrapeError
expr: |
  node_textfile_scrape_error{job="node"} == 1
labels:
  severity: warning
annotations:
  summary: Node Exporter text file collector failed to scrape.
  description: >-
    Node Exporter text file collector failed to scrape.
NodeClockSkewDetected
# Fires when the clock offset is beyond +/-0.05s and is not moving back
# towards zero: either offset > 0.05 with a non-negative 5m derivative, or
# offset < -0.05 with a non-positive 5m derivative. Must hold for 10m.
alert: NodeClockSkewDetected
expr: |
  (
  node_timex_offset_seconds{job="node"} > 0.05
  and
  deriv(node_timex_offset_seconds{job="node"}[5m]) >= 0
  )
  or
  (
  node_timex_offset_seconds{job="node"} < -0.05
  and
  deriv(node_timex_offset_seconds{job="node"}[5m]) <= 0
  )
for: 10m
labels:
  severity: warning
annotations:
  summary: Clock skew detected.
  description: >-
    Clock on {{ $labels.instance }} is out of sync by more than 0.05s.
    Ensure NTP is configured correctly on this host.
NodeClockNotSynchronising
# Fires when node_timex_sync_status has had a minimum of 0 over the last 5m
# and node_timex_maxerror_seconds is at least 16, sustained for 10m.
alert: NodeClockNotSynchronising
expr: |
  min_over_time(node_timex_sync_status{job="node"}[5m]) == 0
  and
  node_timex_maxerror_seconds{job="node"} >= 16
for: 10m
labels:
  severity: warning
annotations:
  summary: Clock not synchronising.
  description: >-
    Clock on {{ $labels.instance }} is not synchronising.
    Ensure NTP is configured on this host.
NodeRAIDDegraded
# Fires when, for an md device, the number of required disks minus the number
# of disks in state "active" is greater than zero for 15m. The subtraction
# ignores the `state` label so the two series can be matched.
alert: NodeRAIDDegraded
expr: |
  node_md_disks_required{job="node",device!=""} - ignoring (state) (node_md_disks{state="active",job="node",device!=""}) > 0
for: 15m
labels:
  severity: critical
annotations:
  summary: RAID Array is degraded
  description: >-
    RAID array '{{ $labels.device }}' on {{ $labels.instance }}
    is in degraded state due to one or more disks failures.
    Number of spare drives is insufficient to fix issue automatically.
NodeRAIDDiskFailure
# Fires when an md device reports more than zero disks in state "failed".
# There is no `for:` clause, so no hold duration is configured.
alert: NodeRAIDDiskFailure
expr: |
  node_md_disks{state="failed",job="node",device!=""} > 0
labels:
  severity: warning
annotations:
  summary: Failed device in RAID array
  description: >-
    At least one device in RAID array on {{ $labels.instance }} failed.
    Array '{{ $labels.device }}' needs attention and possibly a disk swap.
NodeFileDescriptorLimit
# Warning tier: allocated file descriptors exceed 70% of the maximum,
# sustained for 15m. The alert value is the usage percentage.
alert: NodeFileDescriptorLimit
expr: |
  (
  node_filefd_allocated{job="node"} * 100 / node_filefd_maximum{job="node"} > 70
  )
for: 15m
labels:
  severity: warning
annotations:
  summary: Kernel is predicted to exhaust file descriptors limit soon.
  description: >-
    File descriptors limit at {{ $labels.instance }}
    is currently at {{ printf "%.2f" $value }}%.
NodeFileDescriptorLimit
# Critical tier: allocated file descriptors exceed 90% of the maximum,
# sustained for 15m. The alert value is the usage percentage.
alert: NodeFileDescriptorLimit
expr: |
  (
  node_filefd_allocated{job="node"} * 100 / node_filefd_maximum{job="node"} > 90
  )
for: 15m
labels:
  severity: critical
annotations:
  summary: Kernel is predicted to exhaust file descriptors limit soon.
  description: >-
    File descriptors limit at {{ $labels.instance }}
    is currently at {{ printf "%.2f" $value }}%.
Recording rules
The complete list of pregenerated recording rules is available here.
node-exporter.rules
instance:node_num_cpu:sum
# Counts the idle-mode node_cpu_seconds_total series after dropping the
# `cpu` and `mode` labels, i.e. one count per remaining label set.
record: instance:node_num_cpu:sum
expr: |
  count without (cpu, mode) (
  node_cpu_seconds_total{job="node",mode="idle"}
  )
instance:node_cpu_utilisation:rate5m
# One minus the average across CPUs of the summed 5m rates of the idle,
# iowait and steal modes.
record: instance:node_cpu_utilisation:rate5m
expr: |
  1 - avg without (cpu) (
  sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m]))
  )
instance:node_load1_per_cpu:ratio
# node_load1 divided by the recorded instance:node_num_cpu:sum series.
record: instance:node_load1_per_cpu:ratio
expr: |
  (
  node_load1{job="node"}
  /
  instance:node_num_cpu:sum{job="node"}
  )
instance:node_memory_utilisation:ratio
# One minus (available memory / MemTotal). Available memory is
# node_memory_MemAvailable_bytes; the `or` branch supplies
# Buffers + Cached + MemFree + Slab for label sets where MemAvailable
# has no sample.
record: instance:node_memory_utilisation:ratio
expr: |
  1 - (
  (
  node_memory_MemAvailable_bytes{job="node"}
  or
  (
  node_memory_Buffers_bytes{job="node"}
  +
  node_memory_Cached_bytes{job="node"}
  +
  node_memory_MemFree_bytes{job="node"}
  +
  node_memory_Slab_bytes{job="node"}
  )
  )
  /
  node_memory_MemTotal_bytes{job="node"}
  )
instance:node_vmstat_pgmajfault:rate5m
# 5m per-second rate of node_vmstat_pgmajfault for the "node" job.
record: instance:node_vmstat_pgmajfault:rate5m
expr: |
  rate(node_vmstat_pgmajfault{job="node"}[5m])
instance_device:node_disk_io_time_seconds:rate5m
# 5m rate of node_disk_io_time_seconds_total, restricted to series with a
# non-empty `device` label.
record: instance_device:node_disk_io_time_seconds:rate5m
expr: |
  rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m])
instance_device:node_disk_io_time_weighted_seconds:rate5m
# 5m rate of node_disk_io_time_weighted_seconds_total, restricted to series
# with a non-empty `device` label.
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
expr: |
  rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m])
instance:node_network_receive_bytes_excluding_lo:rate5m
# Sum over devices of the 5m receive-bytes rate, excluding the "lo" device.
record: instance:node_network_receive_bytes_excluding_lo:rate5m
expr: |
  sum without (device) (
  rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m])
  )
instance:node_network_transmit_bytes_excluding_lo:rate5m
# Sum over devices of the 5m transmit-bytes rate, excluding the "lo" device.
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
expr: |
  sum without (device) (
  rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m])
  )
instance:node_network_receive_drop_excluding_lo:rate5m
# Sum over devices of the 5m receive-drop rate, excluding the "lo" device.
record: instance:node_network_receive_drop_excluding_lo:rate5m
expr: |
  sum without (device) (
  rate(node_network_receive_drop_total{job="node", device!="lo"}[5m])
  )
instance:node_network_transmit_drop_excluding_lo:rate5m
# Sum over devices of the 5m transmit-drop rate, excluding the "lo" device.
record: instance:node_network_transmit_drop_excluding_lo:rate5m
expr: |
  sum without (device) (
  rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m])
  )
Dashboards
The following dashboards are generated from mixins and hosted on GitHub: