node-exporter

Overview

The Node Mixin is a set of configurable, reusable, and extensible alerts and dashboards based on the metrics exported by the Node Exporter. The mixin creates recording and alerting rules for Prometheus and suitable dashboard descriptions for Grafana.

Jsonnet source code is available at github.com/prometheus/node_exporter

Alerts

The complete list of pregenerated alerts is available here.

node-exporter

NodeFilesystemSpaceFillingUp

# Warning tier: the filesystem has less than 40% of its bytes available, a
# linear fit over the last 6h of samples predicts the available bytes drop
# below zero within 24h, and node_filesystem_readonly is 0.
alert: NodeFilesystemSpaceFillingUp
expr: |
  (
    node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40
  and
    predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem is predicted to run out of space within the next 24 hours.
  # $value is the available-space percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left and is filling up.

NodeFilesystemSpaceFillingUp

# Critical tier: the filesystem has less than 20% of its bytes available, a
# linear fit over the last 6h of samples predicts the available bytes drop
# below zero within 4h, and node_filesystem_readonly is 0.
alert: NodeFilesystemSpaceFillingUp
expr: |
  (
    node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20
  and
    predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem is predicted to run out of space within the next 4 hours.
  # $value is the available-space percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left and is filling up fast.

NodeFilesystemAlmostOutOfSpace

# Warning tier: the filesystem has less than 5% of its bytes available and
# node_filesystem_readonly is 0. No growth prediction is involved.
alert: NodeFilesystemAlmostOutOfSpace
expr: |
  (
    node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem has less than 5% space left.
  # $value is the available-space percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left.

NodeFilesystemAlmostOutOfSpace

# Critical tier: the filesystem has less than 3% of its bytes available and
# node_filesystem_readonly is 0. No growth prediction is involved.
alert: NodeFilesystemAlmostOutOfSpace
expr: |
  (
    node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem has less than 3% space left.
  # $value is the available-space percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available space left.

NodeFilesystemFilesFillingUp

# Warning tier: less than 40% of the filesystem's inodes are free, a linear
# fit over the last 6h of samples predicts the free count drops below zero
# within 24h, and node_filesystem_readonly is 0.
alert: NodeFilesystemFilesFillingUp
expr: |
  (
    node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40
  and
    predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem is predicted to run out of inodes within the next 24 hours.
  # $value is the free-inode percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left and is filling up.

NodeFilesystemFilesFillingUp

# Critical tier: less than 20% of the filesystem's inodes are free, a linear
# fit over the last 6h of samples predicts the free count drops below zero
# within 4h, and node_filesystem_readonly is 0.
alert: NodeFilesystemFilesFillingUp
expr: |
  (
    node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20
  and
    predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem is predicted to run out of inodes within the next 4 hours.
  # $value is the free-inode percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left and is filling up fast.

NodeFilesystemAlmostOutOfFiles

# Warning tier: less than 5% of the filesystem's inodes are free and
# node_filesystem_readonly is 0. No growth prediction is involved.
alert: NodeFilesystemAlmostOutOfFiles
expr: |
  (
    node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: warning
annotations:
  summary: Filesystem has less than 5% inodes left.
  # $value is the free-inode percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left.

NodeFilesystemAlmostOutOfFiles

# Critical tier: less than 3% of the filesystem's inodes are free and
# node_filesystem_readonly is 0. No growth prediction is involved.
alert: NodeFilesystemAlmostOutOfFiles
expr: |
  (
    node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3
  and
    node_filesystem_readonly{job="node",fstype!=""} == 0
  )
# The condition has to hold continuously for one hour before the alert fires.
for: 1h
labels:
  severity: critical
annotations:
  summary: Filesystem has less than 3% inodes left.
  # $value is the free-inode percentage (left-hand operand of the `and`).
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
    {{ printf "%.2f" $value }}% available inodes left.

NodeNetworkReceiveErrs

# Fires when, for a full hour, the 2m receive-error rate of an interface
# exceeds 1% of its 2m receive-packet rate.
alert: NodeNetworkReceiveErrs
annotations:
  # $value is the errors/packets ratio produced by expr (a fraction, not an
  # error count). It is rendered as a percentage because formatting it with
  # "%.0f" would round it to 0 or 1 and read as a bogus error count.
  description: '{{ $labels.instance }} interface {{ $labels.device }} has a receive
    error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
  summary: Network interface is reporting many receive errors.
expr: |
  rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning

NodeNetworkTransmitErrs

# Fires when, for a full hour, the 2m transmit-error rate of an interface
# exceeds 1% of its 2m transmit-packet rate.
alert: NodeNetworkTransmitErrs
annotations:
  # $value is the errors/packets ratio produced by expr (a fraction, not an
  # error count). It is rendered as a percentage because formatting it with
  # "%.0f" would round it to 0 or 1 and read as a bogus error count.
  description: '{{ $labels.instance }} interface {{ $labels.device }} has a transmit
    error ratio of {{ $value | humanizePercentage }} over the last two minutes.'
  summary: Network interface is reporting many transmit errors.
expr: |
  rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
  severity: warning

NodeHighNumberConntrackEntriesUsed

# Fires as soon as the ratio of conntrack entries in use to the configured
# limit exceeds 0.75. There is no `for:` clause, so no hold period applies.
alert: NodeHighNumberConntrackEntriesUsed
annotations:
  # $value is the used/limit ratio from expr, rendered as a percentage.
  description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
  # Wording fixed: the original summary dropped the noun "entries".
  summary: Number of conntrack entries is getting close to the limit.
expr: |
  (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
  severity: warning

NodeTextFileCollectorScrapeError

# Fires as soon as node_textfile_scrape_error equals 1 for the "node" job.
# There is no `for:` clause, so no hold period applies.
alert: NodeTextFileCollectorScrapeError
expr: |
  node_textfile_scrape_error{job="node"} == 1
labels:
  severity: warning
annotations:
  # Summary and description intentionally carry the same text.
  summary: Node Exporter text file collector failed to scrape.
  description: Node Exporter text file collector failed to scrape.

NodeClockSkewDetected

# Fires when, for 10 minutes, the clock offset is beyond +/-0.05s and its
# derivative over 5m shows it is not moving back toward zero (>= 0 for a
# positive offset, <= 0 for a negative one).
alert: NodeClockSkewDetected
annotations:
  # The stated threshold must match expr (0.05s); the text previously claimed
  # 300s, which the expression never checks.
  # NOTE(review): most other rules in this file use `description` instead of
  # `message`; the key is left unchanged here in case notification templates
  # read it.
  message: Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure
    NTP is configured correctly on this host.
  summary: Clock skew detected.
expr: |
  (
    node_timex_offset_seconds > 0.05
  and
    deriv(node_timex_offset_seconds[5m]) >= 0
  )
  or
  (
    node_timex_offset_seconds < -0.05
  and
    deriv(node_timex_offset_seconds[5m]) <= 0
  )
for: 10m
labels:
  severity: warning

NodeClockNotSynchronising

# Fires when, for 10 minutes, the minimum of node_timex_sync_status over the
# last 5m is 0 and node_timex_maxerror_seconds is at least 16.
alert: NodeClockNotSynchronising
expr: |
  min_over_time(node_timex_sync_status[5m]) == 0
  and
  node_timex_maxerror_seconds >= 16
for: 10m
labels:
  severity: warning
annotations:
  summary: Clock not synchronising.
  # This rule carries its long text under `message`, not `description`.
  message: >-
    Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is
    configured on this host.

NodeRAIDDegraded

# Fires when, for 15 minutes, an md array requires more disks than it has in
# the "active" state. `ignoring (state)` lets the two series match even though
# only node_md_disks is selected by a state label here.
alert: NodeRAIDDegraded
expr: |
  node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
  severity: critical
annotations:
  summary: RAID Array is degraded
  description: >-
    RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in
    degraded state due to one or more disks failures. Number of spare drives
    is insufficient to fix issue automatically.

NodeRAIDDiskFailure

# Fires as soon as any md array reports more than zero disks in the "fail"
# state. There is no `for:` clause, so no hold period applies.
alert: NodeRAIDDiskFailure
expr: |
  node_md_disks{state="fail"} > 0
labels:
  severity: warning
annotations:
  summary: Failed device in RAID array
  description: >-
    At least one device in RAID array on {{ $labels.instance }} failed.
    Array '{{ $labels.device }}' needs attention and possibly a disk swap.

Recording rules

The complete list of pregenerated recording rules is available here.

node-exporter.rules

instance:node_num_cpu:sum

# Number of CPUs per instance: the inner count collapses the `mode` label to
# one series per cpu, and the outer count then collapses the `cpu` label.
record: instance:node_num_cpu:sum
expr: |
  count without (cpu) (
    count without (mode) (
      node_cpu_seconds_total{job="node"}
    )
  )

instance:node_cpu_utilisation:rate1m

# CPU utilisation as 1 minus the 1m per-second rate of idle-mode CPU seconds,
# averaged with the `cpu` and `mode` labels removed.
record: instance:node_cpu_utilisation:rate1m
expr: |
  1 - avg without (cpu, mode) (
    rate(node_cpu_seconds_total{job="node", mode="idle"}[1m])
  )

instance:node_load1_per_cpu:ratio

# node_load1 divided by the recorded instance:node_num_cpu:sum series.
record: instance:node_load1_per_cpu:ratio
expr: |
  (
    node_load1{job="node"}
  /
    instance:node_num_cpu:sum{job="node"}
  )

instance:node_memory_utilisation:ratio

# Memory utilisation as 1 minus the ratio of MemAvailable to MemTotal.
record: instance:node_memory_utilisation:ratio
expr: |
  1 - (
    node_memory_MemAvailable_bytes{job="node"}
  /
    node_memory_MemTotal_bytes{job="node"}
  )

instance:node_vmstat_pgmajfault:rate1m

# Per-second rate of node_vmstat_pgmajfault over a 1m window.
record: instance:node_vmstat_pgmajfault:rate1m
expr: |
  rate(node_vmstat_pgmajfault{job="node"}[1m])

instance_device:node_disk_io_time_seconds:rate1m

# Per-second rate of node_disk_io_time_seconds_total over a 1m window, for
# series whose `device` label is non-empty.
record: instance_device:node_disk_io_time_seconds:rate1m
expr: |
  rate(node_disk_io_time_seconds_total{job="node", device!=""}[1m])

instance_device:node_disk_io_time_weighted_seconds:rate1m

# Per-second rate of node_disk_io_time_weighted_seconds_total over a 1m
# window, for series whose `device` label is non-empty.
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
expr: |
  rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[1m])

instance:node_network_receive_bytes_excluding_lo:rate1m

# 1m receive byte rate summed across devices (the `device` label is dropped),
# leaving out the device named "lo".
record: instance:node_network_receive_bytes_excluding_lo:rate1m
expr: |
  sum without (device) (
    rate(node_network_receive_bytes_total{job="node", device!="lo"}[1m])
  )

instance:node_network_transmit_bytes_excluding_lo:rate1m

# 1m transmit byte rate summed across devices (the `device` label is dropped),
# leaving out the device named "lo".
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
expr: |
  sum without (device) (
    rate(node_network_transmit_bytes_total{job="node", device!="lo"}[1m])
  )

instance:node_network_receive_drop_excluding_lo:rate1m

# 1m rate of node_network_receive_drop_total summed across devices (the
# `device` label is dropped), leaving out the device named "lo".
record: instance:node_network_receive_drop_excluding_lo:rate1m
expr: |
  sum without (device) (
    rate(node_network_receive_drop_total{job="node", device!="lo"}[1m])
  )

instance:node_network_transmit_drop_excluding_lo:rate1m

# 1m rate of node_network_transmit_drop_total summed across devices (the
# `device` label is dropped), leaving out the device named "lo".
record: instance:node_network_transmit_drop_excluding_lo:rate1m
expr: |
  sum without (device) (
    rate(node_network_transmit_drop_total{job="node", device!="lo"}[1m])
  )

Dashboards

The following dashboards are generated from the mixin and hosted on GitHub: