ceph
Overview
A set of Prometheus alerts for Ceph.
This project provides Ceph-specific Prometheus rule files built with Prometheus Mixins.
The Jsonnet source code is available at github.com/ceph/ceph-mixins.
Alerts
The complete list of pregenerated alerts is available here.
ceph-mgr-status
CephMgrIsAbsent
# Fires when no rook-ceph-mgr scrape target has reported up == 1 for 5 minutes.
alert: CephMgrIsAbsent
expr: |
  absent(up{job="rook-ceph-mgr"} == 1)
for: 5m
labels:
  severity: critical
annotations:
  description: Ceph Manager has disappeared from Prometheus target discovery.
  message: Storage metrics collector service not available anymore.
  severity_level: critical
  storage_type: ceph
CephMgrIsMissingReplicas
# Fires when the sum of up{job="rook-ceph-mgr"} has been below 1 for 5 minutes.
# The message spells out "number of" instead of the ambiguous abbreviation
# "no of", which could be read as "no replicas".
alert: CephMgrIsMissingReplicas
annotations:
  description: Ceph Manager is missing replicas.
  message: Storage metrics collector service doesn't have the required number of replicas.
  severity_level: warning
  storage_type: ceph
expr: |
  sum(up{job="rook-ceph-mgr"}) < 1
for: 5m
labels:
  severity: warning
ceph-mds-status
CephMdsMissingReplicas
# Fires when fewer than 2 ceph_mds_metadata series with value 1 have been
# present for 5 minutes.
alert: CephMdsMissingReplicas
expr: |
  sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) < 2
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    Minimum required replicas for storage metadata service not available.
    Might affect the working of storage cluster.
  message: Insufficient replicas for storage metadata service.
  severity_level: warning
  storage_type: ceph
quorum-alert.rules
CephMonQuorumAtRisk
# Fires when, for 15 minutes, the number of monitors reporting quorum status 1
# has been at or below the bare majority of the monitors listed in
# ceph_mon_metadata, i.e. floor(N / 2) + 1.
# The threshold uses floor(N / 2) rather than N % 2: the modulo form can only
# evaluate to 1 or 2, which matches the majority for 3 monitors but is too low
# for larger monitor counts (e.g. 5 monitors need 3, not 2).
alert: CephMonQuorumAtRisk
annotations:
  description: Storage cluster quorum is low. Contact Support.
  message: Storage quorum at risk
  severity_level: error
  storage_type: ceph
expr: |
  count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) / 2) + 1)
for: 15m
labels:
  severity: critical
CephMonHighNumberOfLeaderChanges
# Fires per monitor when the 5m rate of ceph_mon_num_elections, scaled to a
# per-minute value, has stayed above 0.95 for 5 minutes.
alert: CephMonHighNumberOfLeaderChanges
expr: |
  (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }}
    has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
  message: Storage Cluster has seen many leader changes recently.
  severity_level: warning
  storage_type: ceph
ceph-node-alert.rules
CephNodeDown
# Fires when the cluster:ceph_node_down:join_kube recording rule has been 0
# for a node for 30 seconds.
alert: CephNodeDown
expr: |
  cluster:ceph_node_down:join_kube == 0
for: 30s
labels:
  severity: critical
annotations:
  description: Storage node {{ $labels.node }} went down. Please check the node immediately.
  message: Storage node {{ $labels.node }} went down
  severity_level: error
  storage_type: ceph
osd-alert.rules
CephOSDCriticallyFull
# Fires per OSD when used bytes / total bytes has been at or above 0.85 for
# 40 seconds.
alert: CephOSDCriticallyFull
expr: |
  (ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.85
for: 40s
labels:
  severity: critical
annotations:
  description: >-
    Utilization of back-end storage device {{ $labels.ceph_daemon }}
    has crossed 85% on host {{ $labels.hostname }}.
    Immediately free up some space or expand the storage cluster
    or contact support.
  message: Back-end storage device is critically full.
  severity_level: error
  storage_type: ceph
CephOSDNearFull
# Fires per OSD when used bytes / total bytes has been at or above 0.75 for
# 40 seconds.
alert: CephOSDNearFull
expr: |
  (ceph_osd_metadata * on (ceph_daemon) group_left() (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
for: 40s
labels:
  severity: warning
annotations:
  description: >-
    Utilization of back-end storage device {{ $labels.ceph_daemon }}
    has crossed 75% on host {{ $labels.hostname }}.
    Free up some space or expand the storage cluster or contact support.
  message: Back-end storage device is nearing full.
  severity_level: warning
  storage_type: ceph
CephOSDDiskNotResponding
# Fires for OSDs that are marked in (ceph_osd_in == 1) but not up
# (ceph_osd_up == 0) for 1 minute; host and device labels are joined in from
# ceph_disk_occupation.
alert: CephOSDDiskNotResponding
expr: |
  label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    Disk device {{ $labels.device }} not responding,
    on host {{ $labels.host }}.
  message: Disk not responding
  severity_level: error
  storage_type: ceph
CephOSDDiskUnavailable
# Fires for OSDs that are neither in (ceph_osd_in == 0) nor up
# (ceph_osd_up == 0) for 1 minute; host and device labels are joined in from
# ceph_disk_occupation.
alert: CephOSDDiskUnavailable
expr: |
  label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    Disk device {{ $labels.device }} not accessible
    on host {{ $labels.host }}.
  message: Disk not accessible
  severity_level: error
  storage_type: ceph
CephDataRecoveryTakingTooLong
# Fires when ceph_pg_undersized has stayed above 0 for 2 hours.
alert: CephDataRecoveryTakingTooLong
expr: |
  ceph_pg_undersized > 0
for: 2h
labels:
  severity: warning
annotations:
  description: Data recovery has been active for too long. Contact Support.
  message: Data recovery is slow
  severity_level: warning
  storage_type: ceph
CephPGRepairTakingTooLong
# Fires when ceph_pg_inconsistent has stayed above 0 for 1 hour.
alert: CephPGRepairTakingTooLong
expr: |
  ceph_pg_inconsistent > 0
for: 1h
labels:
  severity: warning
annotations:
  description: Self heal operations taking too long. Contact Support.
  message: Self heal problems detected
  severity_level: warning
  storage_type: ceph
persistent-volume-alert.rules
PersistentVolumeUsageNearFull
# Fires when used / capacity bytes of a PVC exceeds 0.75 for 5 seconds.
# Only PVCs whose storage class provisioner matches the rbd/cephfs CSI regex
# are considered.
alert: PersistentVolumeUsageNearFull
expr: |
  (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
for: 5s
labels:
  severity: warning
annotations:
  description: >-
    PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
    Free up some space or expand the PVC.
  message: >-
    PVC {{ $labels.persistentvolumeclaim }} is nearing full.
    Data deletion or PVC expansion is required.
  severity_level: warning
  storage_type: ceph
PersistentVolumeUsageCritical
# Fires when used / capacity bytes of a PVC exceeds 0.85 for 5 seconds.
# Only PVCs whose storage class provisioner matches the rbd/cephfs CSI regex
# are considered.
alert: PersistentVolumeUsageCritical
expr: |
  (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
for: 5s
labels:
  severity: critical
annotations:
  description: >-
    PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
    Free up some space or expand the PVC immediately.
  message: >-
    PVC {{ $labels.persistentvolumeclaim }} is critically full.
    Data deletion or PVC expansion is required.
  severity_level: error
  storage_type: ceph
cluster-state-alert.rules
CephClusterErrorState
# Fires when ceph_health_status has been greater than 1 for 10 minutes.
alert: CephClusterErrorState
expr: |
  ceph_health_status{job="rook-ceph-mgr"} > 1
for: 10m
labels:
  severity: critical
annotations:
  description: Storage cluster is in error state for more than 10m.
  message: Storage cluster is in error state
  severity_level: error
  storage_type: ceph
CephClusterWarningState
# Fires when ceph_health_status has been exactly 1 for 10 minutes.
alert: CephClusterWarningState
expr: |
  ceph_health_status{job="rook-ceph-mgr"} == 1
for: 10m
labels:
  severity: warning
annotations:
  description: Storage cluster is in warning state for more than 10m.
  message: Storage cluster is in degraded state
  severity_level: warning
  storage_type: ceph
CephOSDVersionMismatch
# Fires when ceph_osd_metadata has reported more than one distinct
# ceph_version label value for 10 minutes.
alert: CephOSDVersionMismatch
expr: |
  count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
for: 10m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different versions of Ceph OSD components running.
  message: There are multiple versions of storage services running.
  severity_level: warning
  storage_type: ceph
CephMonVersionMismatch
# Fires when ceph_mon_metadata has reported more than one distinct
# ceph_version label value for 10 minutes.
alert: CephMonVersionMismatch
expr: |
  count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1
for: 10m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different versions of Ceph Mon components running.
  message: There are multiple versions of storage services running.
  severity_level: warning
  storage_type: ceph
cluster-utilization-alert.rules
CephClusterNearFull
# Fires when raw used bytes / total bytes of the cluster exceeds 0.75 for
# 5 seconds.
alert: CephClusterNearFull
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75
for: 5s
labels:
  severity: warning
annotations:
  description: >-
    Storage cluster utilization has crossed 75% and will become
    read-only at 85%.
    Free up some space or expand the storage cluster.
  message: >-
    Storage cluster is nearing full.
    Data deletion or cluster expansion is required.
  severity_level: warning
  storage_type: ceph
CephClusterCriticallyFull
# Fires when raw used bytes / total bytes of the cluster exceeds 0.80 for
# 5 seconds.
alert: CephClusterCriticallyFull
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80
for: 5s
labels:
  severity: critical
annotations:
  description: >-
    Storage cluster utilization has crossed 80% and will become
    read-only at 85%.
    Free up some space or expand the storage cluster immediately.
  message: >-
    Storage cluster is critically full and needs immediate
    data deletion or cluster expansion.
  severity_level: error
  storage_type: ceph
CephClusterReadOnly
# Fires as soon as raw used bytes / total bytes of the cluster reaches 0.85
# (the "for" duration is 0s).
alert: CephClusterReadOnly
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85
for: 0s
labels:
  severity: critical
annotations:
  description: >-
    Storage cluster utilization has crossed 85% and will become
    read-only now.
    Free up some space or expand the storage cluster immediately.
  message: >-
    Storage cluster is read-only now and needs immediate
    data deletion or cluster expansion.
  severity_level: error
  storage_type: ceph
Recording rules
The complete list of pregenerated recording rules is available here.
ceph.rules
cluster:ceph_node_down:join_kube
# Multiplies the kube-state-metrics "Ready" condition (status="true") of each
# node with the per-node maximum of ceph_disk_occupation, where the node label
# is copied from exported_instance.
record: cluster:ceph_node_down:join_kube
expr: |
  kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)
cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
# Average, across instances, of the per-instance maximum disk latency for the
# devices listed in ceph_disk_occupation.
# Latency is (read time rate + write time rate) divided by
# (completed reads rate, clamped to at least 1, + completed writes rate).
# The numerator is parenthesised explicitly: "/" binds tighter than "+" in
# PromQL, so without the parentheses only the write time would be divided and
# the raw read time rate would be added on top.
expr: |
  avg(max by(instance) (label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)") * on(instance, device) group_right() ((irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m])) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
telemeter.rules
job:ceph_osd_metadata:count
# Number of ceph_osd_metadata series exposed by the rook-ceph-mgr job.
record: job:ceph_osd_metadata:count
expr: |
  count(ceph_osd_metadata{job="rook-ceph-mgr"})
job:kube_pv:count
# Number of persistent volumes whose storage class provisioner matches the
# rbd/cephfs CSI regex.
record: job:kube_pv:count
expr: |
  count(kube_persistentvolume_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})
job:ceph_pools_iops:total
# Sum over all pools of ceph_pool_rd + ceph_pool_wr.
record: job:ceph_pools_iops:total
expr: |
  sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"})
job:ceph_pools_iops_bytes:total
# Sum over all pools of ceph_pool_rd_bytes + ceph_pool_wr_bytes.
record: job:ceph_pools_iops_bytes:total
expr: |
  sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"})
job:ceph_versions_running:count
# Number of distinct ceph_version label values across the mon, osd, rgw, mds
# and mgr metadata series.
record: job:ceph_versions_running:count
expr: |
  count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))