ceph
Overview
A set of Prometheus alerts for Ceph.
The scope of this project is to provide Ceph-specific Prometheus rule files using Prometheus Mixins.
Jsonnet source code is available at github.com/ceph/ceph-mixins
Alerts
A complete list of pregenerated alerts is available here.
ceph-mgr-status
CephMgrIsAbsent
# Fires once the `up` series for job "rook-ceph-mgr" has been 0 or absent for 5m.
# label_replace stamps a fixed namespace="openshift-storage" label on the result.
alert: CephMgrIsAbsent
expr: |
  label_replace((up{job="rook-ceph-mgr"} == 0 or absent(up{job="rook-ceph-mgr"})), "namespace", "openshift-storage", "", "")
for: 5m
labels:
  severity: critical
annotations:
  description: Ceph Manager has disappeared from Prometheus target discovery.
  message: Storage metrics collector service not available anymore.
  severity_level: critical
  storage_type: ceph
CephMgrIsMissingReplicas
# Fires when the per-namespace sum of spec replicas over deployments matching
# "rook-ceph-mgr-.*" stays below 1 for 5m.
alert: CephMgrIsMissingReplicas
expr: |
  sum(kube_deployment_spec_replicas{deployment=~"rook-ceph-mgr-.*"}) by (namespace) < 1
for: 5m
labels:
  severity: warning
annotations:
  description: Ceph Manager is missing replicas.
  message: Storage metrics collector service doesn't have required no of replicas.
  severity_level: warning
  storage_type: ceph
ceph-mds-status
CephMdsMissingReplicas
# Fires when fewer than two ceph_mds_metadata series with value 1 exist per
# namespace for 5m.
alert: CephMdsMissingReplicas
expr: |
  sum(ceph_mds_metadata{job="rook-ceph-mgr"} == 1) by (namespace) < 2
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    Minimum required replicas for storage metadata service not available.
    Might affect the working of storage cluster.
  message: Insufficient replicas for storage metadata service.
  severity_level: warning
  storage_type: ceph
quorum-alert.rules
CephMonQuorumAtRisk
# Fires when, per namespace, the number of monitors reporting quorum status 1
# is at or below floor(total monitors / 2) + 1 for 15m.
alert: CephMonQuorumAtRisk
expr: |
  count(ceph_mon_quorum_status{job="rook-ceph-mgr"} == 1) by (namespace) <= (floor(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (namespace) / 2) + 1)
for: 15m
labels:
  severity: critical
annotations:
  description: Storage cluster quorum is low. Contact Support.
  message: Storage quorum at risk
  severity_level: error
  storage_type: ceph
CephMonQuorumLost
# Fires when fewer than two pods matching "rook-ceph-mon-.*" per namespace
# report phase Running (either capitalisation) for 5m.
alert: CephMonQuorumLost
expr: |
  count(kube_pod_status_phase{pod=~"rook-ceph-mon-.*", phase=~"Running|running"} == 1) by (namespace) < 2
for: 5m
labels:
  severity: critical
annotations:
  description: Storage cluster quorum is lost. Contact Support.
  message: Storage quorum is lost
  severity_level: critical
  storage_type: ceph
CephMonHighNumberOfLeaderChanges
# Fires when a monitor's election rate, scaled to per-minute (5m rate * 60),
# exceeds 0.95 for 5m. The join with ceph_mon_metadata is on ceph_daemon.
alert: CephMonHighNumberOfLeaderChanges
expr: |
  (ceph_mon_metadata{job="rook-ceph-mgr"} * on (ceph_daemon) group_left() (rate(ceph_mon_num_elections{job="rook-ceph-mgr"}[5m]) * 60)) > 0.95
for: 5m
labels:
  severity: warning
annotations:
  description: >-
    Ceph Monitor {{ $labels.ceph_daemon }} on host {{ $labels.hostname }}
    has seen {{ $value | printf "%.2f" }} leader changes per minute recently.
  message: Storage Cluster has seen many leader changes recently.
  severity_level: warning
  storage_type: ceph
ceph-node-alert.rules
CephNodeDown
# Fires when the recording rule cluster:ceph_node_down:join_kube evaluates to 0
# for 30s.
alert: CephNodeDown
expr: |
  cluster:ceph_node_down:join_kube == 0
for: 30s
labels:
  severity: critical
annotations:
  description: Storage node {{ $labels.node }} went down. Please check the node immediately.
  message: Storage node {{ $labels.node }} went down
  severity_level: error
  storage_type: ceph
osd-alert.rules
CephOSDCriticallyFull
# Fires when an OSD's used/total byte ratio is at or above 0.80 for 40s.
# The join on ceph_daemon pulls device_class and hostname from ceph_osd_metadata.
alert: CephOSDCriticallyFull
expr: |
  (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.80
for: 40s
labels:
  severity: critical
annotations:
  description: >-
    Utilization of storage device {{ $labels.ceph_daemon }} of device_class
    type {{$labels.device_class}} has crossed 80% on host {{ $labels.hostname }}.
    Immediately free up some space or add capacity of type {{$labels.device_class}}.
  message: Back-end storage device is critically full.
  severity_level: error
  storage_type: ceph
CephOSDFlapping
# Fires immediately (for: 0s) when ceph_osd_up changed value at least 10 times
# within the last 5m.
# NOTE(review): presumably each restart is one down plus one up transition,
# which is how 10 changes map to the "5 times" in the description — confirm.
alert: CephOSDFlapping
expr: |
  changes(ceph_osd_up[5m]) >= 10
for: 0s
labels:
  severity: critical
annotations:
  description: >-
    Storage daemon {{ $labels.ceph_daemon }} has restarted 5 times in last
    5 minutes. Please check the pod events or ceph status to find out the cause.
  message: Ceph storage osd flapping.
  severity_level: error
  storage_type: ceph
CephOSDNearFull
# Fires when an OSD's used/total byte ratio is at or above 0.75 for 40s.
# The join on ceph_daemon pulls device_class and hostname from ceph_osd_metadata.
alert: CephOSDNearFull
expr: |
  (ceph_osd_metadata * on (ceph_daemon) group_right(device_class,hostname) (ceph_osd_stat_bytes_used / ceph_osd_stat_bytes)) >= 0.75
for: 40s
labels:
  severity: warning
annotations:
  description: >-
    Utilization of storage device {{ $labels.ceph_daemon }} of device_class
    type {{$labels.device_class}} has crossed 75% on host {{ $labels.hostname }}.
    Immediately free up some space or add capacity of type {{$labels.device_class}}.
  message: Back-end storage device is nearing full.
  severity_level: warning
  storage_type: ceph
CephOSDDiskNotResponding
# Fires when an OSD is marked in (ceph_osd_in == 1) but not up (ceph_osd_up == 0)
# for 15m. The join with ceph_disk_occupation on ceph_daemon supplies the host
# (copied from exported_instance) and device labels used in the description.
alert: CephOSDDiskNotResponding
expr: |
  label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
for: 15m
labels:
  severity: critical
annotations:
  description: >-
    Disk device {{ $labels.device }} not responding, on host {{ $labels.host }}.
  message: Disk not responding
  severity_level: error
  storage_type: ceph
CephOSDDiskUnavailable
# Fires when an OSD is both out (ceph_osd_in == 0) and down (ceph_osd_up == 0)
# for 1m. The join with ceph_disk_occupation on ceph_daemon supplies the host
# (copied from exported_instance) and device labels used in the description.
alert: CephOSDDiskUnavailable
expr: |
  label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon) group_left(host, device) label_replace(ceph_disk_occupation,"host","$1","exported_instance","(.*)")
for: 1m
labels:
  severity: critical
annotations:
  description: >-
    Disk device {{ $labels.device }} not accessible on host {{ $labels.host }}.
  message: Disk not accessible
  severity_level: error
  storage_type: ceph
CephOSDSlowOps
# Fires when ceph_healthcheck_slow_ops stays above 0 for 30s; the metric value
# is interpolated into the description as the number of slow requests.
alert: CephOSDSlowOps
expr: |
  ceph_healthcheck_slow_ops > 0
for: 30s
labels:
  severity: warning
annotations:
  description: >-
    {{ $value }} Ceph OSD requests are taking too long to process.
    Please check ceph status to find out the cause.
  message: OSD requests are taking too long to process.
  severity_level: warning
  storage_type: ceph
CephDataRecoveryTakingTooLong
# Fires when ceph_pg_undersized has stayed above 0 for 2h.
alert: CephDataRecoveryTakingTooLong
expr: |
  ceph_pg_undersized > 0
for: 2h
labels:
  severity: warning
annotations:
  description: Data recovery has been active for too long. Contact Support.
  message: Data recovery is slow
  severity_level: warning
  storage_type: ceph
CephPGRepairTakingTooLong
# Fires when ceph_pg_inconsistent has stayed above 0 for 1h.
alert: CephPGRepairTakingTooLong
expr: |
  ceph_pg_inconsistent > 0
for: 1h
labels:
  severity: warning
annotations:
  description: Self heal operations taking too long. Contact Support.
  message: Self heal problems detected
  severity_level: warning
  storage_type: ceph
persistent-volume-alert.rules
PersistentVolumeUsageNearFull
# Fires when used/capacity bytes of a PVC exceeds 0.75 for 5s. Both sides are
# joined with kube_persistentvolumeclaim_info and kube_storageclass_info so only
# PVCs whose provisioner matches the rbd or cephfs CSI patterns are considered.
alert: PersistentVolumeUsageNearFull
expr: |
  (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.75
for: 5s
labels:
  severity: warning
annotations:
  description: >-
    PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 75%.
    Free up some space or expand the PVC.
  message: >-
    PVC {{ $labels.persistentvolumeclaim }} is nearing full.
    Data deletion or PVC expansion is required.
  severity_level: warning
  storage_type: ceph
PersistentVolumeUsageCritical
# Fires when used/capacity bytes of a PVC exceeds 0.85 for 5s. Both sides are
# joined with kube_persistentvolumeclaim_info and kube_storageclass_info so only
# PVCs whose provisioner matches the rbd or cephfs CSI patterns are considered.
alert: PersistentVolumeUsageCritical
expr: |
  (kubelet_volume_stats_used_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) / (kubelet_volume_stats_capacity_bytes * on (namespace,persistentvolumeclaim) group_left(storageclass, provisioner) (kube_persistentvolumeclaim_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})) > 0.85
for: 5s
labels:
  severity: critical
annotations:
  description: >-
    PVC {{ $labels.persistentvolumeclaim }} utilization has crossed 85%.
    Free up some space or expand the PVC immediately.
  message: >-
    PVC {{ $labels.persistentvolumeclaim }} is critically full.
    Data deletion or PVC expansion is required.
  severity_level: error
  storage_type: ceph
cluster-state-alert.rules
CephClusterErrorState
# Fires when ceph_health_status for job "rook-ceph-mgr" stays above 1 for 10m.
alert: CephClusterErrorState
expr: |
  ceph_health_status{job="rook-ceph-mgr"} > 1
for: 10m
labels:
  severity: critical
annotations:
  description: Storage cluster is in error state for more than 10m.
  message: Storage cluster is in error state
  severity_level: error
  storage_type: ceph
CephClusterWarningState
# Fires when ceph_health_status for job "rook-ceph-mgr" equals 1 for 15m.
# The description quotes the same 15m window as the `for` clause; keep the two
# in sync when tuning the hold time.
alert: CephClusterWarningState
expr: |
  ceph_health_status{job="rook-ceph-mgr"} == 1
for: 15m
labels:
  severity: warning
annotations:
  description: Storage cluster is in warning state for more than 15m.
  message: Storage cluster is in degraded state
  severity_level: warning
  storage_type: ceph
CephOSDVersionMismatch
# Fires when more than one distinct ceph_version is reported by OSDs within a
# namespace for 10m.
# The inner count collapses OSDs to one series per (ceph_version, namespace);
# the outer count groups by namespace only, so its value is the number of
# distinct versions. Grouping the outer count by ceph_version as well would
# always yield 1 and the alert could never fire.
alert: CephOSDVersionMismatch
expr: |
  count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version, namespace)) by (namespace) > 1
for: 10m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different versions of Ceph OSD components running.
  message: There are multiple versions of storage services running.
  severity_level: warning
  storage_type: ceph
CephMonVersionMismatch
# Fires when monitors with a non-empty ceph_version label report more than one
# distinct version for 10m. The outer count yields the number of versions.
alert: CephMonVersionMismatch
expr: |
  count(count(ceph_mon_metadata{job="rook-ceph-mgr", ceph_version != ""}) by (ceph_version)) > 1
for: 10m
labels:
  severity: warning
annotations:
  description: There are {{ $value }} different versions of Ceph Mon components running.
  message: There are multiple versions of storage services running.
  severity_level: warning
  storage_type: ceph
cluster-utilization-alert.rules
CephClusterNearFull
# Fires when the cluster's raw used/total byte ratio exceeds 0.75 for 5s.
alert: CephClusterNearFull
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75
for: 5s
labels:
  severity: warning
annotations:
  description: >-
    Storage cluster utilization has crossed 75% and will become read-only at 85%.
    Free up some space or expand the storage cluster.
  message: >-
    Storage cluster is nearing full.
    Data deletion or cluster expansion is required.
  severity_level: warning
  storage_type: ceph
CephClusterCriticallyFull
# Fires when the cluster's raw used/total byte ratio exceeds 0.80 for 5s.
alert: CephClusterCriticallyFull
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.80
for: 5s
labels:
  severity: critical
annotations:
  description: >-
    Storage cluster utilization has crossed 80% and will become read-only at 85%.
    Free up some space or expand the storage cluster immediately.
  message: >-
    Storage cluster is critically full and needs immediate data deletion
    or cluster expansion.
  severity_level: error
  storage_type: ceph
CephClusterReadOnly
# Fires immediately (for: 0s) when the cluster's raw used/total byte ratio is
# at or above 0.85.
alert: CephClusterReadOnly
expr: |
  ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes >= 0.85
for: 0s
labels:
  severity: critical
annotations:
  description: >-
    Storage cluster utilization has crossed 85% and will become read-only now.
    Free up some space or expand the storage cluster immediately.
  message: >-
    Storage cluster is read-only now and needs immediate data deletion
    or cluster expansion.
  severity_level: error
  storage_type: ceph
pool-quota.rules
CephPoolQuotaBytesNearExhaustion
# Fires when a pool's stored raw bytes exceed 0.70 of its byte quota for 1m.
# The "> 0" filter on the quota side drops pools whose quota is not above 0;
# the pool name is joined in from ceph_pool_metadata on pool_id.
alert: CephPoolQuotaBytesNearExhaustion
expr: |
  (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.70
for: 1m
labels:
  severity: warning
annotations:
  description: Storage pool {{ $labels.name }} quota usage has crossed 70%.
  message: Storage pool quota(bytes) is near exhaustion.
  severity_level: warning
  storage_type: ceph
CephPoolQuotaBytesCriticallyExhausted
# Fires when a pool's stored raw bytes exceed 0.90 of its byte quota for 1m.
# The "> 0" filter on the quota side drops pools whose quota is not above 0;
# the pool name is joined in from ceph_pool_metadata on pool_id.
alert: CephPoolQuotaBytesCriticallyExhausted
expr: |
  (ceph_pool_stored_raw * on (pool_id) group_left(name)ceph_pool_metadata) / ((ceph_pool_quota_bytes * on (pool_id) group_left(name)ceph_pool_metadata) > 0) > 0.90
for: 1m
labels:
  severity: critical
annotations:
  description: Storage pool {{ $labels.name }} quota usage has crossed 90%.
  message: Storage pool quota(bytes) is critically exhausted.
  severity_level: critical
  storage_type: ceph
Recording rules
A complete list of pregenerated recording rules is available here.
ceph.rules
cluster:ceph_node_down:join_kube
# Multiplies the kube-state-metrics node Ready="true" condition with the
# per-(node, namespace) max of ceph_disk_occupation, where the node label is
# copied from exported_instance. The result is matched on node.
record: cluster:ceph_node_down:join_kube
expr: |
  kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node, namespace)
cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
# Average, over Ceph-occupied disks, of per-operation disk latency:
# (read time + write time) / (reads completed + writes completed), using 1m irates.
# The numerator is explicitly parenthesised: "/" binds tighter than "+" in
# PromQL, so without the parentheses only the write time would be divided.
# clamp_min(..., 1) on the read-ops rate keeps the denominator at 1 or above.
# ceph_disk_occupation is relabelled (instance from exported_instance, device
# with the "/dev/" prefix stripped) so it can be joined on (instance, device).
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
expr: |
  avg(topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon) topk by (instance,device) (1,((irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m])) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
telemeter.rules
job:ceph_osd_metadata:count
# Number of ceph_osd_metadata series scraped under job "rook-ceph-mgr".
record: job:ceph_osd_metadata:count
expr: |
  count(ceph_osd_metadata{job="rook-ceph-mgr"})
job:kube_pv:count
# Number of persistent volumes whose storage class has a provisioner matching
# the rbd or cephfs CSI patterns (joined on storageclass).
record: job:kube_pv:count
expr: |
  count(kube_persistentvolume_info * on (storageclass) group_left(provisioner) kube_storageclass_info {provisioner=~"(.*rbd.csi.ceph.com)|(.*cephfs.csi.ceph.com)"})
job:ceph_pools_iops:total
# Sum over all pools of ceph_pool_rd + ceph_pool_wr for job "rook-ceph-mgr".
record: job:ceph_pools_iops:total
expr: |
  sum(ceph_pool_rd{job="rook-ceph-mgr"}+ ceph_pool_wr{job="rook-ceph-mgr"})
job:ceph_pools_iops_bytes:total
# Sum over all pools of ceph_pool_rd_bytes + ceph_pool_wr_bytes for job
# "rook-ceph-mgr".
record: job:ceph_pools_iops_bytes:total
expr: |
  sum(ceph_pool_rd_bytes{job="rook-ceph-mgr"}+ ceph_pool_wr_bytes{job="rook-ceph-mgr"})
job:ceph_versions_running:count
# Number of distinct ceph_version values across the mon, osd, rgw, mds and mgr
# metadata series for job "rook-ceph-mgr".
record: job:ceph_versions_running:count
expr: |
  count(count(ceph_mon_metadata{job="rook-ceph-mgr"} or ceph_osd_metadata{job="rook-ceph-mgr"} or ceph_rgw_metadata{job="rook-ceph-mgr"} or ceph_mds_metadata{job="rook-ceph-mgr"} or ceph_mgr_metadata{job="rook-ceph-mgr"}) by(ceph_version))