kube-cockroachdb

Overview

Jsonnet source code is available at github.com/metalmatze/kube-cockroachdb

Alerts

A complete list of pregenerated alerts is available here.

cockroachdb

CockroachInstanceFlapping

# Fires when cockroachdb_sys_uptime for an instance resets more than 5 times
# within a 10-minute window and the condition holds for 1 minute.
alert: CockroachInstanceFlapping
expr: |
  resets(cockroachdb_sys_uptime{job="cockroachdb-public"}[10m]) > 5
for: 1m
labels:
  severity: warning
annotations:
  summary: CockroachDB instances have restarted in the last 10 minutes.
  description: >-
    {{ $labels.instance }} for cluster {{ $labels.cluster }} restarted
    {{ $value }} time(s) in 10m.

CockroachLivenessMismatch

# Compares each instance's cockroachdb_liveness_livenodes value with the number
# of scrape targets that are up for the same cluster and job; fires once the
# two have differed for 5 minutes.
# NOTE(review): ignoring(instance) only matches if the left-hand series carry
# no labels other than cluster, job and instance - confirm against the scrape
# configuration.
alert: CockroachLivenessMismatch
expr: |
  (cockroachdb_liveness_livenodes{job="cockroachdb-public"})
    !=
  ignoring(instance) group_left() (count by(cluster, job) (up{job="cockroachdb-public"} == 1))
for: 5m
labels:
  severity: warning
annotations:
  summary: CockroachDB has liveness mismatches.
  description: 'Liveness mismatch for {{ $labels.instance }}'

CockroachVersionMismatch

# Counts the distinct (tag, cockroachdb_build_timestamp value) combinations per
# cluster and fires when more than one has been present for 1 hour.
alert: CockroachVersionMismatch
expr: |
  count by(cluster) (count_values by(tag, cluster) ("version", cockroachdb_build_timestamp{job="cockroachdb-public"})) > 1
for: 1h
labels:
  severity: warning
annotations:
  summary: CockroachDB cluster is running different versions.
  description: 'Cluster {{ $labels.cluster }} running {{ $value }} different versions'

CockroachStoreDiskLow

# Fires when the :cockroachdb_capacity_available:ratio recording rule stays
# below 0.15 for 30 minutes.
alert: CockroachStoreDiskLow
expr: |
  :cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.15
for: 30m
labels:
  severity: critical
annotations:
  summary: CockroachDB is at low disk capacity.
  description: >-
    Store {{ $labels.store }} on node {{ $labels.instance }} at
    {{ $value }} available disk fraction

CockroachClusterDiskLow

# Fires when the cluster:cockroachdb_capacity_available:ratio recording rule
# stays below 0.2 for 30 minutes.
alert: CockroachClusterDiskLow
expr: |
  cluster:cockroachdb_capacity_available:ratio{job="cockroachdb-public"} < 0.2
for: 30m
labels:
  severity: critical
annotations:
  summary: CockroachDB cluster is at critically low disk capacity.
  description: 'Cluster {{ $labels.cluster }} at {{ $value }} available disk fraction'

CockroachUnavailableRanges

# Sums cockroachdb_ranges_unavailable per instance and cluster; fires when the
# sum has been above zero for 10 minutes.
alert: CockroachUnavailableRanges
expr: |
  (sum by(instance, cluster) (cockroachdb_ranges_unavailable{job="cockroachdb-public"})) > 0
for: 10m
labels:
  severity: critical
annotations:
  summary: CockroachDB has unavailable ranges.
  description: 'Instance {{ $labels.instance }} has {{ $value }} unavailable ranges'

CockroachNoLeaseRanges

# Sums cockroachdb_replicas_leaders_not_leaseholders per instance and cluster;
# fires when the sum has been above zero for 10 minutes.
alert: CockroachNoLeaseRanges
expr: |
  (sum by(instance, cluster) (cockroachdb_replicas_leaders_not_leaseholders{job="cockroachdb-public"})) > 0
for: 10m
labels:
  severity: warning
annotations:
  summary: CockroachDB has ranges without leases.
  description: 'Instance {{ $labels.instance }} has {{ $value }} ranges without leases'

CockroachHighOpenFDCount

# Fires when cockroachdb_sys_fd_open divided by cockroachdb_sys_fd_softlimit
# stays above 0.8 for 10 minutes.
alert: CockroachHighOpenFDCount
expr: |
  cockroachdb_sys_fd_open{job="cockroachdb-public"} / cockroachdb_sys_fd_softlimit{job="cockroachdb-public"} > 0.8
for: 10m
labels:
  severity: warning
annotations:
  summary: CockroachDB has too many open file descriptors.
  description: >-
    Too many open file descriptors on {{ $labels.instance }}:
    {{ $value }} fraction used

Recording rules

A complete list of pregenerated recording rules is available here.

cockroachdb.rules

node:cockroachdb_capacity:sum

# Sums cockroachdb_capacity over the store label.
record: node:cockroachdb_capacity:sum
expr: |
  sum without(store) (cockroachdb_capacity{job="cockroachdb-public"})

cluster:cockroachdb_capacity:sum

# Sums node:cockroachdb_capacity:sum over the instance label.
record: cluster:cockroachdb_capacity:sum
expr: |
  sum without(instance) (node:cockroachdb_capacity:sum{job="cockroachdb-public"})

node:cockroachdb_capacity_available:sum

# Sums cockroachdb_capacity_available over the store label.
record: node:cockroachdb_capacity_available:sum
expr: |
  sum without(store) (cockroachdb_capacity_available{job="cockroachdb-public"})

cluster:cockroachdb_capacity_available:sum

# Sums node:cockroachdb_capacity_available:sum over the instance label.
record: cluster:cockroachdb_capacity_available:sum
expr: |
  sum without(instance) (node:cockroachdb_capacity_available:sum{job="cockroachdb-public"})

:cockroachdb_capacity_available:ratio

# Ratio of cockroachdb_capacity_available to cockroachdb_capacity per series.
# The record name begins with ':' and is quoted so it always reads as a string.
record: ':cockroachdb_capacity_available:ratio'
expr: |
  cockroachdb_capacity_available{job="cockroachdb-public"} / cockroachdb_capacity{job="cockroachdb-public"}

node:cockroachdb_capacity_available:ratio

# Ratio of node:cockroachdb_capacity_available:sum to
# node:cockroachdb_capacity:sum.
record: node:cockroachdb_capacity_available:ratio
expr: |
  node:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / node:cockroachdb_capacity:sum{job="cockroachdb-public"}

cluster:cockroachdb_capacity_available:ratio

# Ratio of cluster:cockroachdb_capacity_available:sum to
# cluster:cockroachdb_capacity:sum.
record: cluster:cockroachdb_capacity_available:ratio
expr: |
  cluster:cockroachdb_capacity_available:sum{job="cockroachdb-public"} / cluster:cockroachdb_capacity:sum{job="cockroachdb-public"}