cortex

Overview

Jsonnet source code is available at github.com/cortexproject/cortex-jsonnet

Alerts

Complete list of pregenerated alerts is available here.

cortex_alerts

CortexIngesterUnhealthy

# Fires when the minimum reported number of Unhealthy ingester ring members
# per cluster/namespace stays above 0 for 15 minutes.
# The count is rendered with "%.0f": a plain "%f" would print six decimals
# (e.g. "1.000000 unhealthy ingester(s)").
alert: CortexIngesterUnhealthy
annotations:
  message: |
    Cortex cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ printf "%.0f" $value }} unhealthy ingester(s).
expr: |
  min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0
for: 15m
labels:
  severity: critical

CortexRequestErrors

# Percentage of 5xx responses per cluster/namespace/job/route (the "ready"
# route is excluded), from 1m rates; critical once above 1% for 15 minutes.
alert: CortexRequestErrors
expr: |
  100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
    /
  sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
    > 1
for: 15m
labels:
  severity: critical
annotations:
  message: |
    The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors.

CortexRequestLatency

# Warns when the recorded per-route 99th-percentile request duration stays
# above 2.5 for 15 minutes. The routes listed in the matcher are excluded.
alert: CortexRequestLatency
expr: |
  cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
     >
  2.5
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.

CortexTableSyncFailure

# Percentage of table-manager syncs with a non-2xx status code, from 15m
# rates; critical once above 10% for 30 minutes.
alert: CortexTableSyncFailure
expr: |
  100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
    /
  rate(cortex_table_manager_sync_duration_seconds_count[15m])
    > 10
for: 30m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.

CortexQueriesIncorrect

# Percentage of test-exporter test cases with result="fail" per
# cluster/namespace, from 5m rates; warns once above 1% for 15 minutes.
alert: CortexQueriesIncorrect
expr: |
  100 * sum by (cluster, namespace) (rate(test_exporter_test_case_result_total{result="fail"}[5m]))
    /
  sum by (cluster, namespace) (rate(test_exporter_test_case_result_total[5m])) > 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    The Cortex cluster {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% incorrect query results.

CortexInconsistentRuntimeConfig

# Counts the distinct sha256 values of cortex_runtime_config_hash per
# cluster/namespace/job; critical when more than one has been seen for 1 hour.
alert: CortexInconsistentRuntimeConfig
expr: |
  count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1
for: 1h
labels:
  severity: critical
annotations:
  message: |
    An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}.

CortexBadRuntimeConfig

# Critical when cortex_runtime_config_last_reload_successful has been 0 for
# 5 minutes.
alert: CortexBadRuntimeConfig
expr: |
  # The metric value is reset to 0 on error while reloading the config at runtime.
  cortex_runtime_config_last_reload_successful == 0
for: 5m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }} failed to reload runtime config.

CortexFrontendQueriesStuck

# Critical when the summed query-frontend queue length of a cluster/namespace
# stays above 1 for 5 minutes.
alert: CortexFrontendQueriesStuck
expr: |
  sum by (cluster, namespace) (cortex_query_frontend_queue_length) > 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-frontend.

CortexSchedulerQueriesStuck

# Critical when the summed query-scheduler queue length of a cluster/namespace
# stays above 1 for 5 minutes.
alert: CortexSchedulerQueriesStuck
expr: |
  sum by (cluster, namespace) (cortex_query_scheduler_queue_length) > 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} query-scheduler.

CortexMemcachedRequestErrors

# Percentage of failed memcached operations per cluster/namespace/name/operation,
# from 1m rates; warns once above 5% for 5 minutes.
alert: CortexMemcachedRequestErrors
expr: |
  (
    sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) /
    sum by(cluster, namespace, name, operation) (rate(thanos_memcached_operations_total[1m]))
  ) * 100 > 5
for: 5m
labels:
  severity: warning
annotations:
  message: |
    Memcached {{ $labels.name }} used by Cortex {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.

CortexIngesterRestarts

# Warns when process_start_time_seconds of a matching job changed at least
# twice within 30 minutes. There is no `for` clause.
# The restart count is rendered with "%.0f" so it reads "2 times" rather
# than "2.00 times".
alert: CortexIngesterRestarts
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.0f" $value }} times in the last 30 mins.
expr: |
  changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2
labels:
  severity: warning

CortexTransferFailed

# Critical when a cortex_shutdown_duration_seconds_count series with
# op="transfer" and a non-success status has been present for 5 minutes.
# NOTE(review): the expression has no comparison, so any returned sample
# triggers the alert - confirm this is intended.
alert: CortexTransferFailed
expr: |
  max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
for: 5m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} transfer failed.

CortexOldChunkInMemory

# Warns when the oldest unflushed chunk timestamp is more than 36000 seconds
# in the past for 5 minutes. Timestamps of 0 are ignored.
alert: CortexOldChunkInMemory
expr: |
  (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
    and
  (cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
for: 5m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.

CortexKVStoreFailure

# Warns when every KV-store request of a pod against a given store (kv_name)
# has returned a non-2xx status for 5 minutes.
# status_code is deliberately not part of the aggregation labels: grouping
# both sides of the division by status_code divides each error series by
# itself, which makes the ratio 1 as soon as any error occurs instead of only
# when all requests fail.
alert: CortexKVStoreFailure
annotations:
  message: |
    Cortex {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}.
expr: |
  (
    sum by(cluster, namespace, pod, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m]))
    /
    sum by(cluster, namespace, pod, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m]))
  )
  # We want to get alerted only in case there's a constant failure.
  == 1
for: 5m
labels:
  severity: warning

CortexMemoryMapAreasTooHigh

# Critical when a matching process has used more than 80% of its memory map
# areas limit for 5 minutes.
alert: CortexMemoryMapAreasTooHigh
expr: |
  process_memory_map_areas{job=~".+(cortex|ingester.*|store-gateway)"} / process_memory_map_areas_limit{job=~".+(cortex|ingester.*|store-gateway)"} > 0.8
for: 5m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas close to the limit.

cortex_ingester_instance_alerts

CortexIngesterReachingSeriesLimit

# Warning tier: in-memory series above 80% of the max_series instance limit
# for 3 hours. Only evaluated where that limit is greater than 0.
alert: CortexIngesterReachingSeriesLimit
expr: |
  (
      (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"})
      and ignoring (limit)
      (cortex_ingester_instance_limits{limit="max_series"} > 0)
  ) > 0.8
for: 3h
labels:
  severity: warning
annotations:
  message: |
    Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit.

CortexIngesterReachingSeriesLimit

# Critical tier: in-memory series above 90% of the max_series instance limit
# for 5 minutes. Only evaluated where that limit is greater than 0.
alert: CortexIngesterReachingSeriesLimit
expr: |
  (
      (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"})
      and ignoring (limit)
      (cortex_ingester_instance_limits{limit="max_series"} > 0)
  ) > 0.9
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit.

CortexIngesterReachingTenantsLimit

# Warning tier: in-memory users above 70% of the max_tenants instance limit
# for 5 minutes. Only evaluated where that limit is greater than 0.
alert: CortexIngesterReachingTenantsLimit
expr: |
  (
      (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"})
      and ignoring (limit)
      (cortex_ingester_instance_limits{limit="max_tenants"} > 0)
  ) > 0.7
for: 5m
labels:
  severity: warning
annotations:
  message: |
    Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit.

CortexIngesterReachingTenantsLimit

# Critical tier: in-memory users above 80% of the max_tenants instance limit
# for 5 minutes. Only evaluated where that limit is greater than 0.
alert: CortexIngesterReachingTenantsLimit
expr: |
  (
      (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"})
      and ignoring (limit)
      (cortex_ingester_instance_limits{limit="max_tenants"} > 0)
  ) > 0.8
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit.

CortexDistributorReachingInflightPushRequestLimit

# Critical when inflight push requests exceed 80% of the
# max_inflight_push_requests instance limit for 5 minutes. Only evaluated
# where that limit is greater than 0.
alert: CortexDistributorReachingInflightPushRequestLimit
expr: |
  (
      (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"})
      and ignoring (limit)
      (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0)
  ) > 0.8
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit.

cortex_wal_alerts

CortexWALCorruption

# Critical as soon as the WAL corruption counter increased within the last
# 5 minutes. There is no `for` clause.
alert: CortexWALCorruption
expr: |
  increase(cortex_ingester_wal_corruptions_total[5m]) > 0
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.

CortexCheckpointCreationFailed

# Warning tier: the failed checkpoint creation counter increased within the
# last 10 minutes. There is no `for` clause.
alert: CortexCheckpointCreationFailed
expr: |
  increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.

CortexCheckpointCreationFailed

# Critical tier: the failed checkpoint creation counter increased by more
# than 1 within the last hour. There is no `for` clause.
alert: CortexCheckpointCreationFailed
expr: |
  increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.

CortexCheckpointDeletionFailed

# Warning tier: the failed checkpoint deletion counter increased within the
# last 10 minutes. There is no `for` clause.
alert: CortexCheckpointDeletionFailed
expr: |
  increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.

CortexCheckpointDeletionFailed

# Critical tier: the failed checkpoint deletion counter increased by more
# than 1 within the last 2 hours. There is no `for` clause.
# The message identifies the ingester as job/instance, matching the warning
# tier of this alert and the other WAL alerts.
alert: CortexCheckpointDeletionFailed
annotations:
  message: |
    {{ $labels.job }}/{{ $labels.instance }} is failing to delete checkpoint.
expr: |
  increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
labels:
  severity: critical

cortex-rollout-alerts

CortexRolloutStuck

# StatefulSet variant. Warns after 15 minutes when all of the following hold:
#   - a current revision exists with no matching update revision,
#   - replicas differs from the number of updated replicas,
#   - the number of updated replicas has not changed over the last 15m.
# The final multiplication keeps only cluster/namespace pairs that expose
# cortex_build_info.
alert: CortexRolloutStuck
expr: |
  (
    max without (revision) (
      kube_statefulset_status_current_revision
        unless
      kube_statefulset_status_update_revision
    )
      *
    (
      kube_statefulset_replicas
        !=
      kube_statefulset_status_replicas_updated
    )
  )  and (
    changes(kube_statefulset_status_replicas_updated[15m])
      ==
    0
  )
  * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 15m
labels:
  severity: warning
annotations:
  message: |
    The {{ $labels.statefulset }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.

CortexRolloutStuck

# Deployment variant. Warns after 15 minutes when spec replicas differs from
# the number of updated replicas and that number has not changed over the last
# 15m. The final multiplication keeps only cluster/namespace pairs that expose
# cortex_build_info.
alert: CortexRolloutStuck
expr: |
  (
    kube_deployment_spec_replicas
      !=
    kube_deployment_status_replicas_updated
  ) and (
    changes(kube_deployment_status_replicas_updated[15m])
      ==
    0
  )
  * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 15m
labels:
  severity: warning
annotations:
  message: |
    The {{ $labels.deployment }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.

cortex-provisioning

CortexProvisioningMemcachedTooSmall

# Compares 4x (in-memory series * average chunk size), in GB, with the summed
# memcached_limit_bytes of the ".+/memcached" jobs, in GB. Warns when the
# former has been larger for 15 minutes; $value is the left-hand side.
alert: CortexProvisioningMemcachedTooSmall
expr: |
  (
    4 *
    sum by (cluster, namespace) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
     / 1e9
  )
    >
  (
    sum by (cluster, namespace) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
  )
for: 15m
labels:
  severity: warning
annotations:
  message: |
    Chunk memcached cluster in {{ $labels.cluster }}/{{ $labels.namespace }} is too small, should be at least {{ printf "%.2f" $value }}GB.

CortexProvisioningTooManyActiveSeries

# Warns when the average number of in-memory series per ingester in a
# cluster/namespace stays above 1.6e6 for 2 hours.
alert: CortexProvisioningTooManyActiveSeries
expr: |
  avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
for: 2h
labels:
  severity: warning
annotations:
  message: |
    The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high.

CortexProvisioningTooManyWrites

# Warns when the average ingested-samples rate (1m) per ingester in a
# cluster/namespace stays above 80e3 for 15 minutes.
alert: CortexProvisioningTooManyWrites
expr: |
  avg by (cluster, namespace) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3
for: 15m
labels:
  severity: warning
annotations:
  message: |
    Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.

CortexAllocatingTooMuchMemory

# Warning tier: ingester container working set above 65% of its memory limit
# for 15 minutes.
alert: CortexAllocatingTooMuchMemory
expr: |
  (
    container_memory_working_set_bytes{container="ingester"}
      /
    container_spec_memory_limit_bytes{container="ingester"}
  ) > 0.65
for: 15m
labels:
  severity: warning
annotations:
  message: |
    Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory.

CortexAllocatingTooMuchMemory

# Critical tier: ingester container working set above 80% of its memory limit
# for 15 minutes.
alert: CortexAllocatingTooMuchMemory
expr: |
  (
    container_memory_working_set_bytes{container="ingester"}
      /
    container_spec_memory_limit_bytes{container="ingester"}
  ) > 0.8
for: 15m
labels:
  severity: critical
annotations:
  message: |
    Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory.

ruler_alerts

CortexRulerTooManyFailedPushes

# Percentage of failed ruler write requests per cluster/namespace/instance,
# from 1m rates; critical once above 1% for 5 minutes.
alert: CortexRulerTooManyFailedPushes
expr: |
  100 * (
  sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_failed_total[1m]))
    /
  sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_total[1m]))
  ) > 1
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors.

CortexRulerTooManyFailedQueries

# Percentage of failed ruler queries per cluster/namespace/instance, from 1m
# rates; warns once above 1% for 5 minutes.
alert: CortexRulerTooManyFailedQueries
expr: |
  100 * (
  sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_failed_total[1m]))
    /
  sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_total[1m]))
  ) > 1
for: 5m
labels:
  severity: warning
annotations:
  message: |
    Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.

CortexRulerMissedEvaluations

# Percentage of missed rule-group iterations per
# cluster/namespace/instance/rule_group, from 1m rates; warns once above 1%
# for 5 minutes.
# The ratio is multiplied by 100 so that $value is a percentage, as the
# message prints it with a "%" sign. The threshold (1%) is the same as the
# previous "ratio > 0.01".
alert: CortexRulerMissedEvaluations
annotations:
  message: |
    Cortex Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}.
expr: |
  100 * (
  sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m]))
    /
  sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m]))
  ) > 1
for: 5m
labels:
  severity: warning

CortexRulerFailedRingCheck

# Critical when the summed rate of ruler ring-check errors per
# cluster/namespace/job stays above 0 for 5 minutes.
alert: CortexRulerFailedRingCheck
expr: |
  sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m]))
     > 0
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Cortex Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership.

gossip_alerts

CortexGossipMembersMismatch

# Warns when an instance's memberlist cluster member count differs, for
# 5 minutes, from the sum of `up` over the listed job patterns in the same
# cluster/namespace.
alert: CortexGossipMembersMismatch
expr: |
  memberlist_client_cluster_members_count
    != on (cluster, namespace) group_left
  sum by (cluster, namespace) (up{job=~".+/(compactor|distributor|ingester.*|querier.*|ruler|store-gateway|cortex)"})
for: 5m
labels:
  severity: warning
annotations:
  message: |
    Cortex instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} sees incorrect number of gossip members.

etcd_alerts

EtcdAllocatingTooMuchMemory

# Warning tier: etcd container working set above 65% of its memory limit for
# 15 minutes.
alert: EtcdAllocatingTooMuchMemory
expr: |
  (
    container_memory_working_set_bytes{container="etcd"}
      /
    container_spec_memory_limit_bytes{container="etcd"}
  ) > 0.65
for: 15m
labels:
  severity: warning
annotations:
  message: |
    Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.

EtcdAllocatingTooMuchMemory

# Critical tier: etcd container working set above 80% of its memory limit for
# 15 minutes.
alert: EtcdAllocatingTooMuchMemory
expr: |
  (
    container_memory_working_set_bytes{container="etcd"}
      /
    container_spec_memory_limit_bytes{container="etcd"}
  ) > 0.8
for: 15m
labels:
  severity: critical
annotations:
  message: |
    Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit.

alertmanager_alerts

CortexAlertmanagerSyncConfigsFailing

# Critical when the 5m rate of failed config syncs stays above 0 for
# 30 minutes.
alert: CortexAlertmanagerSyncConfigsFailing
expr: |
  rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
for: 30m
labels:
  severity: critical
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage.

CortexAlertmanagerRingCheckFailing

# Critical when the 2m rate of ring-check errors stays above 0 for 10 minutes.
alert: CortexAlertmanagerRingCheckFailing
expr: |
  rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
for: 10m
labels:
  severity: critical
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring.

CortexAlertmanagerPartialStateMergeFailing

# Critical when the 2m rate of failed partial state merges stays above 0 for
# 10 minutes.
alert: CortexAlertmanagerPartialStateMergeFailing
expr: |
  rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
for: 10m
labels:
  severity: critical
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica.

CortexAlertmanagerReplicationFailing

# Critical when the 2m rate of failed state replications stays above 0 for
# 10 minutes.
# Message grammar corrected ("failing to replicate").
alert: CortexAlertmanagerReplicationFailing
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas.
expr: |
  rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
for: 10m
labels:
  severity: critical

CortexAlertmanagerPersistStateFailing

# Critical when the 15m rate of failed state persists stays above 0 for
# 1 hour.
# Message typo corrected ("snapshots").
alert: CortexAlertmanagerPersistStateFailing
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage.
expr: |
  rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
for: 1h
labels:
  severity: critical

CortexAlertmanagerInitialSyncFailed

# Critical as soon as the initial-sync counter with outcome="failed"
# increased within the last minute. There is no `for` clause.
alert: CortexAlertmanagerInitialSyncFailed
expr: |
  increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
labels:
  severity: critical
annotations:
  message: |
    Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up.

cortex_blocks_alerts

CortexIngesterHasNotShippedBlocks

# Critical when, for 15 minutes, an ingester's last successful upload is more
# than 4 hours old (and not 0) while it ingested samples both over the last 4h
# and in the 1h window ending 4h ago.
# The wording of the in-expression comment was corrected ("while a block
# shipping is expected"); the query itself is unchanged.
alert: CortexIngesterHasNotShippedBlocks
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours.
expr: |
  (min by(cluster, namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
  and
  (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
  and
  # Only if the ingester has ingested samples over the last 4h.
  (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
  and
  # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
  # had ingested samples in the past, then no traffic was received for a long period and then it starts
  # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
  # samples, while a block shipping is expected within the next 4h.
  (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
for: 15m
labels:
  severity: critical

CortexIngesterHasNotShippedBlocksSinceStart

# Critical when, for 4 hours, an ingester's last successful upload time is
# still 0 while it has ingested samples over the last 4h.
alert: CortexIngesterHasNotShippedBlocksSinceStart
expr: |
  (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
  and
  (max by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
for: 4h
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours.

CortexIngesterHasUnshippedBlocks

# Critical when the oldest unshipped block timestamp is more than 3600 seconds
# in the past for 15 minutes. Timestamps of 0 are ignored. $value is the age
# in seconds.
alert: CortexIngesterHasUnshippedBlocks
expr: |
  (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600)
  and
  (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0)
for: 15m
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.

CortexIngesterTSDBHeadCompactionFailed

# Critical when the 5m rate of failed TSDB compactions stays above 0 for
# 15 minutes.
alert: CortexIngesterTSDBHeadCompactionFailed
expr: |
  rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0
for: 15m
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head.

CortexIngesterTSDBHeadTruncationFailed

# Critical as soon as the 5m rate of failed TSDB head truncations is above 0.
# There is no `for` clause.
alert: CortexIngesterTSDBHeadTruncationFailed
expr: |
  rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head.

CortexIngesterTSDBCheckpointCreationFailed

# Critical as soon as the 5m rate of failed TSDB checkpoint creations is
# above 0. There is no `for` clause.
alert: CortexIngesterTSDBCheckpointCreationFailed
expr: |
  rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint.

CortexIngesterTSDBCheckpointDeletionFailed

# Critical as soon as the 5m rate of failed TSDB checkpoint deletions is
# above 0. There is no `for` clause.
alert: CortexIngesterTSDBCheckpointDeletionFailed
expr: |
  rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint.

CortexIngesterTSDBWALTruncationFailed

# Warns as soon as the 5m rate of failed TSDB WAL truncations is above 0.
# There is no `for` clause.
alert: CortexIngesterTSDBWALTruncationFailed
expr: |
  rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0
labels:
  severity: warning
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL.

CortexIngesterTSDBWALCorrupted

# Critical as soon as the 5m rate of TSDB WAL corruptions is above 0.
# There is no `for` clause.
alert: CortexIngesterTSDBWALCorrupted
expr: |
  rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL.

CortexIngesterTSDBWALWritesFailed

# Critical when the 1m rate of failed TSDB WAL writes stays above 0 for
# 3 minutes.
alert: CortexIngesterTSDBWALWritesFailed
expr: |
  rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0
for: 3m
labels:
  severity: critical
annotations:
  message: |
    Cortex Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL.

CortexQuerierHasNotScanTheBucket

# Critical when a querier's last successful bucket scan is more than
# 30 minutes old for 5 minutes. Timestamps of 0 are ignored. $value is the
# age in seconds.
alert: CortexQuerierHasNotScanTheBucket
expr: |
  (time() - cortex_querier_blocks_last_successful_scan_timestamp_seconds > 60 * 30)
  and
  cortex_querier_blocks_last_successful_scan_timestamp_seconds > 0
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Cortex Querier {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.

CortexQuerierHighRefetchRate

# Percentage of queries outside the le="0.0" bucket of the store-gateway
# refetches histogram, per cluster/namespace and from 5m rates; warns once
# above 1% for 10 minutes.
alert: CortexQuerierHighRefetchRate
expr: |
  100 * (
    (
      sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
      -
      sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
    )
    /
    sum by(cluster, namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
  )
  > 1
for: 10m
labels:
  severity: warning
annotations:
  message: |
    Cortex Queries in {{ $labels.cluster }}/{{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.

CortexStoreGatewayHasNotSyncTheBucket

# Critical when a store-gateway's last successful bucket sync is more than
# 30 minutes old for 5 minutes. Timestamps of 0 are ignored. $value is the
# age in seconds.
alert: CortexStoreGatewayHasNotSyncTheBucket
expr: |
  (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30)
  and
  cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
for: 5m
labels:
  severity: critical
annotations:
  message: |
    Cortex Store Gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.

CortexBucketIndexNotUpdated

# Critical as soon as the most recent bucket index update for a tenant
# (minimum age across reporters) is more than 7200 seconds old.
# There is no `for` clause.
alert: CortexBucketIndexNotUpdated
expr: |
  min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
labels:
  severity: critical
annotations:
  message: |
    Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.

CortexTenantHasPartialBlocks

# Warns when the maximum reported partial-block count of a tenant stays above
# 0 for 6 hours.
alert: CortexTenantHasPartialBlocks
expr: |
  max by(cluster, namespace, user) (cortex_bucket_blocks_partials_count) > 0
for: 6h
labels:
  severity: warning
annotations:
  message: |
    Cortex tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has {{ $value }} partial blocks.

cortex_compactor_alerts

CortexCompactorHasNotSuccessfullyCleanedUpBlocks

# Critical when the last successful block cleanup run is more than 6 hours
# old for 1 hour.
alert: CortexCompactorHasNotSuccessfullyCleanedUpBlocks
expr: |
  (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6)
for: 1h
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully cleaned up blocks in the last 6 hours.

CortexCompactorHasNotSuccessfullyRunCompaction

# Variant 1 of 3: the last successful compaction run is more than 24 hours
# old (and not 0) for 1 hour.
alert: CortexCompactorHasNotSuccessfullyRunCompaction
expr: |
  (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24)
  and
  (cortex_compactor_last_successful_run_timestamp_seconds > 0)
for: 1h
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours.

CortexCompactorHasNotSuccessfullyRunCompaction

# Variant 2 of 3: the last successful compaction run timestamp has been 0
# for 24 hours.
alert: CortexCompactorHasNotSuccessfullyRunCompaction
expr: |
  cortex_compactor_last_successful_run_timestamp_seconds == 0
for: 24h
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours.

CortexCompactorHasNotSuccessfullyRunCompaction

# Variant 3 of 3: the failed compaction run counter increased by at least 2
# within the last 2 hours. There is no `for` clause.
alert: CortexCompactorHasNotSuccessfullyRunCompaction
expr: |
  increase(cortex_compactor_runs_failed_total[2h]) >= 2
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed to run 2 consecutive compactions.

CortexCompactorHasNotUploadedBlocks

# Variant 1 of 2: the compactor's last successful upload is more than
# 24 hours old (and not 0) for 15 minutes.
alert: CortexCompactorHasNotUploadedBlocks
expr: |
  (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} > 60 * 60 * 24)
  and
  (thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} > 0)
for: 15m
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours.

CortexCompactorHasNotUploadedBlocks

# Variant 2 of 2: the compactor's last successful upload time has been 0 for
# 24 hours.
alert: CortexCompactorHasNotUploadedBlocks
expr: |
  thanos_objstore_bucket_last_successful_upload_time{job=~".+/compactor.*"} == 0
for: 24h
labels:
  severity: critical
annotations:
  message: |
    Cortex Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours.

Recording rules

Complete list of pregenerated recording rules is available here.

cortex_api_1

cluster_job:cortex_request_duration_seconds:99quantile

# 0.99 quantile of request duration per cluster/job, from 1m bucket rates.
record: cluster_job:cortex_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job))

cluster_job:cortex_request_duration_seconds:50quantile

# 0.50 quantile of request duration per cluster/job, from 1m bucket rates.
record: cluster_job:cortex_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job))

cluster_job:cortex_request_duration_seconds:avg

# Average request duration per cluster/job: 1m rate of the histogram sum
# divided by the 1m rate of the histogram count.
record: cluster_job:cortex_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)

cluster_job:cortex_request_duration_seconds_bucket:sum_rate

# Summed 1m rate of the request duration buckets per le/cluster/job.
record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_request_duration_seconds_sum:sum_rate

# Summed 1m rate of the request duration histogram sum per cluster/job.
record: cluster_job:cortex_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_sum[1m]))
  by (cluster, job)

cluster_job:cortex_request_duration_seconds_count:sum_rate

# Summed 1m rate of the request duration histogram count per cluster/job.
record: cluster_job:cortex_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_count[1m]))
  by (cluster, job)

cortex_api_2

cluster_job_route:cortex_request_duration_seconds:99quantile

# 0.99 quantile of request duration per cluster/job/route, from 1m bucket
# rates.
record: cluster_job_route:cortex_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route))

cluster_job_route:cortex_request_duration_seconds:50quantile

# 0.50 quantile of request duration per cluster/job/route, from 1m bucket
# rates.
record: cluster_job_route:cortex_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route))

cluster_job_route:cortex_request_duration_seconds:avg

# Average request duration per cluster/job/route: 1m rate of the histogram
# sum divided by the 1m rate of the histogram count.
record: cluster_job_route:cortex_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_request_duration_seconds_sum[1m]))
  by (cluster, job, route)
  /
  sum(rate(cortex_request_duration_seconds_count[1m]))
  by (cluster, job, route)

cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate

# Summed 1m rate of the request duration buckets per le/cluster/job/route.
record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route)

cluster_job_route:cortex_request_duration_seconds_sum:sum_rate

# Summed 1m rate of the request duration histogram sum per cluster/job/route.
record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_sum[1m]))
  by (cluster, job, route)

cluster_job_route:cortex_request_duration_seconds_count:sum_rate

# Summed 1m rate of the request duration histogram count per
# cluster/job/route.
record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_count[1m]))
  by (cluster, job, route)

cortex_api_3

cluster_namespace_job_route:cortex_request_duration_seconds:99quantile

# 0.99 quantile of request duration per cluster/namespace/job/route, from 1m
# bucket rates.
record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route))

cluster_namespace_job_route:cortex_request_duration_seconds:50quantile

# 0.50 quantile of request duration per cluster/namespace/job/route, from 1m
# bucket rates.
record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route))

cluster_namespace_job_route:cortex_request_duration_seconds:avg

# Average request duration per cluster/namespace/job/route: 1m rate of the
# histogram sum divided by the 1m rate of the histogram count.
record: cluster_namespace_job_route:cortex_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_request_duration_seconds_sum[1m]))
  by (cluster, namespace, job, route)
  /
  sum(rate(cortex_request_duration_seconds_count[1m]))
  by (cluster, namespace, job, route)

cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate

# Summed 1m rate of the request duration buckets per
# le/cluster/namespace/job/route.
record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route)

cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate

expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job,
  route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate

cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate

expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace,
  job, route)
record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate

cortex_querier_api

cluster_job:cortex_querier_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_querier_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_querier_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_querier_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_querier_request_duration_seconds:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_querier_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job)

cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m]))
  by (cluster, job)

cluster_job:cortex_querier_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_count[1m]))
  by (cluster, job)

cluster_job_route:cortex_querier_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, route), from 1m bucket rates.
record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))

cluster_job_route:cortex_querier_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, route), from 1m bucket rates.
record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))

cluster_job_route:cortex_querier_request_duration_seconds:avg

# Mean per (cluster, job, route): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_route:cortex_querier_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route)
  /
  sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route)

cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, route).
record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route)

cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, route).
record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m]))
  by (cluster, job, route)

cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, route).
record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_count[1m]))
  by (cluster, job, route)

cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, namespace, job, route), from 1m bucket rates.
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))

cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, namespace, job, route), from 1m bucket rates.
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))

cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg

# Mean per (cluster, namespace, job, route): 1m rate of _sum divided by 1m rate of _count.
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
  /
  sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)

cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, namespace, job, route).
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route)

cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, namespace, job, route).
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_sum[1m]))
  by (cluster, namespace, job, route)

cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, namespace, job, route).
record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_querier_request_duration_seconds_count[1m]))
  by (cluster, namespace, job, route)

cortex_cache

cluster_job_method:cortex_memcache_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_memcache_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_memcache_request_duration_seconds:avg

# Mean per (cluster, job, method): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_method:cortex_memcache_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method)
  /
  sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, job, method)

cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, method).
record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_memcache_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, method)

cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, method).
record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_memcache_request_duration_seconds_sum[1m]))
  by (cluster, job, method)

cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, method).
record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_memcache_request_duration_seconds_count[1m]))
  by (cluster, job, method)

cluster_job:cortex_cache_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_cache_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_cache_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_cache_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_cache_request_duration_seconds:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_cache_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)

cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_sum[1m]))
  by (cluster, job)

cluster_job:cortex_cache_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_count[1m]))
  by (cluster, job)

cluster_job_method:cortex_cache_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_cache_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_cache_request_duration_seconds:avg

# Mean per (cluster, job, method): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_method:cortex_cache_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method)
  /
  sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job, method)

cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, method).
record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, method)

cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, method).
record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_sum[1m]))
  by (cluster, job, method)

cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, method).
record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_cache_request_duration_seconds_count[1m]))
  by (cluster, job, method)

cortex_storage

cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_bigtable_request_duration_seconds:avg

# Mean per (cluster, job, operation): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_operation:cortex_bigtable_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_bigtable_request_duration_seconds_sum[1m])) by (cluster, job, operation)
  /
  sum(rate(cortex_bigtable_request_duration_seconds_count[1m])) by (cluster, job, operation)

cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, operation).
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_bigtable_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, operation)

cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_bigtable_request_duration_seconds_sum[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_bigtable_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_bigtable_request_duration_seconds_count[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_cassandra_request_duration_seconds:avg

# Mean per (cluster, job, operation): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_operation:cortex_cassandra_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_cassandra_request_duration_seconds_sum[1m])) by (cluster, job, operation)
  /
  sum(rate(cortex_cassandra_request_duration_seconds_count[1m])) by (cluster, job, operation)

cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, operation).
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_cassandra_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, operation)

cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_cassandra_request_duration_seconds_sum[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_cassandra_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_cassandra_request_duration_seconds_count[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_dynamo_request_duration_seconds:avg

# Mean per (cluster, job, operation): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_operation:cortex_dynamo_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_dynamo_request_duration_seconds_sum[1m])) by (cluster, job, operation)
  /
  sum(rate(cortex_dynamo_request_duration_seconds_count[1m])) by (cluster, job, operation)

cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, operation).
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_dynamo_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, operation)

cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_dynamo_request_duration_seconds_sum[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_dynamo_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_dynamo_request_duration_seconds_count[1m]))
  by (cluster, job, operation)

cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_index_lookups_per_query:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_index_lookups_per_query:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_index_lookups_per_query:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_chunk_store_index_lookups_per_query:avg
expr: >-
  sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m])) by (cluster, job)

cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_chunk_store_index_lookups_per_query_bucket:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_index_lookups_per_query_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_index_lookups_per_query_sum:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_index_lookups_per_query_sum[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_index_lookups_per_query_count:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_index_lookups_per_query_count[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query:avg
expr: >-
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m])) by (cluster, job)

cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_bucket:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_sum:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_sum[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_pre_intersection_per_query_count:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_pre_intersection_per_query_count[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query:avg
expr: >-
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m])) by (cluster, job)

cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_bucket:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_sum:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_sum[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_series_post_intersection_per_query_count:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_series_post_intersection_per_query_count[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_chunks_per_query:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_chunks_per_query:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_chunks_per_query:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_chunk_store_chunks_per_query:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_chunk_store_chunks_per_query:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_chunk_store_chunks_per_query:avg
expr: >-
  sum(rate(cortex_chunk_store_chunks_per_query_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_chunk_store_chunks_per_query_count[1m])) by (cluster, job)

cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_chunk_store_chunks_per_query_bucket:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_chunks_per_query_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_chunks_per_query_sum:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_chunks_per_query_sum[1m]))
  by (cluster, job)

cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_chunk_store_chunks_per_query_count:sum_rate
expr: >-
  sum(rate(cortex_chunk_store_chunks_per_query_count[1m]))
  by (cluster, job)

cluster_job_method:cortex_database_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_database_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_database_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, method), from 1m bucket rates.
record: cluster_job_method:cortex_database_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_database_request_duration_seconds_bucket[1m])) by (le, cluster, job, method))

cluster_job_method:cortex_database_request_duration_seconds:avg

# Mean per (cluster, job, method): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_method:cortex_database_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_database_request_duration_seconds_sum[1m])) by (cluster, job, method)
  /
  sum(rate(cortex_database_request_duration_seconds_count[1m])) by (cluster, job, method)

cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, method).
record: cluster_job_method:cortex_database_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_database_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, method)

cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, method).
record: cluster_job_method:cortex_database_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_database_request_duration_seconds_sum[1m]))
  by (cluster, job, method)

cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, method).
record: cluster_job_method:cortex_database_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_database_request_duration_seconds_count[1m]))
  by (cluster, job, method)

cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_gcs_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job, operation), from 1m bucket rates.
record: cluster_job_operation:cortex_gcs_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_gcs_request_duration_seconds_bucket[1m])) by (le, cluster, job, operation))

cluster_job_operation:cortex_gcs_request_duration_seconds:avg

# Mean per (cluster, job, operation): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job_operation:cortex_gcs_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_gcs_request_duration_seconds_sum[1m])) by (cluster, job, operation)
  /
  sum(rate(cortex_gcs_request_duration_seconds_count[1m])) by (cluster, job, operation)

cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job, operation).
record: cluster_job_operation:cortex_gcs_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_gcs_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, operation)

cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_gcs_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_gcs_request_duration_seconds_sum[1m]))
  by (cluster, job, operation)

cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job, operation).
record: cluster_job_operation:cortex_gcs_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_gcs_request_duration_seconds_count[1m]))
  by (cluster, job, operation)

cluster_job:cortex_kv_request_duration_seconds:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_kv_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_kv_request_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_kv_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_kv_request_duration_seconds:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_kv_request_duration_seconds:avg
expr: >-
  sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)

cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_kv_request_duration_seconds_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_kv_request_duration_seconds_sum[1m]))
  by (cluster, job)

cluster_job:cortex_kv_request_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_kv_request_duration_seconds_count[1m]))
  by (cluster, job)

cortex_queries

cluster_job:cortex_query_frontend_retries:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_query_frontend_retries:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_query_frontend_retries:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_query_frontend_retries:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_query_frontend_retries:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_query_frontend_retries:avg
expr: >-
  sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)

cluster_job:cortex_query_frontend_retries_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_retries_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_query_frontend_retries_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_query_frontend_retries_sum:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_retries_sum[1m]))
  by (cluster, job)

cluster_job:cortex_query_frontend_retries_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_query_frontend_retries_count:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_retries_count[1m]))
  by (cluster, job)

cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_query_frontend_queue_duration_seconds:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg
expr: >-
  sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job)

cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m]))
  by (cluster, job)

cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate
expr: >-
  sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m]))
  by (cluster, job)

cluster_job:cortex_ingester_queried_series:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_ingester_queried_series:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_ingester_queried_series:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_ingester_queried_series:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_ingester_queried_series:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_ingester_queried_series:avg
expr: >-
  sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)

cluster_job:cortex_ingester_queried_series_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_series_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_ingester_queried_series_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_series_sum:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_series_sum[1m]))
  by (cluster, job)

cluster_job:cortex_ingester_queried_series_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_series_count:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_series_count[1m]))
  by (cluster, job)

cluster_job:cortex_ingester_queried_chunks:99quantile

# 0.99 quantile per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_ingester_queried_chunks:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_ingester_queried_chunks:50quantile

# 0.50 quantile (median) per (cluster, job), from 1m bucket rates.
record: cluster_job:cortex_ingester_queried_chunks:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_ingester_queried_chunks_bucket[1m])) by (le, cluster, job))

cluster_job:cortex_ingester_queried_chunks:avg

# Mean per (cluster, job): 1m rate of _sum divided by 1m rate of _count.
record: cluster_job:cortex_ingester_queried_chunks:avg
expr: >-
  sum(rate(cortex_ingester_queried_chunks_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_ingester_queried_chunks_count[1m])) by (cluster, job)

cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate

# 1m rate of every histogram bucket (le), summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_chunks_bucket:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_chunks_bucket[1m]))
  by (le, cluster, job)

cluster_job:cortex_ingester_queried_chunks_sum:sum_rate

# 1m rate of the histogram _sum, summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_chunks_sum:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_chunks_sum[1m]))
  by (cluster, job)

cluster_job:cortex_ingester_queried_chunks_count:sum_rate

# 1m rate of the histogram _count, summed per (cluster, job).
record: cluster_job:cortex_ingester_queried_chunks_count:sum_rate
expr: >-
  sum(rate(cortex_ingester_queried_chunks_count[1m]))
  by (cluster, job)

cluster_job:cortex_ingester_queried_samples:99quantile

# 0.99 quantile of cortex_ingester_queried_samples per (cluster, job), computed
# from the 1m rate of its buckets.
# The folded scalar (>-) joins the lines below with single spaces, so every
# line must stay at the same indentation.
expr: >-
  histogram_quantile(0.99,
  sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:99quantile

cluster_job:cortex_ingester_queried_samples:50quantile

# 0.50 quantile of cortex_ingester_queried_samples per (cluster, job), computed
# from the 1m rate of its buckets.
# The folded scalar (>-) joins the lines below with single spaces, so every
# line must stay at the same indentation.
expr: >-
  histogram_quantile(0.50,
  sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job))
record: cluster_job:cortex_ingester_queried_samples:50quantile

cluster_job:cortex_ingester_queried_samples:avg

# Average of cortex_ingester_queried_samples per (cluster, job): the 1m rate of
# the _sum series divided by the 1m rate of the _count series.
# The folded scalar (>-) joins the lines below with single spaces, so every
# line must stay at the same indentation.
expr: >-
  sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
  /
  sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples:avg

cluster_job:cortex_ingester_queried_samples_bucket:sum_rate

# Per-second rate (1m window) of the cortex_ingester_queried_samples buckets,
# summed by le, cluster and job. The le label is kept in the aggregation.
expr: >-
  sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)
record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate

cluster_job:cortex_ingester_queried_samples_sum:sum_rate

# Per-second rate (1m window) of cortex_ingester_queried_samples_sum,
# summed by cluster and job.
expr: >-
  sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate

cluster_job:cortex_ingester_queried_samples_count:sum_rate

# Per-second rate (1m window) of cortex_ingester_queried_samples_count,
# summed by cluster and job.
expr: >-
  sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)
record: cluster_job:cortex_ingester_queried_samples_count:sum_rate

cortex_received_samples

cluster_namespace_job:cortex_distributor_received_samples:rate5m

# Per-second rate (5m window) of cortex_distributor_received_samples_total,
# summed by cluster, namespace and job.
expr: |
  sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))
record: cluster_namespace_job:cortex_distributor_received_samples:rate5m

cortex_scaling_rules

cluster_namespace_deployment:actual_replicas:count

# Replica count per (cluster, namespace, deployment). The value comes from
# kube_deployment_spec_replicas; for label sets that operand does not produce,
# it falls back ("or") to kube_statefulset_replicas, whose "statefulset" label
# is copied into a "deployment" label.
# An optional trailing "-zone-<letter>" is removed from the name before summing,
# so workloads that differ only by that suffix are added together.
expr: |
  sum by (cluster, namespace, deployment) (
    label_replace(
      kube_deployment_spec_replicas,
      # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
      # always matches everything and the (optional) zone is not removed.
      "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
    )
  )
  or
  sum by (cluster, namespace, deployment) (
    label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
  )
record: cluster_namespace_deployment:actual_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="distributor", reason="sample_rate".
# Takes the per-(cluster, namespace) sum of
# cluster_namespace_job:cortex_distributor_received_samples:rate5m, evaluates
# its 0.99 quantile over a 24h subquery (default resolution), divides by 240000
# and rounds up.
# NOTE(review): 240000 is presumably the samples/s one distributor replica is
# expected to handle - confirm against the mixin configuration.
expr: |
  ceil(
    quantile_over_time(0.99,
      sum by (cluster, namespace) (
        cluster_namespace_job:cortex_distributor_received_samples:rate5m
      )[24h:]
    )
    / 240000
  )
labels:
  deployment: distributor
  reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="distributor", reason="sample_rate_limits".
# Takes the per-(cluster, namespace) sum of the cortex_overrides series with
# limit_name="ingestion_rate", scales it by 0.6, divides by 240000 and rounds
# up.
# The factor is written as 0.6; the previously rendered 0.59999999999999998 is
# the same float64 value, so the result is unchanged.
# NOTE(review): 0.6 is presumably a limit-utilisation target and 240000 the
# samples/s one distributor replica is expected to handle - confirm against the
# mixin configuration.
expr: |
  ceil(
    sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
    * 0.6 / 240000
  )
labels:
  deployment: distributor
  reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="ingester", reason="sample_rate".
# Takes the per-(cluster, namespace) sum of
# cluster_namespace_job:cortex_distributor_received_samples:rate5m, evaluates
# its 0.99 quantile over a 24h subquery (default resolution), multiplies by 3,
# divides by 80000 and rounds up.
# NOTE(review): 3 is presumably the replication factor and 80000 the samples/s
# one ingester is expected to handle - confirm against the mixin configuration.
expr: |
  ceil(
    quantile_over_time(0.99,
      sum by (cluster, namespace) (
        cluster_namespace_job:cortex_distributor_received_samples:rate5m
      )[24h:]
    )
    * 3 / 80000
  )
labels:
  deployment: ingester
  reason: sample_rate
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="ingester", reason="active_series".
# Takes the per-(cluster, namespace) sum of cortex_ingester_memory_series,
# evaluates its 0.99 quantile over a 24h subquery (default resolution), divides
# by 1500000 and rounds up.
# NOTE(review): 1500000 is presumably the number of in-memory series one
# ingester is expected to hold - confirm against the mixin configuration.
expr: |
  ceil(
    quantile_over_time(0.99,
      sum by(cluster, namespace) (
        cortex_ingester_memory_series
      )[24h:]
    )
    / 1500000
  )
labels:
  deployment: ingester
  reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="ingester", reason="active_series_limits".
# Takes the per-(cluster, namespace) sum of the cortex_overrides series with
# limit_name="max_global_series_per_user", multiplies by 3 and by 0.6, divides
# by 1500000 and rounds up.
# The factor is written as 0.6; the previously rendered 0.59999999999999998 is
# the same float64 value, so the result is unchanged.
# NOTE(review): 3 is presumably the replication factor, 0.6 a limit-utilisation
# target and 1500000 the series one ingester is expected to hold - confirm
# against the mixin configuration.
expr: |
  ceil(
    sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
    * 3 * 0.6 / 1500000
  )
labels:
  deployment: ingester
  reason: active_series_limits
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="ingester", reason="sample_rate_limits".
# Takes the per-(cluster, namespace) sum of the cortex_overrides series with
# limit_name="ingestion_rate", scales it by 0.6, divides by 80000 and rounds up.
# The factor is written as 0.6; the previously rendered 0.59999999999999998 is
# the same float64 value, so the result is unchanged.
# NOTE(review): 0.6 is presumably a limit-utilisation target and 80000 the
# samples/s one ingester is expected to handle. The ingester "sample_rate" rule
# multiplies by 3 while this expression does not - confirm that is intentional.
expr: |
  ceil(
    sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
    * 0.6 / 80000
  )
labels:
  deployment: ingester
  reason: sample_rate_limits
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas for deployment="memcached".
# Per (cluster, namespace): the sum of cortex_ingester_tsdb_storage_blocks_bytes
# for jobs matching ".+/ingester.*" is divided by 4, then divided by the average
# memcached_limit_bytes for jobs matching ".+/memcached"; the result is rounded
# up.
# NOTE(review): the reason label is "active_series" although the expression is
# driven by block storage bytes, and the meaning of the divisor 4 is not visible
# here - confirm both against the mixin source.
expr: |
  ceil(
    (sum by (cluster, namespace) (
      cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
    ) / 4)
      /
    avg by (cluster, namespace) (
      memcached_limit_bytes{job=~".+/memcached"}
    )
  )
labels:
  deployment: memcached
  reason: active_series
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate

# CPU usage summed per (cluster, namespace, deployment), read from
# node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate.
# The "deployment" label is derived from the "pod" label in two steps:
#   1. the inner label_replace keeps the part of the pod name before a trailing
#      "-<digits>" or "-<alnum>-<alnum>" suffix (presumably a StatefulSet
#      ordinal or a Deployment pod suffix - confirm);
#   2. the outer label_replace removes an optional trailing "-zone-<letter>".
expr: |
  sum by (cluster, namespace, deployment) (
    label_replace(
      label_replace(
        node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
        "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
      ),
      # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
      # always matches everything and the (optional) zone is not removed.
      "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
    )
  )
record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate

cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum


# CPU requests summed per (cluster, namespace, deployment).
# Two source metrics are combined with "or": the left operand reads
# kube_pod_container_resource_requests_cpu_cores, and the right operand
# (kube_pod_container_resource_requests{resource="cpu"}) contributes only the
# label sets the left operand does not produce.
# In both operands the "deployment" label is derived from the "pod" label by
# dropping a trailing "-<digits>" or "-<alnum>-<alnum>" suffix and then an
# optional trailing "-zone-<letter>".
expr: |
  # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
  # that remove resource metrics, ref:
  # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
  # - https://github.com/kubernetes/kube-state-metrics/pull/1004
  #
  # This is the old expression, compatible with kube-state-metrics < v2.0.0,
  # where kube_pod_container_resource_requests_cpu_cores was removed:
  (
    sum by (cluster, namespace, deployment) (
      label_replace(
        label_replace(
          kube_pod_container_resource_requests_cpu_cores,
          "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
        ),
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
  )
  or
  # This expression is compatible with kube-state-metrics >= v1.4.0,
  # where kube_pod_container_resource_requests was introduced.
  (
    sum by (cluster, namespace, deployment) (
      label_replace(
        label_replace(
          kube_pod_container_resource_requests{resource="cpu"},
          "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
        ),
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
  )
record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas with reason="cpu_usage".
# Multiplies the actual replica count by the ratio of CPU usage (0.99 quantile
# over the last 24h) to summed CPU requests, then rounds up.
# Only the "reason" label is set here; the three operands are all recorded per
# (cluster, namespace, deployment), so those labels come from the series.
expr: |
  ceil(
    cluster_namespace_deployment:actual_replicas:count
      *
    quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
      /
    cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
  )
labels:
  reason: cpu_usage
record: cluster_namespace_deployment_reason:required_replicas:count

cluster_namespace_deployment:container_memory_usage_bytes:sum

# container_memory_usage_bytes summed per (cluster, namespace, deployment).
# The "deployment" label is derived from the "pod" label in two steps:
#   1. the inner label_replace keeps the part of the pod name before a trailing
#      "-<digits>" or "-<alnum>-<alnum>" suffix (presumably a StatefulSet
#      ordinal or a Deployment pod suffix - confirm);
#   2. the outer label_replace removes an optional trailing "-zone-<letter>".
expr: |
  sum by (cluster, namespace, deployment) (
    label_replace(
      label_replace(
        container_memory_usage_bytes,
        "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
      ),
      # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
      # always matches everything and the (optional) zone is not removed.
      "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
    )
  )
record: cluster_namespace_deployment:container_memory_usage_bytes:sum

cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum


# Memory requests summed per (cluster, namespace, deployment).
# Two source metrics are combined with "or": the left operand reads
# kube_pod_container_resource_requests_memory_bytes, and the right operand
# (kube_pod_container_resource_requests{resource="memory"}) contributes only
# the label sets the left operand does not produce.
# In both operands the "deployment" label is derived from the "pod" label by
# dropping a trailing "-<digits>" or "-<alnum>-<alnum>" suffix and then an
# optional trailing "-zone-<letter>".
expr: |
  # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
  # that remove resource metrics, ref:
  # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16
  # - https://github.com/kubernetes/kube-state-metrics/pull/1004
  #
  # This is the old expression, compatible with kube-state-metrics < v2.0.0,
  # where kube_pod_container_resource_requests_memory_bytes was removed:
  (
    sum by (cluster, namespace, deployment) (
      label_replace(
        label_replace(
          kube_pod_container_resource_requests_memory_bytes,
          "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
        ),
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
  )
  or
  # This expression is compatible with kube-state-metrics >= v1.4.0,
  # where kube_pod_container_resource_requests was introduced.
  (
    sum by (cluster, namespace, deployment) (
      label_replace(
        label_replace(
          kube_pod_container_resource_requests{resource="memory"},
          "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
        ),
        # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
        # always matches everything and the (optional) zone is not removed.
        "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"
      )
    )
  )
record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum

cluster_namespace_deployment_reason:required_replicas:count

# Required replicas with reason="memory_usage".
# Multiplies the actual replica count by the ratio of memory usage (0.99
# quantile over the last 24h) to summed memory requests, then rounds up.
# Only the "reason" label is set here; the three operands are all recorded per
# (cluster, namespace, deployment), so those labels come from the series.
expr: |
  ceil(
    cluster_namespace_deployment:actual_replicas:count
      *
    quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h])
      /
    cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
  )
labels:
  reason: memory_usage
record: cluster_namespace_deployment_reason:required_replicas:count

cortex_alertmanager_rules

cluster_job_pod:cortex_alertmanager_alerts:sum

# Sum of cortex_alertmanager_alerts by cluster, job and pod.
expr: |
  sum by (cluster, job, pod) (cortex_alertmanager_alerts)
record: cluster_job_pod:cortex_alertmanager_alerts:sum

cluster_job_pod:cortex_alertmanager_silences:sum

# Sum of cortex_alertmanager_silences by cluster, job and pod.
expr: |
  sum by (cluster, job, pod) (cortex_alertmanager_silences)
record: cluster_job_pod:cortex_alertmanager_silences:sum

cluster_job:cortex_alertmanager_alerts_received_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_alerts_received_total,
# summed by cluster and job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m

cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_alerts_invalid_total,
# summed by cluster and job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))
record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m

cluster_job_integration:cortex_alertmanager_notifications_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_notifications_total,
# summed by cluster, job and integration.
expr: |
  sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m

cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_notifications_failed_total,
# summed by cluster, job and integration.
expr: |
  sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))
record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m

cluster_job:cortex_alertmanager_state_replication_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_state_replication_total,
# summed by cluster and job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_total:rate5m

cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m

# Per-second rate (5m window) of
# cortex_alertmanager_state_replication_failed_total, summed by cluster and job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))
record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m

cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m

# Per-second rate (5m window) of cortex_alertmanager_partial_state_merges_total,
# summed by cluster and job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m

cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m

# Per-second rate (5m window) of
# cortex_alertmanager_partial_state_merges_failed_total, summed by cluster and
# job.
expr: |
  sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))
record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m

Dashboards

The following dashboards are generated from mixins and hosted on GitHub: