thanos

Overview

The Jsonnet source code is available at github.com/thanos-io/thanos.

Alerts

A complete list of pregenerated alerts is available here.
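
These are standard Prometheus alerting rule files, so they can be loaded by a Prometheus server (or by Thanos Rule) alongside the recording rules further down. Below is a minimal sketch of wiring the rendered files into Prometheus via rule_files; the file paths are illustrative and depend on where the generated output is placed:

# prometheus.yml (fragment) -- paths are illustrative; the rule files are
# rendered from the Jsonnet source linked above.
rule_files:
  - /etc/prometheus/rules/thanos-alerts.yaml
  - /etc/prometheus/rules/thanos-rules.yaml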

thanos-compact.rules

ThanosCompactMultipleRunning

alert: ThanosCompactMultipleRunning
annotations:
  description: No more than one Thanos Compact instance should be running at once.
    There are {{ $value }} instances running.
  summary: Thanos Compact has multiple instances running.
expr: sum(up{job=~"thanos-compact.*"}) > 1
for: 5m
labels:
  severity: warning

ThanosCompactHalted

alert: ThanosCompactHalted
annotations:
  description: Thanos Compact {{$labels.job}} has failed to run and is now halted.
  summary: Thanos Compact has failed to run and is now halted.
expr: thanos_compactor_halted{job=~"thanos-compact.*"} == 1
for: 5m
labels:
  severity: warning

ThanosCompactHighCompactionFailures

alert: ThanosCompactHighCompactionFailures
annotations:
  description: Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize
    }}% of compactions.
  summary: Thanos Compact is failing to execute compactions.
expr: |
  (
    sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~"thanos-compact.*"}[5m]))
  /
    sum by (job) (rate(thanos_compact_group_compactions_total{job=~"thanos-compact.*"}[5m]))
  * 100 > 5
  )
for: 15m
labels:
  severity: warning

ThanosCompactBucketHighOperationFailures

alert: ThanosCompactBucketHighOperationFailures
annotations:
  description: Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value
    | humanize }}% of operations.
  summary: Thanos Compact Bucket is having a high number of operation failures.
expr: |
  (
    sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-compact.*"}[5m]))
  /
    sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-compact.*"}[5m]))
  * 100 > 5
  )
for: 15m
labels:
  severity: warning

ThanosCompactHasNotRun

alert: ThanosCompactHasNotRun
annotations:
  description: Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.
  summary: Thanos Compact has not uploaded anything for the last 24 hours.
expr: (time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~"thanos-compact.*"}[24h])))
  / 60 / 60 > 24
labels:
  severity: warning

thanos-query.rules

ThanosQueryHttpRequestQueryErrorRateHigh

alert: ThanosQueryHttpRequestQueryErrorRateHigh
annotations:
  description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of "query" requests.
  summary: Thanos Query is failing to handle requests.
expr: |
  (
    sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query"}[5m]))
  /
    sum(rate(http_requests_total{job=~"thanos-query.*", handler="query"}[5m]))
  ) * 100 > 5
for: 5m
labels:
  severity: critical

ThanosQueryHttpRequestQueryRangeErrorRateHigh

alert: ThanosQueryHttpRequestQueryRangeErrorRateHigh
annotations:
  description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of "query_range" requests.
  summary: Thanos Query is failing to handle requests.
expr: |
  (
    sum(rate(http_requests_total{code=~"5..", job=~"thanos-query.*", handler="query_range"}[5m]))
  /
    sum(rate(http_requests_total{job=~"thanos-query.*", handler="query_range"}[5m]))
  ) * 100 > 5
for: 5m
labels:
  severity: critical

ThanosQueryGrpcServerErrorRate

alert: ThanosQueryGrpcServerErrorRate
annotations:
  description: Thanos Query {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of requests.
  summary: Thanos Query is failing to handle requests.
expr: |
  (
    sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*"}[5m]))
  /
    sum by (job) (rate(grpc_server_started_total{job=~"thanos-query.*"}[5m]))
  * 100 > 5
  )
for: 5m
labels:
  severity: warning

ThanosQueryGrpcClientErrorRate

alert: ThanosQueryGrpcClientErrorRate
annotations:
  description: Thanos Query {{$labels.job}} is failing to send {{ $value | humanize
    }}% of requests.
  summary: Thanos Query is failing to send requests.
expr: |
  (
    sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~"thanos-query.*"}[5m]))
  /
    sum by (job) (rate(grpc_client_started_total{job=~"thanos-query.*"}[5m]))
  ) * 100 > 5
for: 5m
labels:
  severity: warning

ThanosQueryHighDNSFailures

alert: ThanosQueryHighDNSFailures
annotations:
  description: Thanos Query {{$labels.job}} has {{ $value | humanize }}% of failing
    DNS queries for store endpoints.
  summary: Thanos Query is having a high number of DNS failures.
expr: |
  (
    sum by (job) (rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
  /
    sum by (job) (rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
  ) * 100 > 1
for: 15m
labels:
  severity: warning

ThanosQueryInstantLatencyHigh

alert: ThanosQueryInstantLatencyHigh
annotations:
  description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
    }} seconds for instant queries.
  summary: Thanos Query has high latency for queries.
expr: |
  (
    histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m]))) > 40
  and
    sum by (job) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) > 0
  )
for: 10m
labels:
  severity: critical

ThanosQueryRangeLatencyHigh

alert: ThanosQueryRangeLatencyHigh
annotations:
  description: Thanos Query {{$labels.job}} has a 99th percentile latency of {{ $value
    }} seconds for range queries.
  summary: Thanos Query has high latency for queries.
expr: |
  (
    histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m]))) > 90
  and
    sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-query.*", handler="query_range"}[5m])) > 0
  )
for: 10m
labels:
  severity: critical

thanos-receive.rules

ThanosReceiveHttpRequestErrorRateHigh

alert: ThanosReceiveHttpRequestErrorRateHigh
annotations:
  description: Thanos Receive {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of requests.
  summary: Thanos Receive is failing to handle requests.
expr: |
  (
    sum(rate(http_requests_total{code=~"5..", job=~"thanos-receive.*", handler="receive"}[5m]))
  /
    sum(rate(http_requests_total{job=~"thanos-receive.*", handler="receive"}[5m]))
  ) * 100 > 5
for: 5m
labels:
  severity: critical

ThanosReceiveHttpRequestLatencyHigh

alert: ThanosReceiveHttpRequestLatencyHigh
annotations:
  description: Thanos Receive {{$labels.job}} has a 99th percentile latency of {{
    $value }} seconds for requests.
  summary: Thanos Receive has high HTTP requests latency.
expr: |
  (
    histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~"thanos-receive.*", handler="receive"}[5m]))) > 10
  and
    sum by (job) (rate(http_request_duration_seconds_count{job=~"thanos-receive.*", handler="receive"}[5m])) > 0
  )
for: 10m
labels:
  severity: critical

ThanosReceiveHighReplicationFailures

alert: ThanosReceiveHighReplicationFailures
annotations:
  description: Thanos Receive {{$labels.job}} is failing to replicate {{ $value |
    humanize }}% of requests.
  summary: Thanos Receive is having a high number of replication failures.
expr: |
  thanos_receive_replication_factor > 1
    and
  (
    (
      sum by (job) (rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
    /
      sum by (job) (rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
    )
    >
    (
      max by (job) (floor((thanos_receive_replication_factor{job=~"thanos-receive.*"}+1) / 2))
    /
      max by (job) (thanos_receive_hashring_nodes{job=~"thanos-receive.*"})
    )
  ) * 100
for: 5m
labels:
  severity: warning

ThanosReceiveHighForwardRequestFailures

alert: ThanosReceiveHighForwardRequestFailures
annotations:
  description: Thanos Receive {{$labels.job}} is failing to forward {{ $value | humanize
    }}% of requests.
  summary: Thanos Receive is failing to forward requests.
expr: |
  (
    sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
  /
    sum by (job) (rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
  ) * 100 > 20
for: 5m
labels:
  severity: warning

ThanosReceiveHighHashringFileRefreshFailures

alert: ThanosReceiveHighHashringFileRefreshFailures
annotations:
  description: Thanos Receive {{$labels.job}} is failing to refresh the hashring
    file; {{ $value | humanize }} of attempts failed.
  summary: Thanos Receive is failing to refresh the hashring file.
expr: |
  (
    sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
  /
    sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
  > 0
  )
for: 15m
labels:
  severity: warning

ThanosReceiveConfigReloadFailure

alert: ThanosReceiveConfigReloadFailure
annotations:
  description: Thanos Receive {{$labels.job}} has not been able to reload hashring
    configurations.
  summary: Thanos Receive has not been able to reload configuration.
expr: avg(thanos_receive_config_last_reload_successful{job=~"thanos-receive.*"}) by
  (job) != 1
for: 5m
labels:
  severity: warning

ThanosReceiveNoUpload

alert: ThanosReceiveNoUpload
annotations:
  description: Thanos Receive {{ $labels.instance }} of {{$labels.job}} has not uploaded
    the latest data to object storage.
  summary: Thanos Receive has not uploaded the latest data to object storage.
expr: |
  (up{job=~"thanos-receive.*"} - 1)
  + on (instance) # filters to only alert on current instance last 3h
  (sum by (instance) (increase(thanos_shipper_uploads_total{job=~"thanos-receive.*"}[3h])) == 0)
for: 3h
labels:
  severity: critical

thanos-sidecar.rules

ThanosSidecarPrometheusDown

alert: ThanosSidecarPrometheusDown
annotations:
  description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect to Prometheus.
  summary: Thanos Sidecar cannot connect to Prometheus.
expr: |
  sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"thanos-sidecar.*"} == 0)
for: 5m
labels:
  severity: critical

ThanosSidecarUnhealthy

alert: ThanosSidecarUnhealthy
annotations:
  description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} has been unhealthy
    for {{ $value }} seconds.
  summary: Thanos Sidecar is unhealthy.
expr: |
  time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600
labels:
  severity: critical

thanos-store.rules

ThanosStoreGrpcErrorRate

alert: ThanosStoreGrpcErrorRate
annotations:
  description: Thanos Store {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of requests.
  summary: Thanos Store is failing to handle gRPC requests.
expr: |
  (
    sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*"}[5m]))
  /
    sum by (job) (rate(grpc_server_started_total{job=~"thanos-store.*"}[5m]))
  * 100 > 5
  )
for: 5m
labels:
  severity: warning

ThanosStoreSeriesGateLatencyHigh

alert: ThanosStoreSeriesGateLatencyHigh
annotations:
  description: Thanos Store {{$labels.job}} has a 99th percentile latency of {{ $value
    }} seconds for store series gate requests.
  summary: Thanos Store has high latency for store series gate requests.
expr: |
  (
    histogram_quantile(0.9, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
  and
    sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
  )
for: 10m
labels:
  severity: warning

ThanosStoreBucketHighOperationFailures

alert: ThanosStoreBucketHighOperationFailures
annotations:
  description: Thanos Store {{$labels.job}} Bucket is failing to execute {{ $value
    | humanize }}% of operations.
  summary: Thanos Store Bucket is failing to execute operations.
expr: |
  (
    sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
  /
    sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
  * 100 > 5
  )
for: 15m
labels:
  severity: warning

ThanosStoreObjstoreOperationLatencyHigh

alert: ThanosStoreObjstoreOperationLatencyHigh
annotations:
  description: Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of
    {{ $value }} seconds for the bucket operations.
  summary: Thanos Store is having high latency for bucket operations.
expr: |
  (
    histogram_quantile(0.9, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m]))) > 2
  and
    sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~"thanos-store.*"}[5m])) > 0
  )
for: 10m
labels:
  severity: warning

thanos-rule.rules

ThanosRuleQueueIsDroppingAlerts

alert: ThanosRuleQueueIsDroppingAlerts
annotations:
  description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to queue alerts.
  summary: Thanos Rule is failing to queue alerts.
expr: |
  sum by (job) (rate(thanos_alert_queue_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
labels:
  severity: critical

ThanosRuleSenderIsFailingAlerts

alert: ThanosRuleSenderIsFailingAlerts
annotations:
  description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to send alerts
    to alertmanager.
  summary: Thanos Rule is failing to send alerts to alertmanager.
expr: |
  sum by (job) (rate(thanos_alert_sender_alerts_dropped_total{job=~"thanos-rule.*"}[5m])) > 0
for: 5m
labels:
  severity: critical

ThanosRuleHighRuleEvaluationFailures

alert: ThanosRuleHighRuleEvaluationFailures
annotations:
  description: Thanos Rule {{$labels.job}} {{$labels.pod}} is failing to evaluate
    rules.
  summary: Thanos Rule is failing to evaluate rules.
expr: |
  (
    sum by (job) (rate(prometheus_rule_evaluation_failures_total{job=~"thanos-rule.*"}[5m]))
  /
    sum by (job) (rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[5m]))
  * 100 > 5
  )
for: 5m
labels:
  severity: critical

ThanosRuleHighRuleEvaluationWarnings

alert: ThanosRuleHighRuleEvaluationWarnings
annotations:
  description: Thanos Rule {{$labels.job}} {{$labels.pod}} has a high number of evaluation
    warnings.
  summary: Thanos Rule has a high number of evaluation warnings.
expr: |
  sum by (job) (rate(thanos_rule_evaluation_with_warnings_total{job=~"thanos-rule.*"}[5m])) > 0
for: 15m
labels:
  severity: info

ThanosRuleRuleEvaluationLatencyHigh

alert: ThanosRuleRuleEvaluationLatencyHigh
annotations:
  description: Thanos Rule {{$labels.job}}/{{$labels.pod}} has a higher evaluation
    latency than its interval for {{$labels.rule_group}}.
  summary: Thanos Rule has high rule evaluation latency.
expr: |
  (
    sum by (job, pod, rule_group) (prometheus_rule_group_last_duration_seconds{job=~"thanos-rule.*"})
  >
    sum by (job, pod, rule_group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
  )
for: 5m
labels:
  severity: warning

ThanosRuleGrpcErrorRate

alert: ThanosRuleGrpcErrorRate
annotations:
  description: Thanos Rule {{$labels.job}} is failing to handle {{ $value | humanize
    }}% of requests.
  summary: Thanos Rule is failing to handle grpc requests.
expr: |
  (
    sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-rule.*"}[5m]))
  /
    sum by (job) (rate(grpc_server_started_total{job=~"thanos-rule.*"}[5m]))
  * 100 > 5
  )
for: 5m
labels:
  severity: warning

ThanosRuleConfigReloadFailure

alert: ThanosRuleConfigReloadFailure
annotations:
  description: Thanos Rule {{$labels.job}} has not been able to reload its configuration.
  summary: Thanos Rule has not been able to reload configuration.
expr: avg(thanos_rule_config_last_reload_successful{job=~"thanos-rule.*"}) by (job)
  != 1
for: 5m
labels:
  severity: info

ThanosRuleQueryHighDNSFailures

alert: ThanosRuleQueryHighDNSFailures
annotations:
  description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
    DNS queries for query endpoints.
  summary: Thanos Rule is having a high number of DNS failures.
expr: |
  (
    sum by (job) (rate(thanos_ruler_query_apis_dns_failures_total{job=~"thanos-rule.*"}[5m]))
  /
    sum by (job) (rate(thanos_ruler_query_apis_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
  * 100 > 1
  )
for: 15m
labels:
  severity: warning

ThanosRuleAlertmanagerHighDNSFailures

alert: ThanosRuleAlertmanagerHighDNSFailures
annotations:
  description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% of failing
    DNS queries for Alertmanager endpoints.
  summary: Thanos Rule is having a high number of DNS failures.
expr: |
  (
    sum by (job) (rate(thanos_ruler_alertmanagers_dns_failures_total{job=~"thanos-rule.*"}[5m]))
  /
    sum by (job) (rate(thanos_ruler_alertmanagers_dns_lookups_total{job=~"thanos-rule.*"}[5m]))
  * 100 > 1
  )
for: 15m
labels:
  severity: warning

ThanosRuleNoEvaluationFor10Intervals

alert: ThanosRuleNoEvaluationFor10Intervals
annotations:
  description: Thanos Rule {{$labels.job}} has {{ $value | humanize }}% rule groups
    that did not evaluate for at least 10x of their expected interval.
  summary: Thanos Rule has rule groups that did not evaluate for 10 intervals.
expr: |
  time() -  max by (job, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~"thanos-rule.*"})
  >
  10 * max by (job, group) (prometheus_rule_group_interval_seconds{job=~"thanos-rule.*"})
for: 5m
labels:
  severity: info

ThanosNoRuleEvaluations

alert: ThanosNoRuleEvaluations
annotations:
  description: Thanos Rule {{$labels.job}} did not perform any rule evaluations in
    the past 2 minutes.
  summary: Thanos Rule did not perform any rule evaluations.
expr: |
  sum(rate(prometheus_rule_evaluations_total{job=~"thanos-rule.*"}[2m])) <= 0
    and
  sum(thanos_rule_loaded_rules{job=~"thanos-rule.*"}) > 0
for: 3m
labels:
  severity: critical

thanos-component-absent.rules

ThanosCompactIsDown

alert: ThanosCompactIsDown
annotations:
  description: ThanosCompact has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-compact.*"} == 1)
for: 5m
labels:
  severity: critical

ThanosQueryIsDown

alert: ThanosQueryIsDown
annotations:
  description: ThanosQuery has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-query.*"} == 1)
for: 5m
labels:
  severity: critical

ThanosReceiveIsDown

alert: ThanosReceiveIsDown
annotations:
  description: ThanosReceive has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-receive.*"} == 1)
for: 5m
labels:
  severity: critical

ThanosRuleIsDown

alert: ThanosRuleIsDown
annotations:
  description: ThanosRule has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-rule.*"} == 1)
for: 5m
labels:
  severity: critical

ThanosSidecarIsDown

alert: ThanosSidecarIsDown
annotations:
  description: ThanosSidecar has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-sidecar.*"} == 1)
for: 5m
labels:
  severity: critical

ThanosStoreIsDown

alert: ThanosStoreIsDown
annotations:
  description: ThanosStore has disappeared from Prometheus target discovery.
  summary: thanos component has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-store.*"} == 1)
for: 5m
labels:
  severity: critical

thanos-bucket-replicate.rules

ThanosBucketReplicateIsDown

alert: ThanosBucketReplicateIsDown
annotations:
  description: Thanos Replicate has disappeared from Prometheus target discovery.
  summary: Thanos Replicate has disappeared from Prometheus target discovery.
expr: |
  absent(up{job=~"thanos-bucket-replicate.*"})
for: 5m
labels:
  severity: critical

ThanosBucketReplicateErrorRate

alert: ThanosBucketReplicateErrorRate
annotations:
  description: Thanos Replicate is failing to run; {{ $value | humanize }}% of attempts
    failed.
  summary: Thanos Replicate is failing to run.
expr: |
  (
    sum(rate(thanos_replicate_replication_runs_total{result="error", job=~"thanos-bucket-replicate.*"}[5m]))
  / on (namespace) group_left
    sum(rate(thanos_replicate_replication_runs_total{job=~"thanos-bucket-replicate.*"}[5m]))
  ) * 100 >= 10
for: 5m
labels:
  severity: critical

ThanosBucketReplicateRunLatency

alert: ThanosBucketReplicateRunLatency
annotations:
  description: Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{
    $value }} seconds for the replicate operations.
  summary: Thanos Replicate has a high latency for replicate operations.
expr: |
  (
    histogram_quantile(0.9, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m]))) > 20
  and
    sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~"thanos-bucket-replicate.*"}[5m])) > 0
  )
for: 5m
labels:
  severity: critical

Recording rules

A complete list of pregenerated recording rules is available here.
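
Each record/expr pair below ships inside a Prometheus rule group, using the same file format as the alerts above. Below is a minimal sketch of one rendered group, reusing the first rule of thanos-query.rules; the group name mirrors the headings below, while the evaluation interval is illustrative:

groups:
  - name: thanos-query.rules
    # Illustrative; omit interval to fall back to the global evaluation interval.
    interval: 30s
    rules:
      - record: :grpc_client_failures_per_unary:sum_rate
        expr: |
          (
            sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
          /
            sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
          )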

thanos-query.rules

:grpc_client_failures_per_unary:sum_rate

expr: |
  (
    sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="unary"}[5m]))
  /
    sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="unary"}[5m]))
  )
record: :grpc_client_failures_per_unary:sum_rate

:grpc_client_failures_per_stream:sum_rate

expr: |
  (
    sum(rate(grpc_client_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
  /
    sum(rate(grpc_client_started_total{job=~"thanos-query.*", grpc_type="server_stream"}[5m]))
  )
record: :grpc_client_failures_per_stream:sum_rate

:thanos_querier_store_apis_dns_failures_per_lookup:sum_rate

expr: |
  (
    sum(rate(thanos_querier_store_apis_dns_failures_total{job=~"thanos-query.*"}[5m]))
  /
    sum(rate(thanos_querier_store_apis_dns_lookups_total{job=~"thanos-query.*"}[5m]))
  )
record: :thanos_querier_store_apis_dns_failures_per_lookup:sum_rate

:query_duration_seconds:histogram_quantile

expr: |
  histogram_quantile(0.99,
    sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query"}[5m])) by (le)
  )
labels:
  quantile: "0.99"
record: :query_duration_seconds:histogram_quantile

:api_range_query_duration_seconds:histogram_quantile

expr: |
  histogram_quantile(0.99,
    sum(rate(http_request_duration_seconds_bucket{job=~"thanos-query.*", handler="query_range"}[5m])) by (le)
  )
labels:
  quantile: "0.99"
record: :api_range_query_duration_seconds:histogram_quantile

thanos-receive.rules

:grpc_server_failures_per_unary:sum_rate

expr: |
  sum(
    rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="unary"}[5m])
  /
    rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="unary"}[5m])
  )
record: :grpc_server_failures_per_unary:sum_rate

:grpc_server_failures_per_stream:sum_rate

expr: |
  sum(
    rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
  /
    rate(grpc_server_started_total{job=~"thanos-receive.*", grpc_type="server_stream"}[5m])
  )
record: :grpc_server_failures_per_stream:sum_rate

:http_failure_per_request:sum_rate

expr: |
  sum(
    rate(http_requests_total{handler="receive", job=~"thanos-receive.*", code!~"5.."}[5m])
  /
    rate(http_requests_total{handler="receive", job=~"thanos-receive.*"}[5m])
  )
record: :http_failure_per_request:sum_rate

:http_request_duration_seconds:histogram_quantile

expr: |
  histogram_quantile(0.99,
    sum(rate(http_request_duration_seconds_bucket{handler="receive", job=~"thanos-receive.*"}[5m])) by (le)
  )
labels:
  quantile: "0.99"
record: :http_request_duration_seconds:histogram_quantile

:thanos_receive_replication_failure_per_requests:sum_rate

expr: |
  (
    sum(rate(thanos_receive_replications_total{result="error", job=~"thanos-receive.*"}[5m]))
  /
    sum(rate(thanos_receive_replications_total{job=~"thanos-receive.*"}[5m]))
  )
record: :thanos_receive_replication_failure_per_requests:sum_rate

:thanos_receive_forward_failure_per_requests:sum_rate

expr: |
  (
    sum(rate(thanos_receive_forward_requests_total{result="error", job=~"thanos-receive.*"}[5m]))
  /
    sum(rate(thanos_receive_forward_requests_total{job=~"thanos-receive.*"}[5m]))
  )
record: :thanos_receive_forward_failure_per_requests:sum_rate

:thanos_receive_hashring_file_failure_per_refresh:sum_rate

expr: |
  (
    sum(rate(thanos_receive_hashrings_file_errors_total{job=~"thanos-receive.*"}[5m]))
  /
    sum(rate(thanos_receive_hashrings_file_refreshes_total{job=~"thanos-receive.*"}[5m]))
  )
record: :thanos_receive_hashring_file_failure_per_refresh:sum_rate

thanos-store.rules

:grpc_server_failures_per_unary:sum_rate

expr: |
  (
    sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="unary"}[5m]))
  /
    sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="unary"}[5m]))
  )
record: :grpc_server_failures_per_unary:sum_rate

:grpc_server_failures_per_stream:sum_rate

expr: |
  (
    sum(rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
  /
    sum(rate(grpc_server_started_total{job=~"thanos-store.*", grpc_type="server_stream"}[5m]))
  )
record: :grpc_server_failures_per_stream:sum_rate

:thanos_objstore_bucket_failures_per_operation:sum_rate

expr: |
  (
    sum(rate(thanos_objstore_bucket_operation_failures_total{job=~"thanos-store.*"}[5m]))
  /
    sum(rate(thanos_objstore_bucket_operations_total{job=~"thanos-store.*"}[5m]))
  )
record: :thanos_objstore_bucket_failures_per_operation:sum_rate

:thanos_objstore_bucket_operation_duration_seconds:histogram_quantile

expr: |
  histogram_quantile(0.99,
    sum(rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~"thanos-store.*"}[5m])) by (le)
  )
labels:
  quantile: "0.99"
record: :thanos_objstore_bucket_operation_duration_seconds:histogram_quantile

thanos-bucket-replicate.rules

Dashboards

The following dashboards are generated from the mixin and hosted on GitHub: