jaeger

Overview

Jsonnet source code is available at github.com/grafana/jsonnet-libs

Alerts

A complete list of pregenerated alerts is available here.

jaeger_alerts

JaegerAgentUDPPacketsBeingDropped

# Warning-level rule: fires once the per-second rate of
# jaeger_agent_thrift_udp_server_packets_dropped_total (1m window) has stayed
# above 1 for 15 minutes.
alert: JaegerAgentUDPPacketsBeingDropped
expr: >-
  rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the rate produced by expr, rendered with two decimals.
  message: >
    {{ $labels.job }} {{ $labels.instance }} is dropping
    {{ printf "%.2f" $value }} UDP packets per second.

JaegerAgentHTTPServerErrs

# Warning-level rule: fires once the ratio of
# jaeger_agent_http_server_errors_total to jaeger_agent_http_server_total
# (1m rates, summed per instance/job/namespace, times 100) has stayed above 1
# for 15 minutes.
alert: JaegerAgentHTTPServerErrs
expr: >-
  100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    {{ $labels.job }} {{ $labels.instance }} is experiencing
    {{ printf "%.2f" $value }}% HTTP errors.

JaegerClientSpansDropped

# Warning-level rule: fires once the share of jaeger_reporter_spans whose
# result label matches "dropped" or "err" (1m rates, summed per
# instance/job/namespace, times 100) has stayed above 1 for 15 minutes.
alert: JaegerClientSpansDropped
expr: >-
  100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    service {{ $labels.job }} {{ $labels.instance }} is dropping
    {{ printf "%.2f" $value }}% spans.

JaegerAgentSpansDropped

# Warning-level rule: fires once the ratio of
# jaeger_agent_reporter_batches_failures_total to
# jaeger_agent_reporter_batches_submitted_total (1m rates, summed per
# instance/job/namespace, times 100) has stayed above 1 for 15 minutes.
alert: JaegerAgentSpansDropped
expr: >-
  100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    agent {{ $labels.job }} {{ $labels.instance }} is dropping
    {{ printf "%.2f" $value }}% spans.

JaegerCollectorQueueNotDraining

# Warning-level rule: fires once the 10-minute average of
# jaeger_collector_queue_length has stayed above 1000 for 15 minutes.
alert: JaegerCollectorQueueNotDraining
expr: >-
  avg_over_time(jaeger_collector_queue_length[10m]) > 1000
for: 15m
labels:
  severity: warning
annotations:
  message: >
    collector {{ $labels.job }} {{ $labels.instance }}
    is not able to drain the queue.

JaegerCollectorDroppingSpans

# Warning-level rule: fires once the ratio of
# jaeger_collector_spans_dropped_total to jaeger_collector_spans_received_total
# (1m rates, summed per instance/job/namespace, times 100) has stayed above 1
# for 15 minutes.
alert: JaegerCollectorDroppingSpans
expr: >-
  100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    collector {{ $labels.job }} {{ $labels.instance }} is dropping
    {{ printf "%.2f" $value }}% spans.

JaegerSamplingUpdateFailing

# Warning-level rule: fires once the share of jaeger_sampler_queries with
# result="err" (1m rates, summed per instance/job/namespace, times 100) has
# stayed above 1 for 15 minutes.
alert: JaegerSamplingUpdateFailing
expr: >-
  100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    {{ $labels.job }} {{ $labels.instance }} is failing
    {{ printf "%.2f" $value }}% in updating sampling policies.

JaegerCollectorPersistenceSlow

# Warning-level rule: fires once the 0.99 quantile estimated from
# jaeger_collector_save_latency_bucket (1m rates) has stayed above 0.5 for
# 15 minutes.
alert: JaegerCollectorPersistenceSlow
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
# The aggregation keeps instance, job and namespace next to le so that the
# labels rendered by the message template exist on the resulting series and
# each collector is evaluated on its own, matching the grouping used by the
# other Jaeger alerts.
expr: >-
  histogram_quantile(0.99,
  sum by (le, instance, job, namespace)
  (rate(jaeger_collector_save_latency_bucket[1m])))
  > 0.5
for: 15m
labels:
  severity: warning

JaegerThrottlingUpdateFailing

# Warning-level rule: fires once the share of jaeger_throttler_updates with
# result="err" (1m rates, summed per instance/job/namespace, times 100) has
# stayed above 1 for 15 minutes.
alert: JaegerThrottlingUpdateFailing
expr: >-
  100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: >
    {{ $labels.job }} {{ $labels.instance }} is failing
    {{ printf "%.2f" $value }}% in updating throttling policies.

JaegerQueryReqsFailing

# Warning-level rule: fires once the share of jaeger_query_requests_total with
# result="err" (1m rates, times 100) has stayed above 1 for 15 minutes.
alert: JaegerQueryReqsFailing
annotations:
  # $value is the percentage computed by expr, rendered with two decimals.
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
# operation is part of both groupings so the label rendered by the message
# template is present on the resulting series; the error percentage is
# therefore evaluated per operation.
# NOTE(review): assumes jaeger_query_requests_total carries an operation
# label — confirm against the metrics exposed by the query service.
expr: >-
  100 * sum(rate(jaeger_query_requests_total{result="err"}[1m]))
  by (instance, job, namespace, operation)
  / sum(rate(jaeger_query_requests_total[1m]))
  by (instance, job, namespace, operation)
  > 1
for: 15m
labels:
  severity: warning

JaegerCassandraWritesFailing

# Warning-level rule: fires once the ratio of jaeger_cassandra_errors_total to
# jaeger_cassandra_attempts_total (1m rates, summed per
# instance/job/namespace, times 100) has stayed above 1 for 15 minutes.
alert: JaegerCassandraWritesFailing
annotations:
  # Only instance, job and namespace survive the aggregation in expr, so the
  # message references just those labels. $value is the error percentage.
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% Cassandra write errors.
expr: >-
  100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)
  > 1
for: 15m
labels:
  severity: warning

JaegerCassandraReadsFailing

# Warning-level rule: fires once the ratio of
# jaeger_cassandra_read_errors_total to jaeger_cassandra_read_attempts_total
# (1m rates, summed per instance/job/namespace, times 100) has stayed above 1
# for 15 minutes.
alert: JaegerCassandraReadsFailing
annotations:
  # Only instance, job and namespace survive the aggregation in expr, so the
  # message references just those labels. $value is the error percentage.
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% Cassandra read errors.
expr: >-
  100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)
  > 1
for: 15m
labels:
  severity: warning

Dashboards

The following dashboards are generated from mixins and hosted on GitHub: