jaeger
Overview
The Jsonnet source code is available at github.com/grafana/jsonnet-libs
Alerts
The complete list of pregenerated alerts is available here.
jaeger_alerts
JaegerAgentUDPPacketsBeingDropped
# Warns when the 1m rate of jaeger_agent_thrift_udp_server_packets_dropped_total
# stays above 1 per second for 15 minutes.
alert: JaegerAgentUDPPacketsBeingDropped
expr: >-
  rate(jaeger_agent_thrift_udp_server_packets_dropped_total[1m]) > 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }} UDP packets per second.
JaegerAgentHTTPServerErrs
# Warns when, per (instance, job, namespace), HTTP server errors exceed 1% of
# all HTTP server requests (1m rates) for 15 minutes.
alert: JaegerAgentHTTPServerErrs
expr: >-
  100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% HTTP errors.
JaegerClientSpansDropped
# Warns when, per (instance, job, namespace), reporter spans with result
# "dropped" or "err" exceed 1% of all reporter spans (1m rates) for 15 minutes.
alert: JaegerClientSpansDropped
expr: >-
  100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
JaegerAgentSpansDropped
# Warns when, per (instance, job, namespace), failed reporter batches exceed 1%
# of submitted reporter batches (1m rates) for 15 minutes.
alert: JaegerAgentSpansDropped
expr: >-
  100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
JaegerCollectorQueueNotDraining
# Warns when the 10m average of jaeger_collector_queue_length stays above 1000
# for 15 minutes.
alert: JaegerCollectorQueueNotDraining
expr: >-
  avg_over_time(jaeger_collector_queue_length[10m]) > 1000
for: 15m
labels:
  severity: warning
annotations:
  message: |
    collector {{ $labels.job }} {{ $labels.instance }} is not able to drain the queue.
JaegerCollectorDroppingSpans
# Warns when, per (instance, job, namespace), dropped collector spans exceed 1%
# of received collector spans (1m rates) for 15 minutes.
alert: JaegerCollectorDroppingSpans
expr: >-
  100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
JaegerSamplingUpdateFailing
# Warns when, per (instance, job, namespace), sampler queries with result "err"
# exceed 1% of all sampler queries (1m rates) for 15 minutes.
alert: JaegerSamplingUpdateFailing
expr: >-
  100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
JaegerCollectorPersistenceSlow
# Warns when the 99th-percentile of jaeger_collector_save_latency (1m rate of
# the histogram buckets) stays above 0.5 for 15 minutes.
#
# The buckets are grouped by instance, job and namespace in addition to `le`:
# aggregating by `le` alone removes the job/instance labels, which makes the
# {{ $labels.job }} and {{ $labels.instance }} placeholders in the message
# render as empty strings. Grouping this way also matches the other rules in
# this group, which all evaluate per (instance, job, namespace).
alert: JaegerCollectorPersistenceSlow
expr: >-
  histogram_quantile(0.99,
  sum by (le, instance, job, namespace) (rate(jaeger_collector_save_latency_bucket[1m])))
  > 0.5
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is slow at persisting spans.
JaegerThrottlingUpdateFailing
# Warns when, per (instance, job, namespace), throttler updates with result
# "err" exceed 1% of all throttler updates (1m rates) for 15 minutes.
alert: JaegerThrottlingUpdateFailing
expr: >-
  100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
JaegerQueryReqsFailing
# Warns when, per (instance, job, namespace), query requests with result "err"
# exceed 1% of all query requests (1m rates) for 15 minutes.
#
# The expression aggregates away every label except instance, job and
# namespace, so an `operation` label is never present on the resulting series;
# the message therefore does not reference {{ $labels.operation }}.
alert: JaegerQueryReqsFailing
expr: >-
  100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors.
JaegerCassandraWritesFailing
# Warns when, per (instance, job, namespace), jaeger_cassandra_errors_total
# exceeds 1% of jaeger_cassandra_attempts_total (1m rates) for 15 minutes.
#
# The message names Cassandra write errors (per the alert name) and does not
# reference {{ $labels.operation }}: the expression keeps only instance, job
# and namespace, so that placeholder would always render empty.
alert: JaegerCassandraWritesFailing
expr: >-
  100 * sum(rate(jaeger_cassandra_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_cassandra_attempts_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% Cassandra write errors.
JaegerCassandraReadsFailing
# Warns when, per (instance, job, namespace), jaeger_cassandra_read_errors_total
# exceeds 1% of jaeger_cassandra_read_attempts_total (1m rates) for 15 minutes.
#
# The message names Cassandra read errors (per the alert name) and does not
# reference {{ $labels.operation }}: the expression keeps only instance, job
# and namespace, so that placeholder would always render empty.
alert: JaegerCassandraReadsFailing
expr: >-
  100 * sum(rate(jaeger_cassandra_read_errors_total[1m])) by (instance, job, namespace)
  / sum(rate(jaeger_cassandra_read_attempts_total[1m])) by (instance, job, namespace)> 1
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% Cassandra read errors.
Dashboards
The following dashboards are generated from mixins and hosted on GitHub: