loki
Overview
Jsonnet source code is available at github.com/grafana/loki
Alerts
The complete list of pregenerated alerts is available here.
loki_alerts
LokiRequestErrors
# Fires when more than 10% of requests for a route returned a 5xx status
# (2m rate), sustained for 15 minutes.
# The ratio is grouped by cluster in addition to namespace/job/route so that
# series from different clusters are not summed into a single error rate.
alert: LokiRequestErrors
annotations:
  message: |
    {{ $labels.cluster }} {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
  100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
  /
  sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
  > 10
for: 15m
labels:
  severity: critical
LokiRequestPanics
# Fires as soon as loki_panic_total has increased within the last 10 minutes
# (no `for:` clause, so there is no pending period).
# The value is an absolute increase of the counter, not a percentage, so the
# message reports it as a count. Grouping includes cluster so panics from
# different clusters are reported separately.
alert: LokiRequestPanics
annotations:
  message: |
    {{ $labels.cluster }} {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} panics over the last 10m.
expr: |
  sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
  severity: critical
LokiRequestLatency
# Fires when the recorded per-route 99th percentile request latency stays
# above 1 for 15 minutes.
# Routes whose name contains "tail" (case-insensitive) and the scheduler
# QuerierLoop route are excluded by the matcher -- presumably because they
# are long-lived streaming calls; confirm against the Loki mixin.
alert: LokiRequestLatency
expr: |
  cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
for: 15m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
LokiTooManyCompactorsRunning
# Warns when the sum of loki_boltdb_shipper_compactor_running within a
# namespace/cluster pair has been greater than 1 for 5 minutes.
alert: LokiTooManyCompactorsRunning
expr: |
  sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: 5m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
Recording rules
The complete list of pregenerated recording rules is available here.
loki_rules
cluster_job:loki_request_duration_seconds:99quantile
# 99th percentile request duration per (cluster, job), from 1m bucket rates.
record: cluster_job:loki_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job))
cluster_job:loki_request_duration_seconds:50quantile
# Median (50th percentile) request duration per (cluster, job), from 1m
# bucket rates.
record: cluster_job:loki_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job))
cluster_job:loki_request_duration_seconds:avg
# Average request duration per (cluster, job): rate of the duration sum
# divided by rate of the request count, both over 1m.
record: cluster_job:loki_request_duration_seconds:avg
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
  /
  sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
cluster_job:loki_request_duration_seconds_bucket:sum_rate
# 1m rate of the duration histogram buckets, summed per (le, cluster, job).
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job)
cluster_job:loki_request_duration_seconds_sum:sum_rate
# 1m rate of the duration sum, summed per (cluster, job).
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m]))
  by (cluster, job)
cluster_job:loki_request_duration_seconds_count:sum_rate
# 1m rate of the request count, summed per (cluster, job).
record: cluster_job:loki_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_count[1m]))
  by (cluster, job)
cluster_job_route:loki_request_duration_seconds:99quantile
# 99th percentile request duration per (cluster, job, route), from 1m
# bucket rates.
record: cluster_job_route:loki_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route))
cluster_job_route:loki_request_duration_seconds:50quantile
# Median (50th percentile) request duration per (cluster, job, route), from
# 1m bucket rates.
record: cluster_job_route:loki_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route))
cluster_job_route:loki_request_duration_seconds:avg
# Average request duration per (cluster, job, route): rate of the duration
# sum divided by rate of the request count, both over 1m.
record: cluster_job_route:loki_request_duration_seconds:avg
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
  /
  sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
# 1m rate of the duration histogram buckets, summed per
# (le, cluster, job, route).
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, job, route)
cluster_job_route:loki_request_duration_seconds_sum:sum_rate
# 1m rate of the duration sum, summed per (cluster, job, route).
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m]))
  by (cluster, job, route)
cluster_job_route:loki_request_duration_seconds_count:sum_rate
# 1m rate of the request count, summed per (cluster, job, route).
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_count[1m]))
  by (cluster, job, route)
cluster_namespace_job_route:loki_request_duration_seconds:99quantile
# 99th percentile request duration per (cluster, namespace, job, route),
# from 1m bucket rates.
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route))
cluster_namespace_job_route:loki_request_duration_seconds:50quantile
# Median (50th percentile) request duration per
# (cluster, namespace, job, route), from 1m bucket rates.
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route))
cluster_namespace_job_route:loki_request_duration_seconds:avg
# Average request duration per (cluster, namespace, job, route): rate of the
# duration sum divided by rate of the request count, both over 1m.
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m]))
  by (cluster, namespace, job, route)
  /
  sum(rate(loki_request_duration_seconds_count[1m]))
  by (cluster, namespace, job, route)
cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
# 1m rate of the duration histogram buckets, summed per
# (le, cluster, namespace, job, route).
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_bucket[1m]))
  by (le, cluster, namespace, job, route)
cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
# 1m rate of the duration sum, summed per (cluster, namespace, job, route).
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_sum[1m]))
  by (cluster, namespace, job, route)
cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
# 1m rate of the request count, summed per (cluster, namespace, job, route).
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(loki_request_duration_seconds_count[1m]))
  by (cluster, namespace, job, route)
Dashboards
The following dashboards are generated from mixins and hosted on GitHub: