promtail
Overview
Jsonnet source code is available at github.com/grafana/loki
Alerts
A complete list of pregenerated alerts is available here.
promtail_alerts
PromtailRequestsErrors
# Fires when, for 15m, more than 10% of requests in a
# (namespace, job, route, instance) group carry a status_code matching
# `5..|failed`. Rates are taken over a 1m window.
alert: PromtailRequestsErrors
expr: |
  100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
  /
  sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
  > 10
for: 15m
labels:
  severity: critical
annotations:
  summary: Promtail request error rate is high.
  description: |
    {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
PromtailRequestLatency
# Fires when the recorded 99th-percentile request duration stays above 1
# for 15m.
#
# The expression reads the recorded series
# job_status_code_namespace:promtail_request_duration_seconds:99quantile,
# which is aggregated by (le, job, status_code, namespace). Only job,
# status_code and namespace are therefore available as $labels; there is no
# route label on this series, so the description references the labels that
# exist instead of rendering an empty placeholder.
alert: PromtailRequestLatency
expr: |
  job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
  severity: critical
annotations:
  summary: Promtail request latency P99 is high.
  description: |
    {{ $labels.job }} (namespace {{ $labels.namespace }}, status code {{ $labels.status_code }}) is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
PromtailFileMissing
# Fires when, for 15m, a promtail_file_bytes_total series exists with no
# promtail_read_bytes_total series carrying the same label set
# (PromQL `unless` keeps only left-hand series without a match).
alert: PromtailFileMissing
expr: |
  promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
  severity: warning
annotations:
  summary: Promtail cannot find a file it should be tailing.
  description: |
    {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
Recording rules
A complete list of pregenerated recording rules is available here.
promtail_rules
job:promtail_request_duration_seconds:99quantile
# 0.99 quantile of the promtail request-duration histogram per job,
# computed from bucket rates over a 1m window.
record: job:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job))
job:promtail_request_duration_seconds:50quantile
# 0.50 quantile (median) of the promtail request-duration histogram per job,
# computed from bucket rates over a 1m window.
record: job:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job))
job:promtail_request_duration_seconds:avg
# Mean request duration per job: rate of the duration sum divided by the
# rate of the request count, both over a 1m window.
record: job:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job)
job:promtail_request_duration_seconds_bucket:sum_rate
# Per-second rate (1m window) of the histogram bucket counters, summed per
# (le, job).
record: job:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job)
job:promtail_request_duration_seconds_sum:sum_rate
# Per-second rate (1m window) of the duration-sum counter, summed per job.
record: job:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job)
job:promtail_request_duration_seconds_count:sum_rate
# Per-second rate (1m window) of the request-count counter, summed per job.
record: job:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job)
job_namespace:promtail_request_duration_seconds:99quantile
# 0.99 quantile of the promtail request-duration histogram per
# (job, namespace), computed from bucket rates over a 1m window.
record: job_namespace:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace))
job_namespace:promtail_request_duration_seconds:50quantile
# 0.50 quantile (median) of the promtail request-duration histogram per
# (job, namespace), computed from bucket rates over a 1m window.
record: job_namespace:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace))
job_namespace:promtail_request_duration_seconds:avg
# Mean request duration per (job, namespace): rate of the duration sum
# divided by the rate of the request count, both over a 1m window.
record: job_namespace:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, namespace)
job_namespace:promtail_request_duration_seconds_bucket:sum_rate
# Per-second rate (1m window) of the histogram bucket counters, summed per
# (le, job, namespace).
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace)
job_namespace:promtail_request_duration_seconds_sum:sum_rate
# Per-second rate (1m window) of the duration-sum counter, summed per
# (job, namespace).
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, namespace)
job_namespace:promtail_request_duration_seconds_count:sum_rate
# Per-second rate (1m window) of the request-count counter, summed per
# (job, namespace).
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, namespace)
job_status_code_namespace:promtail_request_duration_seconds:99quantile
# 0.99 quantile of the promtail request-duration histogram per
# (job, status_code, namespace), computed from bucket rates over a 1m window.
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace))
job_status_code_namespace:promtail_request_duration_seconds:50quantile
# 0.50 quantile (median) of the promtail request-duration histogram per
# (job, status_code, namespace), computed from bucket rates over a 1m window.
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace))
job_status_code_namespace:promtail_request_duration_seconds:avg
# Mean request duration per (job, status_code, namespace): rate of the
# duration sum divided by the rate of the request count, both over a 1m window.
record: job_status_code_namespace:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, status_code, namespace)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, status_code, namespace)
job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
# Per-second rate (1m window) of the histogram bucket counters, summed per
# (le, job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace)
job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
# Per-second rate (1m window) of the duration-sum counter, summed per
# (job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, status_code, namespace)
job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
# Per-second rate (1m window) of the request-count counter, summed per
# (job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, status_code, namespace)
Dashboards
The following dashboards are generated from mixins and hosted on GitHub: