promtail

Overview

Jsonnet source code is available at github.com/grafana/loki

Alerts

A complete list of pregenerated alerts is available here.

promtail_alerts

PromtailRequestsErrors

# Fires when, per (namespace, job, route, instance), requests whose
# status_code matches "5..|failed" exceed 10% of all requests (1m rate)
# and the condition holds for 15m.
alert: PromtailRequestsErrors
expr: |
  100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
    /
  sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
    > 10
for: 15m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.

PromtailRequestLatency

# Fires when the recorded 99th-percentile request duration stays above 1
# for 15m. The queried recording rule aggregates by
# (le, job, status_code, namespace), so only job, status_code and namespace
# are available as labels in the message template; there is no `route`
# label on this series.
alert: PromtailRequestLatency
annotations:
  message: |
    {{ $labels.job }} in {{ $labels.namespace }} (status code {{ $labels.status_code }}) is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
  job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
  severity: critical

PromtailFileLagging

# Fires when the absolute difference between promtail_file_bytes_total and
# promtail_read_bytes_total stays above 1e6 for 15m.
alert: PromtailFileLagging
expr: |
  abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6
for: 15m
labels:
  severity: warning
annotations:
  message: |
    {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m.

PromtailFileMissing

# Fires when a promtail_file_bytes_total series has had no matching
# promtail_read_bytes_total series for 15m.
alert: PromtailFileMissing
expr: |
  promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
  severity: critical
annotations:
  message: |
    {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.

Recording rules

A complete list of pregenerated recording rules is available here.

promtail_rules

job:promtail_request_duration_seconds:99quantile

# 0.99 quantile of request duration per job, from 1m bucket rates.
record: job:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job))

job:promtail_request_duration_seconds:50quantile

# 0.50 quantile of request duration per job, from 1m bucket rates.
record: job:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job))

job:promtail_request_duration_seconds:avg

# Average request duration per job: rate of the duration sum divided by the
# rate of the request count, both over 1m.
record: job:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job)

job:promtail_request_duration_seconds_bucket:sum_rate

# 1m rate of histogram buckets, summed by (le, job).
record: job:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job)

job:promtail_request_duration_seconds_sum:sum_rate

# 1m rate of the duration sum, summed by job.
record: job:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job)

job:promtail_request_duration_seconds_count:sum_rate

# 1m rate of the request count, summed by job.
record: job:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job)

job_namespace:promtail_request_duration_seconds:99quantile

# 0.99 quantile of request duration per (job, namespace), from 1m bucket
# rates.
record: job_namespace:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace))

job_namespace:promtail_request_duration_seconds:50quantile

# 0.50 quantile of request duration per (job, namespace), from 1m bucket
# rates.
record: job_namespace:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace))

job_namespace:promtail_request_duration_seconds:avg

# Average request duration per (job, namespace): rate of the duration sum
# divided by the rate of the request count, both over 1m.
record: job_namespace:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, namespace)

job_namespace:promtail_request_duration_seconds_bucket:sum_rate

# 1m rate of histogram buckets, summed by (le, job, namespace).
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, namespace)

job_namespace:promtail_request_duration_seconds_sum:sum_rate

# 1m rate of the duration sum, summed by (job, namespace).
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, namespace)

job_namespace:promtail_request_duration_seconds_count:sum_rate

# 1m rate of the request count, summed by (job, namespace).
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, namespace)

job_status_code_namespace:promtail_request_duration_seconds:99quantile

# 0.99 quantile of request duration per (job, status_code, namespace), from
# 1m bucket rates.
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
expr: >-
  histogram_quantile(0.99,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace))

job_status_code_namespace:promtail_request_duration_seconds:50quantile

# 0.50 quantile of request duration per (job, status_code, namespace), from
# 1m bucket rates.
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
expr: >-
  histogram_quantile(0.50,
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace))

job_status_code_namespace:promtail_request_duration_seconds:avg

# Average request duration per (job, status_code, namespace): rate of the
# duration sum divided by the rate of the request count, both over 1m.
record: job_status_code_namespace:promtail_request_duration_seconds:avg
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, status_code, namespace)
  /
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, status_code, namespace)

job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate

# 1m rate of histogram buckets, summed by (le, job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_bucket[1m]))
  by (le, job, status_code, namespace)

job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate

# 1m rate of the duration sum, summed by (job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_sum[1m]))
  by (job, status_code, namespace)

job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate

# 1m rate of the request count, summed by (job, status_code, namespace).
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
expr: >-
  sum(rate(promtail_request_duration_seconds_count[1m]))
  by (job, status_code, namespace)

Dashboards

The following dashboards are generated from mixins and hosted on GitHub: