promscale
Overview
Jsonnet source code is available at github.com/timescale/promscale
Alerts
The complete list of pregenerated alerts is available here.
promscale-general
PromscaleDown
# Fires when no scrape target whose job matches ".*promscale.*" reports up == 1.
# absent() only copies labels from equality matchers, so with the regex job
# matcher the resulting series has no `instance`/`job` labels; the description
# therefore does not reference `$labels`. Comparing against 1 also covers
# targets that are still discovered but failing their scrapes (up == 0).
alert: PromscaleDown
annotations:
  description: 'No healthy Promscale target (job matching ".*promscale.*") is being scraped.'
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDown.md
  summary: Promscale is down.
expr: absent(up{job=~".*promscale.*"} == 1)
labels:
  severity: critical
promscale-ingest
PromscaleIngestHighErrorRate
# Warning tier: the share of ingest requests whose code matches "5..", per
# (job, instance, namespace, type) over a 5m rate window, exceeds 5%.
alert: PromscaleIngestHighErrorRate
annotations:
  description: >-
    Promscale ingestion is having a {{ $value | humanizePercentage }} error
    rate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
  summary: High error rate in Promscale ingestion.
expr: |
  (
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_requests_total{code=~"5.."}[5m])
    )
  /
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_requests_total[5m])
    )
  ) > 0.05
labels:
  severity: warning
PromscaleIngestHighErrorRate
# Critical tier: the share of ingest requests whose code matches "5..", per
# (job, instance, namespace, type) over a 5m rate window, exceeds 10%.
alert: PromscaleIngestHighErrorRate
annotations:
  description: >-
    Promscale ingestion is having a {{ $value | humanizePercentage }} error
    rate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighErrorRate.md
  summary: High error rate in Promscale ingestion.
expr: |
  (
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_requests_total{code=~"5.."}[5m])
    )
  /
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_requests_total[5m])
    )
  ) > 0.1
labels:
  severity: critical
PromscaleIngestHighLatency
# Warning tier: the 90th-percentile ingest duration, per
# (job, instance, namespace, type), stays above 10 seconds for 5 minutes while
# there is ingest traffic.
# The `> 0` traffic guard is placed inside the outer parentheses so it applies
# to the rate sum (as in the query-latency rules) rather than to the quantile.
# humanizeDuration already renders the unit, so the description does not
# append "seconds".
alert: PromscaleIngestHighLatency
annotations:
  description: >-
    Slowest 10% of ingestion batch took more than
    {{ $value | humanizeDuration }} to ingest.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
  summary: Slow Promscale ingestion.
expr: |
  (
    histogram_quantile(
      0.90,
      sum by (job, instance, namespace, type, le) (
        rate(promscale_ingest_duration_seconds_bucket[5m])
      )
    ) > 10
  and
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_duration_seconds_bucket[5m])
    ) > 0
  )
for: 5m
labels:
  severity: warning
PromscaleIngestHighLatency
# Critical tier: the 90th-percentile ingest duration, per
# (job, instance, namespace, type), stays above 30 seconds for 5 minutes while
# there is ingest traffic.
# The `> 0` traffic guard is placed inside the outer parentheses so it applies
# to the rate sum (as in the query-latency rules) rather than to the quantile.
# humanizeDuration already renders the unit, so the description does not
# append "seconds".
alert: PromscaleIngestHighLatency
annotations:
  description: >-
    Slowest 10% of ingestion batch took more than
    {{ $value | humanizeDuration }} to ingest.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighLatency.md
  summary: Slow Promscale ingestion.
expr: |
  (
    histogram_quantile(
      0.90,
      sum by (job, instance, namespace, type, le) (
        rate(promscale_ingest_duration_seconds_bucket[5m])
      )
    ) > 30
  and
    sum by (job, instance, namespace, type) (
      rate(promscale_ingest_duration_seconds_bucket[5m])
    ) > 0
  )
for: 5m
labels:
  severity: critical
PromscaleIngestHighDataDuplication
# Fires when the per-(job, namespace) rate of duplicate samples, measured over
# a 15m window, stays above zero for 30 minutes.
alert: PromscaleIngestHighDataDuplication
annotations:
  description: >-
    More than {{ $value | humanize }} samples/sec are rejected as duplicates
    by promscale.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleIngestHighDataDuplication.md
  summary: Duplicate data being inserted.
expr: |
  sum by (job, namespace) (rate(promscale_ingest_duplicates_total{kind="sample"}[15m])) > 0
for: 30m
labels:
  severity: warning
promscale-query
PromscaleQueryHighErrorRate
# Warning tier for every handler except /api/v1/query_range: the share of
# non-canceled requests whose code matches "5..", per
# (job, instance, namespace, type, handler) over a 5m rate window, exceeds 5%.
alert: PromscaleQueryHighErrorRate
annotations:
  description: >-
    Evaluating queries via Promscale {{ $labels.handler }} endpoint has
    {{ $value | humanizePercentage }} error rate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  summary: High error rate in querying Promscale.
expr: |
  (
    sum by (job, instance, namespace, type, handler) (
      rate(promscale_query_requests_total{code=~"5..",handler!="/api/v1/query_range",err!="canceled"}[5m])
    )
  /
    sum by (job, instance, namespace, type, handler) (
      rate(promscale_query_requests_total{handler!="/api/v1/query_range",err!="canceled"}[5m])
    )
  ) > 0.05
labels:
  severity: warning
PromscaleQueryHighErrorRate
# Warning tier for the /api/v1/query_range handler only: the share of
# non-canceled requests whose code matches "5..", per
# (job, instance, namespace, type, handler) over a 5m rate window, exceeds 10%.
alert: PromscaleQueryHighErrorRate
annotations:
  description: >-
    Evaluating queries via Promscale {{ $labels.handler }} endpoint has
    {{ $value | humanizePercentage }} error rate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  summary: High error rate in querying Promscale.
expr: |
  (
    sum by (job, instance, namespace, type, handler) (
      rate(promscale_query_requests_total{code=~"5..",handler="/api/v1/query_range",err!="canceled"}[5m])
    )
  /
    sum by (job, instance, namespace, type, handler) (
      rate(promscale_query_requests_total{handler="/api/v1/query_range",err!="canceled"}[5m])
    )
  ) > 0.1
labels:
  severity: warning
PromscaleQueryHighErrorRate
# Critical tier across all handlers: the share of non-canceled query requests
# whose code matches "5..", per (job, instance, namespace, type) over a 5m rate
# window, exceeds 10%.
# The denominator carries the same err!="canceled" filter as the numerator (and
# as the per-handler warning rules) so both sides of the ratio count the same
# population of requests.
alert: PromscaleQueryHighErrorRate
annotations:
  description: >-
    Evaluating queries via Promscale had {{ $value | humanizePercentage }}
    error rate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighErrorRate.md
  summary: High error rate in querying Promscale.
expr: |
  (
    sum by (job, instance, namespace, type) (
      rate(promscale_query_requests_total{code=~"5..",err!="canceled"}[5m])
    )
  /
    sum by (job, instance, namespace, type) (
      rate(promscale_query_requests_total{err!="canceled"}[5m])
    )
  ) > 0.1
labels:
  severity: critical
PromscaleQueryHighLatency
# Warning tier: the 90th-percentile query duration, per
# (job, instance, namespace, type), stays above 5 seconds for 5 minutes while
# the query-duration rate is non-zero.
# humanizeDuration already renders the unit, so the description does not
# append "seconds".
alert: PromscaleQueryHighLatency
annotations:
  description: >-
    Slowest 10% of the queries took more than {{ $value | humanizeDuration }}
    to evaluate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
  summary: Slow Promscale querying.
expr: |
  (
    histogram_quantile(
      0.90,
      sum by (job, instance, namespace, type, le) (
        rate(promscale_query_duration_seconds_bucket[5m])
      )
    ) > 5
  and
    sum by (job, instance, namespace, type) (
      rate(promscale_query_duration_seconds_bucket[5m])
    ) > 0
  )
for: 5m
labels:
  severity: warning
PromscaleQueryHighLatency
# Critical tier: the 90th-percentile query duration, per
# (job, instance, namespace, type), stays above 10 seconds for 5 minutes while
# the query-duration rate is non-zero.
# humanizeDuration already renders the unit, so the description does not
# append "seconds".
alert: PromscaleQueryHighLatency
annotations:
  description: >-
    Slowest 10% of the queries took {{ $value | humanizeDuration }} to
    evaluate.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleQueryHighLatency.md
  summary: Slow Promscale querying.
expr: |
  (
    histogram_quantile(
      0.90,
      sum by (job, instance, namespace, type, le) (
        rate(promscale_query_duration_seconds_bucket[5m])
      )
    ) > 10
  and
    sum by (job, instance, namespace, type) (
      rate(promscale_query_duration_seconds_bucket[5m])
    ) > 0
  )
for: 5m
labels:
  severity: critical
promscale-cache
PromscaleCacheHighNumberOfEvictions
# Fires when a cache's eviction rate (5m window) divided by its capacity in
# elements exceeds 0.2.
# $value is that ratio, not an absolute entries-per-second figure, so the
# description reports it as a percentage of capacity.
alert: PromscaleCacheHighNumberOfEvictions
annotations:
  description: >-
    Promscale {{ $labels.name }} is evicting {{ $value | humanizePercentage }}
    of its capacity per second.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCacheHighNumberOfEvictions.md
  summary: High cache eviction in Promscale.
expr: |
  (
    rate(promscale_cache_evictions_total[5m])
  /
    promscale_cache_capacity_elements
  ) > 0.2
labels:
  severity: warning
promscale-database-connection
PromscaleStorageHighErrorRate
# Fires when database request errors divided by database requests, per
# (job, instance, namespace, method) over a 5m rate window, exceed 5%.
alert: PromscaleStorageHighErrorRate
annotations:
  description: >-
    Promscale connection with the database has an error rate of
    {{ $value | humanizePercentage }}.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleDBHighErrorRate.md
  summary: Promscale experiences a high error rate when connecting to the database.
expr: |
  (
    sum by (job, instance, namespace, method) (
      rate(promscale_database_request_errors_total[5m])
    )
  /
    sum by (job, instance, namespace, method) (
      rate(promscale_database_requests_total[5m])
    )
  ) > 0.05
labels:
  severity: warning
PromscaleStorageHighLatency
# Fires when the 90th-percentile database request duration, per
# (job, instance, namespace, method) and excluding method "exec", exceeds
# 5 seconds while the matching request-count rate is non-zero.
# humanizeDuration already renders the unit, so the description does not
# append "seconds".
alert: PromscaleStorageHighLatency
annotations:
  description: >-
    Slowest 10% of database requests with method {{ $labels.method }} are
    taking more than {{ $value | humanizeDuration }} to respond.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageHighLatency.md
  summary: Slow database response.
expr: |
  (
    histogram_quantile(0.9,
      sum by (job, instance, namespace, method, le) (
        rate(promscale_database_requests_duration_seconds_bucket{method!="exec"}[5m])
      )
    ) > 5
  and
    sum by (job, instance, namespace, method) (
      rate(promscale_database_requests_duration_seconds_count{method!="exec"}[5m])
    ) > 0
  )
labels:
  severity: warning
promscale-database
PromscaleStorageUnhealthy
# Fires when health-check errors divided by health checks, per
# (job, instance, namespace) over a 5m rate window, exceed 5%.
alert: PromscaleStorageUnhealthy
annotations:
  description: >-
    Promscale connection with the database has an error of
    {{ $value | humanizePercentage }}.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleStorageUnhealthy.md
  summary: Promscale database is unhealthy.
expr: |
  (
    sum by (job, instance, namespace) (
      rate(promscale_sql_database_health_check_errors_total[5m])
    )
  /
    sum by (job, instance, namespace) (
      rate(promscale_sql_database_health_check_total[5m])
    )
  ) > 0.05
labels:
  severity: warning
PromscaleMaintenanceJobRunningTooLong
# Fires when the time elapsed since the maintenance job's start timestamp
# exceeds 30 * 60 * 2 seconds, ignoring series whose start timestamp is 0.
# $value is that elapsed time, so the description reports how long the job has
# been running. humanizeDuration already renders the unit, so no "seconds"
# suffix is appended.
alert: PromscaleMaintenanceJobRunningTooLong
annotations:
  description: >-
    Promscale maintenance job has been running for
    {{ $value | humanizeDuration }}.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
  summary: Promscale maintenance jobs taking too long to complete.
expr: |
  (
    (
      (
        time()
        -
        promscale_sql_database_worker_maintenance_job_start_timestamp_seconds
      )
      >
      30 * 60 * 2 # 30 mins (we launch maintenance jobs scheduled at 30 mins) * 60 (to seconds) * 2 (wait max for 2 complete scans before firing alert).
    )
  and
    promscale_sql_database_worker_maintenance_job_start_timestamp_seconds > 0
  )
labels:
  severity: warning
PromscaleMaintenanceJobNotKeepingup
# Fires when any of four chunk backlogs (metrics/traces x uncompressed/expired)
# has had a 1h minimum above promscale_sql_database_metric_count and has also
# grown over the last 10 minutes.
alert: PromscaleMaintenanceJobNotKeepingup
annotations:
  description: >-
    The amount of work for the promscale maintenance {{ $labels.name }}
    job is not decreasing for long time.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobRunningTooLong.md
  summary: Promscale maintenance jobs are not keeping up.
expr: |
  (
    (
      min_over_time(promscale_sql_database_chunks_metrics_uncompressed_count[1h]) > promscale_sql_database_metric_count
    )
    and
    (
      delta(promscale_sql_database_chunks_metrics_uncompressed_count[10m]) > 0
    )
  )
  or
  (
    (
      min_over_time(promscale_sql_database_chunks_metrics_expired_count[1h]) > promscale_sql_database_metric_count
    )
    and
    (
      delta(promscale_sql_database_chunks_metrics_expired_count[10m]) > 0
    )
  )
  or
  (
    (
      min_over_time(promscale_sql_database_chunks_traces_uncompressed_count[1h]) > promscale_sql_database_metric_count
    )
    and
    (
      delta(promscale_sql_database_chunks_traces_uncompressed_count[10m]) > 0
    )
  )
  or
  (
    (
      min_over_time(promscale_sql_database_chunks_traces_expired_count[1h]) > promscale_sql_database_metric_count
    )
    and
    (
      delta(promscale_sql_database_chunks_traces_expired_count[10m]) > 0
    )
  )
labels:
  severity: warning
PromscaleMaintenanceJobFailures
# Fires while the maintenance-job failure gauge equals 1.
alert: PromscaleMaintenanceJobFailures
annotations:
  description: >-
    Maintenance job for Promscale instance {{ $labels.instance }} failed
    to successfully execute.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleMaintenanceJobFailures.md
  summary: Promscale maintenance job failed.
expr: promscale_sql_database_worker_maintenance_job_failed == 1
labels:
  severity: warning
PromscaleCompressionLow
# Fires when uncompressed chunks (total minus compressed) divided by the metric
# count exceed 4 and the compression status gauge equals 1.
alert: PromscaleCompressionLow
annotations:
  description: >-
    High uncompressed data in Promscale, on average, {{ $value }} uncompressed
    chunks per metric.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscaleCompressionLow.md
  summary: High uncompressed data.
expr: |
  (
    (
      (promscale_sql_database_chunks_count - promscale_sql_database_chunks_compressed_count) # Number of uncompressed chunks.
      /
      promscale_sql_database_metric_count
    ) > 4 # If total number of average uncompressed chunk per metric is more than 4 chunks at maximum, we should alert.
  and
    promscale_sql_database_compression_status == 1
  )
labels:
  severity: warning
PromscalePostgreSQLSharedBuffersLow
# Fires when open-chunk table size plus index size, divided by the
# shared_buffers size, stays above 1 for 10 minutes.
alert: PromscalePostgreSQLSharedBuffersLow
annotations:
  description: >-
    Currently open chunks are {{ $value | humanizePercentage }} of PostgreSQL
    shared_buffers. This will impact database performance.
  runbook_url: https://github.com/timescale/promscale/blob/master/docs/runbooks/PromscalePostgreSQLSharedBuffersLow.md
  summary: Promscale database performance will be affected.
expr: |
  (
    ((promscale_sql_database_open_chunks_total_table_size + promscale_sql_database_open_chunks_total_index_size)
    /
    promscale_sql_database_shared_buffers_size)
    > 1 )
for: 10m
labels:
  severity: warning
Dashboards
The following dashboards are generated from mixins and hosted on GitHub: