kubernetes-autoscaling


Overview

Jsonnet source code is available at github.com/adinhodovic/kubernetes-autoscaling-mixin

Alerts

The complete list of pregenerated alerts is available here.

karpenter

KarpenterCloudProviderErrors

# Warns when karpenter_cloudprovider_errors_total increased over the last 5m
# for any (cluster, namespace, job, provider, controller, method) group.
# Series from the nodeclaim/node termination controllers and series whose
# error label is NodeClaimNotFoundError are filtered out by the selector.
alert: KarpenterCloudProviderErrors
expr: |
  sum(
    increase(
      karpenter_cloudprovider_errors_total{
        job="karpenter",
        controller!~"nodeclaim.termination|node.termination",
        error!="NodeClaimNotFoundError"
      }[5m]
    )
  ) by (cluster, namespace, job, provider, controller, method) > 0
# The expression must stay true for 5m before the alert fires.
for: 5m
labels:
  severity: warning
annotations:
  summary: Karpenter has Cloud Provider Errors.
  description: >-
    The Karpenter provider {{ $labels.provider }} with the controller
    {{ $labels.controller }} has errors with the method {{ $labels.method }}.
  dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance'

KarpenterNodeClaimsTerminationDurationHigh

# Warns when the average node claim termination duration per nodepool exceeds
# 1200 seconds (20 minutes).
alert: KarpenterNodeClaimsTerminationDurationHigh
annotations:
  dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity
  description: The average node claim termination duration in Karpenter has exceeded
    20 minutes for more than 15 minutes in nodepool {{ $labels.nodepool }}. This may
    indicate cloud provider issues or improper instance termination handling.
  summary: Karpenter Node Claims Termination Duration is High.
# The _sum and _count series are cumulative, so dividing their raw values
# yields the average since the process started: recent slow terminations get
# diluted by history, and a high value never clears while no new terminations
# happen. Taking rate() of both sides gives the average over the window only.
# The 1h window is deliberately longer than `for: 15m` so that sparse
# termination events stay in view long enough for the alert to fire.
# When nothing terminated in the window the ratio is NaN, which does not
# satisfy the comparison, so the alert stays silent.
expr: |
  sum(
    rate(
      karpenter_nodeclaims_termination_duration_seconds_sum{
        job="karpenter"
      }[1h]
    )
  ) by (cluster, namespace, job, nodepool)
  /
  sum(
    rate(
      karpenter_nodeclaims_termination_duration_seconds_count{
        job="karpenter"
      }[1h]
    )
  ) by (cluster, namespace, job, nodepool) > 1200
# The expression must stay true for 15m before the alert fires.
for: 15m
labels:
  severity: warning

KarpenterNodepoolNearCapacity

# Warns when karpenter_nodepools_usage, expressed as a percentage of
# karpenter_nodepools_limit, is above 75 for a given
# (cluster, namespace, job, nodepool, resource_type) group.
alert: KarpenterNodepoolNearCapacity
expr: |
  sum (
    karpenter_nodepools_usage{job="karpenter"}
  ) by (cluster, namespace, job, nodepool, resource_type)
  /
  sum (
    karpenter_nodepools_limit{job="karpenter"}
  ) by (cluster, namespace, job, nodepool, resource_type)
  * 100 > 75
# The expression must stay true for 15m before the alert fires.
for: 15m
labels:
  severity: warning
annotations:
  summary: Karpenter Nodepool near capacity.
  description: >-
    The resource {{ $labels.resource_type }} in the Karpenter node pool
    {{ $labels.nodepool }} is nearing its limit.
    Consider scaling or adding resources.
  dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview'

cluster-autoscaler

ClusterAutoscalerNodeCountNearCapacity

# Warns when cluster_autoscaler_nodes_count, expressed as a percentage of
# cluster_autoscaler_max_nodes_count, is above 75 for a given
# (cluster, namespace, job) group.
alert: ClusterAutoscalerNodeCountNearCapacity
expr: |
  sum (
    cluster_autoscaler_nodes_count{
      job="cluster-autoscaler"
    }
  ) by (cluster, namespace, job)
  /
  sum (
    cluster_autoscaler_max_nodes_count{
      job="cluster-autoscaler"
    }
  ) by (cluster, namespace, job)
  * 100 > 75
# The expression must stay true for 15m before the alert fires.
for: 15m
labels:
  severity: warning
annotations:
  summary: Cluster Autoscaler Node Count near Capacity.
  description: >-
    The node count for the cluster autoscaler job {{ $labels.job }}
    is reaching max limit. Consider scaling node groups.
  dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'

ClusterAutoscalerUnschedulablePods

# Warns when the summed cluster_autoscaler_unschedulable_pods_count for a
# (cluster, namespace, job) group is above zero.
alert: ClusterAutoscalerUnschedulablePods
expr: |
  sum (
    cluster_autoscaler_unschedulable_pods_count{
      job="cluster-autoscaler"
    }
  ) by (cluster, namespace, job)
  > 0
# The expression must stay true for 15m before the alert fires.
for: 15m
labels:
  severity: warning
annotations:
  summary: Pods Pending Scheduling - Cluster Node Group Scaling Required
  description: >-
    The cluster currently has unschedulable pods, indicating resource
    shortages. Consider adding more nodes or increasing node group
    capacity.
  dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'

keda

KedaScaledJobErrors

# Warns when keda_scaled_job_errors_total increased over the last 10m.
#
# NOTE(review): KEDA appears to label this metric with `scaledJob` rather than
# `scaledObject` - confirm against the KEDA version in use. Grouping only by
# `scaledObject` would then collapse every job into one series and leave the
# job name empty in the description and dashboard link. The rule therefore
# groups by both labels and the templates prefer `scaledJob`, falling back to
# `scaledObject`, so it works whichever label the metric exposes.
alert: KedaScaledJobErrors
annotations:
  dashboard_url: https://grafana.com/d/kubernetes-autoscaling-mixin-kedasj-jkwq/kubernetes-autoscaling-keda-scaled-job?var-scaled_job={{
    or $labels.scaledJob $labels.scaledObject }}&var-resource_namespace={{
    $labels.exported_namespace }}
  description: KEDA scaled jobs are experiencing errors. Check the scaled job {{
    or $labels.scaledJob $labels.scaledObject }} in the namespace {{
    $labels.exported_namespace }}.
  summary: Errors detected for KEDA scaled jobs.
expr: |
  sum(
    increase(
      keda_scaled_job_errors_total{
        job="keda-operator"
      }[10m]
    )
  ) by (cluster, job, exported_namespace, scaledObject, scaledJob) > 0
# The expression must stay true for 1m before the alert fires.
for: 1m
labels:
  severity: warning

KedaScaledObjectErrors

# Warns when keda_scaled_object_errors_total increased over the last 10m for
# a (cluster, job, exported_namespace, scaledObject) group.
alert: KedaScaledObjectErrors
expr: |
  sum(
    increase(
      keda_scaled_object_errors_total{
        job="keda-operator"
      }[10m]
    )
  ) by (cluster, job, exported_namespace, scaledObject) > 0
# The expression must stay true for 1m before the alert fires.
for: 1m
labels:
  severity: warning
annotations:
  summary: Errors detected for KEDA scaled objects.
  description: >-
    KEDA scaled objects are experiencing errors.
    Check the scaled object {{ $labels.scaledObject }}
    in the namespace {{ $labels.exported_namespace }}.
  dashboard_url: >-
    https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{
    $labels.scaledObject }}&var-resource_namespace={{
    $labels.exported_namespace }}

KedaScalerLatencyHigh

# Warns when the average keda_scaler_metrics_latency_seconds for a
# (cluster, job, exported_namespace, scaledObject, scaler) group is above 5.
alert: KedaScalerLatencyHigh
expr: |
  avg(
    keda_scaler_metrics_latency_seconds{
      job="keda-operator"
    }
  ) by (cluster, job, exported_namespace, scaledObject, scaler) > 5
# The expression must stay true for 10m before the alert fires.
for: 10m
labels:
  severity: warning
annotations:
  summary: High latency for KEDA scaler metrics.
  description: >-
    Metric latency for scaler {{ $labels.scaler }}
    for the object {{ $labels.scaledObject }}
    has exceeded acceptable limits.
  dashboard_url: >-
    https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{
    $labels.scaledObject }}&var-scaler={{
    $labels.scaler }}

KedaScaledObjectPaused

# Warns when keda_scaled_object_paused is above zero for a
# (cluster, job, exported_namespace, scaledObject) group.
alert: KedaScaledObjectPaused
expr: |
  max(
    keda_scaled_object_paused{
      job="keda-operator"
    }
  ) by (cluster, job, exported_namespace, scaledObject) > 0
# The expression must stay true for 25h before the alert fires, which matches
# the "longer than 25h" wording in the description.
for: 25h
labels:
  severity: warning
annotations:
  summary: KEDA scaled object is paused.
  description: >-
    The scaled object {{ $labels.scaledObject }}
    in namespace {{ $labels.exported_namespace }}
    is paused for longer than 25h.
    This may indicate a configuration issue or manual intervention.
  dashboard_url: >-
    https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{
    $labels.scaledObject }}&var-resource_namespace={{
    $labels.exported_namespace }}

KedaScalerDetailErrors

# Warns when keda_scaler_detail_errors_total increased over the last 10m for
# a (cluster, job, exported_namespace, scaledObject, type, scaler) group.
alert: KedaScalerDetailErrors
expr: |
  sum(
    increase(
      keda_scaler_detail_errors_total{
        job="keda-operator"
      }[10m]
    )
  ) by (cluster, job, exported_namespace, scaledObject, type, scaler) > 0
# The expression must stay true for 1m before the alert fires.
for: 1m
labels:
  severity: warning
annotations:
  summary: Errors detected in KEDA scaler.
  description: >-
    Errors have occurred in the KEDA scaler {{ $labels.scaler }}.
    Investigate the scaler for the {{ $labels.type }}
    {{ $labels.scaledObject }} in namespace
    {{ $labels.exported_namespace }}.
  dashboard_url: >-
    https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaler={{
    $labels.scaler }}&var-scaled_object={{
    $labels.scaledObject }}

Dashboards

The following dashboards are generated from mixins and hosted on GitHub: