opensearch

Overview

The Jsonnet source code is available at github.com/grafana/jsonnet-libs.

Alerts

A complete list of pregenerated alerts is available here.

opensearch-alerts

OpenSearchYellowCluster

# Warning rule: selects opensearch_cluster_status series that carry a
# non-empty opensearch_cluster label and whose value equals 1, and fires
# once that has held for 5 minutes.
alert: OpenSearchYellowCluster
expr: |
  opensearch_cluster_status{opensearch_cluster!=""} == 1
for: 5m
labels:
  severity: warning
annotations:
  summary: At least one of the clusters is reporting a yellow status.
  description: "{{$labels.cluster}} health status is yellow over the last 5 minutes"

OpenSearchRedCluster

# Critical rule: selects opensearch_cluster_status series that carry a
# non-empty opensearch_cluster label and whose value equals 2, and fires
# once that has held for 5 minutes.
alert: OpenSearchRedCluster
expr: |
  opensearch_cluster_status{opensearch_cluster!=""} == 2
for: 5m
labels:
  severity: critical
annotations:
  summary: At least one of the clusters is reporting a red status.
  description: "{{$labels.cluster}} health status is red over the last 5 minutes"

OpenSearchUnstableShardReallocation

# Warning rule: sums opensearch_cluster_shards_number for type="relocating"
# (dropping the type label) and fires when the total stays above 0 for
# 1 minute.
alert: OpenSearchUnstableShardReallocation
expr: |
  sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="relocating"}) > 0
for: 1m
labels:
  severity: warning
annotations:
  summary: A node has gone offline or has been disconnected triggering shard reallocation.
  description: |
    {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard reallocation over the last 1m which is above the threshold of 0.

OpenSearchUnstableShardUnassigned

# Warning rule: sums opensearch_cluster_shards_number for type="unassigned"
# (dropping the type label) and fires when the total stays above 0 for
# 5 minutes.
alert: OpenSearchUnstableShardUnassigned
expr: |
  sum without(type) (opensearch_cluster_shards_number{opensearch_cluster!="", type="unassigned"}) > 0
for: 5m
labels:
  severity: warning
annotations:
  summary: There are shards that have been detected as unassigned.
  description: |
    {{$labels.cluster}} has had {{ printf "%.0f" $value }} shard unassigned over the last 5m which is above the threshold of 0.

OpenSearchHighNodeDiskUsage

# Warning rule: computes (total - free) / total from the
# opensearch_fs_path_* byte gauges, scales it by 100, aggregates away the
# nodeid, path, mount and type labels, and fires when the result stays
# above 60 for 5 minutes.
# NOTE(review): the aggregation sums per-series ratios; if a node exposes
# more than one path series the sum can exceed 100 - confirm this is intended.
alert: OpenSearchHighNodeDiskUsage
annotations:
  # The value is a percentage (the expression multiplies by 100), so it is
  # rendered with a trailing % sign.
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 60.
  summary: The node disk usage has exceeded the warning threshold.
expr: |
  100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes{opensearch_cluster!=""}) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 60
for: 5m
labels:
  severity: warning

OpenSearchHighNodeDiskUsage

# Critical rule: computes (total - free) / total from the
# opensearch_fs_path_* byte gauges, scales it by 100, aggregates away the
# nodeid, path, mount and type labels, and fires when the result stays
# above 80 for 5 minutes.
# NOTE(review): the aggregation sums per-series ratios; if a node exposes
# more than one path series the sum can exceed 100 - confirm this is intended.
alert: OpenSearchHighNodeDiskUsage
annotations:
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% disk usage over the last 5m which is above the threshold of 80.
  summary: The node disk usage has exceeded the critical threshold.
# Every operand carries the same opensearch_cluster!="" selector so all three
# series sets are filtered identically.
expr: |
  100 * sum without(nodeid, path, mount, type) ((opensearch_fs_path_total_bytes{opensearch_cluster!=""} - opensearch_fs_path_free_bytes{opensearch_cluster!=""}) / opensearch_fs_path_total_bytes{opensearch_cluster!=""}) > 80
for: 5m
labels:
  severity: critical

OpenSearchHighNodeCpuUsage

# Warning rule: aggregates opensearch_os_cpu_percent without the nodeid
# label and fires when the result stays above 70 for 5 minutes.
alert: OpenSearchHighNodeCpuUsage
expr: |
  sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 70
for: 5m
labels:
  severity: warning
annotations:
  summary: The node CPU usage has exceeded the warning threshold.
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 70.

OpenSearchHighNodeCpuUsage

# Critical rule: aggregates opensearch_os_cpu_percent without the nodeid
# label and fires when the result stays above 85 for 5 minutes.
alert: OpenSearchHighNodeCpuUsage
expr: |
  sum without(nodeid) (opensearch_os_cpu_percent{opensearch_cluster!=""}) > 85
for: 5m
labels:
  severity: critical
annotations:
  summary: The node CPU usage has exceeded the critical threshold.
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% CPU usage over the last 5m which is above the threshold of 85.

OpenSearchHighNodeMemoryUsage

# Warning rule: aggregates opensearch_os_mem_used_percent without the nodeid
# label and fires when the result stays above 70 for 5 minutes.
alert: OpenSearchHighNodeMemoryUsage
expr: |
  sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 70
for: 5m
labels:
  severity: warning
annotations:
  summary: The node memory usage has exceeded the warning threshold.
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 70.

OpenSearchHighNodeMemoryUsage

# Critical rule: aggregates opensearch_os_mem_used_percent without the nodeid
# label and fires when the result stays above 85 for 5 minutes.
alert: OpenSearchHighNodeMemoryUsage
expr: |
  sum without(nodeid) (opensearch_os_mem_used_percent{opensearch_cluster!=""}) > 85
for: 5m
labels:
  severity: critical
annotations:
  summary: The node memory usage has exceeded the critical threshold.
  description: |
    {{$labels.node}} has had {{ printf "%.0f" $value }}% memory usage over the last 5m which is above the threshold of 85.

OpenSearchModerateRequestLatency

# Warning rule: divides the 5m increase of the summed search fetch, query and
# scroll time counters (context="total") by the 5m increase of the matching
# count counters, and fires when that ratio stays above 0.5 for 5 minutes.
# clamp_min(..., 1) keeps the divisor at 1 or more, so a window with no
# requests does not divide by zero.
alert: OpenSearchModerateRequestLatency
annotations:
  # Two decimals are printed because the threshold is 0.5: with "%.0f" a
  # firing value such as 0.7 would be displayed as "1".
  description: |
    {{$labels.index}} has had {{ printf "%.2f" $value }}s of request latency over the last 5m which is above the threshold of 0.5.
  summary: The request latency has exceeded the warning threshold.
expr: |
  sum without(context) ((increase(opensearch_index_search_fetch_time_seconds{opensearch_cluster!="", context="total"}[5m])+increase(opensearch_index_search_query_time_seconds{context="total"}[5m])+increase(opensearch_index_search_scroll_time_seconds{context="total"}[5m])) / clamp_min(increase(opensearch_index_search_fetch_count{context="total"}[5m])+increase(opensearch_index_search_query_count{context="total"}[5m])+increase(opensearch_index_search_scroll_count{context="total"}[5m]), 1)) > 0.5
for: 5m
labels:
  severity: warning

OpenSearchModerateIndexLatency

# Warning rule: divides the 5m increase of
# opensearch_index_indexing_index_time_seconds by the 5m increase of
# opensearch_index_indexing_index_count (both context="total") and fires when
# that ratio stays above 0.5 for 5 minutes. clamp_min(..., 1) keeps the
# divisor at 1 or more, so a window with no indexing does not divide by zero.
alert: OpenSearchModerateIndexLatency
annotations:
  # Two decimals are printed because the threshold is 0.5: with "%.0f" a
  # firing value such as 0.7 would be displayed as "1".
  description: |
    {{$labels.index}} has had {{ printf "%.2f" $value }}s of index latency over the last 5m which is above the threshold of 0.5.
  summary: The index latency has exceeded the warning threshold.
expr: |
  sum without(context) (increase(opensearch_index_indexing_index_time_seconds{opensearch_cluster!="", context="total"}[5m]) / clamp_min(increase(opensearch_index_indexing_index_count{context="total"}[5m]), 1)) > 0.5
for: 5m
labels:
  severity: warning

Dashboards

The following dashboards are generated from mixins and hosted on GitHub: