⚠️ Caution ⚠️
Alert thresholds depend on the nature of your applications.
Some queries on this page use arbitrary tolerance thresholds; tune them for your environment.
Building an efficient and battle-tested monitoring platform takes time. 😉
#
1.1.
Prometheus self-monitoring
(28 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
#
1.1.1.
Prometheus job missing
A Prometheus job has disappeared
[copy]
# Fires when no `up` series exists at all for job="prometheus".
# absent() returns a series carrying only the equality-matched labels (here
# `job`), so the summary reports the job rather than an always-empty instance.
- alert: PrometheusJobMissing
  expr: 'absent(up{job="prometheus"})'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus job missing (job {{ $labels.job }})
    description: " A Prometheus job has disappeared \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.2.
Prometheus target missing
A Prometheus target has disappeared. An exporter might be crashed.
[copy]
# Fires for every scrape target whose `up` series currently reports 0.
- alert: PrometheusTargetMissing
  expr: 'up == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus target missing (instance {{ $labels.instance }})
    description: " A Prometheus target has disappeared. An exporter might be crashed. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.3.
Prometheus all targets missing
A Prometheus job does not have living target anymore.
[copy]
# Fires when the sum of `up` across a job is 0, i.e. every target of that job
# is down. The aggregation keeps only the `job` label, so the summary names
# the job (the `instance` label no longer exists on the result).
- alert: PrometheusAllTargetsMissing
  expr: 'sum by (job) (up) == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus all targets missing (job {{ $labels.job }})
    description: " A Prometheus job does not have living target anymore. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.4.
Prometheus target missing with warmup time
Allow a job time to start up (10 minutes) before alerting that it's down.
[copy]
# Down targets (`up == 0`) are joined on `instance` with hosts whose
# node_time_seconds - node_boot_time_seconds exceeds 600 seconds, so a target
# on a freshly booted host is not reported during its first 10 minutes.
- alert: PrometheusTargetMissingWithWarmupTime
  expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
    description: " Allow a job time to start up (10 minutes) before alerting that it's down. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.5.
Prometheus configuration reload failure
Prometheus configuration reload error
[copy]
# Fires while prometheus_config_last_reload_successful is anything other than 1.
- alert: PrometheusConfigurationReloadFailure
  expr: 'prometheus_config_last_reload_successful != 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
    description: " Prometheus configuration reload error \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.6.
Prometheus too many restarts
Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
[copy]
# Counts changes of process_start_time_seconds over 15 minutes for the
# prometheus, pushgateway and alertmanager jobs; more than 2 changes fires.
- alert: PrometheusTooManyRestarts
  expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus too many restarts (instance {{ $labels.instance }})
    description: " Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.7.
Prometheus AlertManager job missing
A Prometheus AlertManager job has disappeared
[copy]
# Fires when no `up` series exists at all for job="alertmanager".
# absent() returns a series carrying only the equality-matched labels (here
# `job`), so the summary reports the job rather than an always-empty instance.
- alert: PrometheusAlertmanagerJobMissing
  expr: 'absent(up{job="alertmanager"})'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus AlertManager job missing (job {{ $labels.job }})
    description: " A Prometheus AlertManager job has disappeared \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.8.
Prometheus AlertManager configuration reload failure
AlertManager configuration reload error
[copy]
# Fires while alertmanager_config_last_reload_successful is anything other than 1.
- alert: PrometheusAlertmanagerConfigurationReloadFailure
  expr: 'alertmanager_config_last_reload_successful != 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
    description: " AlertManager configuration reload error \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.9.
Prometheus AlertManager config not synced
Configurations of AlertManager cluster instances are out of sync
[copy]
# Counts the distinct values of alertmanager_config_hash; more than one means
# the instances are running different configurations.
# The un-grouped count() yields a single series without labels, so the summary
# no longer templates an instance label that can never be populated.
- alert: PrometheusAlertmanagerConfigNotSynced
  expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus AlertManager config not synced
    description: " Configurations of AlertManager cluster instances are out of sync \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.10.
Prometheus AlertManager E2E dead man switch
Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
[copy]
# Always-firing alert: vector(1) evaluates to a constant, label-less series.
# Because that series has no labels, the summary does not template an
# instance label (it would always render empty).
- alert: PrometheusAlertmanagerE2eDeadManSwitch
  expr: 'vector(1)'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus AlertManager E2E dead man switch
    description: " Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.11.
Prometheus not connected to alertmanager
Prometheus cannot connect the alertmanager
[copy]
# Fires when prometheus_notifications_alertmanagers_discovered drops below 1.
- alert: PrometheusNotConnectedToAlertmanager
  expr: 'prometheus_notifications_alertmanagers_discovered < 1'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
    description: " Prometheus cannot connect the alertmanager \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.12.
Prometheus rule evaluation failures
Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
[copy]
# Fires on any increase of prometheus_rule_evaluation_failures_total within
# the last 3 minutes.
- alert: PrometheusRuleEvaluationFailures
  expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.13.
Prometheus template text expansion failures
Prometheus encountered {{ $value }} template text expansion failures
[copy]
# Fires on any increase of prometheus_template_text_expansion_failures_total
# within the last 3 minutes.
- alert: PrometheusTemplateTextExpansionFailures
  expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} template text expansion failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.14.
Prometheus rule evaluation slow
Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
[copy]
# Fires when a rule group's last evaluation duration exceeds its configured
# interval, sustained for 5 minutes.
- alert: PrometheusRuleEvaluationSlow
  expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
    description: " Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.15.
Prometheus notifications backlog
The Prometheus notification queue has not been empty for 10 minutes
[copy]
# Fires when the minimum of prometheus_notifications_queue_length over the
# last 10 minutes is above 0, i.e. the queue never drained in that window.
- alert: PrometheusNotificationsBacklog
  expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus notifications backlog (instance {{ $labels.instance }})
    description: " The Prometheus notification queue has not been empty for 10 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.16.
Prometheus AlertManager notification failing
Alertmanager is failing sending notifications
[copy]
# Fires when alertmanager_notifications_failed_total shows a positive
# per-second rate over the last minute.
- alert: PrometheusAlertmanagerNotificationFailing
  expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
    description: " Alertmanager is failing sending notifications \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.17.
Prometheus target empty
Prometheus has no target in service discovery
[copy]
# Fires when prometheus_sd_discovered_targets equals 0.
- alert: PrometheusTargetEmpty
  expr: 'prometheus_sd_discovered_targets == 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus target empty (instance {{ $labels.instance }})
    description: " Prometheus has no target in service discovery \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.18.
Prometheus target scraping slow
Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
[copy]
# Compares the 0.9 quantile of prometheus_target_interval_length_seconds with
# its 0.5 quantile (matched on interval, instance and job); a ratio above 1.05
# sustained for 5 minutes fires.
- alert: PrometheusTargetScrapingSlow
  expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Prometheus target scraping slow (instance {{ $labels.instance }})
    description: " Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.19.
Prometheus large scrape
Prometheus has many scrapes that exceed the sample limit
[copy]
# Fires when prometheus_target_scrapes_exceeded_sample_limit_total grew by
# more than 10 over 10 minutes, sustained for 5 minutes.
- alert: PrometheusLargeScrape
  expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Prometheus large scrape (instance {{ $labels.instance }})
    description: " Prometheus has many scrapes that exceed the sample limit \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.20.
Prometheus target scrape duplicate
Prometheus has many samples rejected due to duplicate timestamps but different values
[copy]
# Fires on any increase of
# prometheus_target_scrapes_sample_duplicate_timestamp_total in 5 minutes.
- alert: PrometheusTargetScrapeDuplicate
  expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
    description: " Prometheus has many samples rejected due to duplicate timestamps but different values \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.21.
Prometheus TSDB checkpoint creation failures
Prometheus encountered {{ $value }} checkpoint creation failures
[copy]
# Fires on any increase of prometheus_tsdb_checkpoint_creations_failed_total
# within the last minute.
- alert: PrometheusTsdbCheckpointCreationFailures
  expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} checkpoint creation failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.22.
Prometheus TSDB checkpoint deletion failures
Prometheus encountered {{ $value }} checkpoint deletion failures
[copy]
# Fires on any increase of prometheus_tsdb_checkpoint_deletions_failed_total
# within the last minute.
- alert: PrometheusTsdbCheckpointDeletionFailures
  expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} checkpoint deletion failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.23.
Prometheus TSDB compactions failed
Prometheus encountered {{ $value }} TSDB compactions failures
[copy]
# Fires on any increase of prometheus_tsdb_compactions_failed_total within
# the last minute.
- alert: PrometheusTsdbCompactionsFailed
  expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} TSDB compactions failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.24.
Prometheus TSDB head truncations failed
Prometheus encountered {{ $value }} TSDB head truncation failures
[copy]
# Fires on any increase of prometheus_tsdb_head_truncations_failed_total
# within the last minute.
- alert: PrometheusTsdbHeadTruncationsFailed
  expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} TSDB head truncation failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.25.
Prometheus TSDB reload failures
Prometheus encountered {{ $value }} TSDB reload failures
[copy]
# Fires on any increase of prometheus_tsdb_reloads_failures_total within the
# last minute.
- alert: PrometheusTsdbReloadFailures
  expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} TSDB reload failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.26.
Prometheus TSDB WAL corruptions
Prometheus encountered {{ $value }} TSDB WAL corruptions
[copy]
# Fires on any increase of prometheus_tsdb_wal_corruptions_total within the
# last minute.
- alert: PrometheusTsdbWalCorruptions
  expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} TSDB WAL corruptions \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.27.
Prometheus TSDB WAL truncations failed
Prometheus encountered {{ $value }} TSDB WAL truncation failures
[copy]
# Fires on any increase of prometheus_tsdb_wal_truncations_failed_total
# within the last minute.
- alert: PrometheusTsdbWalTruncationsFailed
  expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
    description: " Prometheus encountered {{ $value }} TSDB WAL truncation failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.1.28.
Prometheus timeseries cardinality
The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}
[copy]
# Counts series per metric name, copies the name into a `name` label via
# label_replace, and fires for any metric with more than 10000 series.
# The result is grouped by metric name only (no `instance` label), so the
# summary identifies the metric. The quoted metric name in the description no
# longer has stray spaces inside the quotes.
- alert: PrometheusTimeseriesCardinality
  expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus timeseries cardinality (metric {{ $labels.name }})
    description: " The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.
Host and hardware
:
node-exporter
(38 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/host-and-hardware/node-exporter.yml
#
1.2.1.
Host out of memory
Node memory is filling up (< 10% left)
[copy]
# Fires when MemAvailable is below 10% of MemTotal for 2 minutes.
# The join with node_uname_info attaches the `nodename` label to the alert.
- alert: HostOutOfMemory
  expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of memory (instance {{ $labels.instance }})
    description: " Node memory is filling up (< 10% left) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.2.
Host memory under memory pressure
The node is under heavy memory pressure. High rate of major page faults
[copy]
# Fires when the 1-minute rate of node_vmstat_pgmajfault exceeds 1000 for
# 2 minutes. The join with node_uname_info attaches the `nodename` label.
- alert: HostMemoryUnderMemoryPressure
  expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host memory under memory pressure (instance {{ $labels.instance }})
    description: " The node is under heavy memory pressure. High rate of major page faults \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.3.
Host Memory is underutilized
Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})
[copy]
# Consider raising the Alertmanager 'repeat_interval' to daily or weekly for
# this kind of alert.
# Fires when memory usage (100 minus the 30-minute average MemAvailable
# percentage) stays under 20 for one week. The join with node_uname_info
# attaches the `nodename` label.
- alert: HostMemoryIsUnderutilized
  expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 1w
  labels:
    severity: info
  annotations:
    summary: Host Memory is underutilized (instance {{ $labels.instance }})
    description: " Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.4.
Host unusual network throughput in
Host network interfaces are probably receiving too much data (> 100 MB/s)
[copy]
# Sums the 2-minute receive byte rate per instance, converts it to MiB/s
# (/ 1024 / 1024) and fires above 100 for 5 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualNetworkThroughputIn
  expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual network throughput in (instance {{ $labels.instance }})
    description: " Host network interfaces are probably receiving too much data (> 100 MB/s) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.5.
Host unusual network throughput out
Host network interfaces are probably sending too much data (> 100 MB/s)
[copy]
# Sums the 2-minute transmit byte rate per instance, converts it to MiB/s
# (/ 1024 / 1024) and fires above 100 for 5 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualNetworkThroughputOut
  expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual network throughput out (instance {{ $labels.instance }})
    description: " Host network interfaces are probably sending too much data (> 100 MB/s) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.6.
Host unusual disk read rate
Disk is probably reading too much data (> 50 MB/s)
[copy]
# Sums the 2-minute disk read byte rate per instance, converts it to MiB/s
# (/ 1024 / 1024) and fires above 50 for 5 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualDiskReadRate
  expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk read rate (instance {{ $labels.instance }})
    description: " Disk is probably reading too much data (> 50 MB/s) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.7.
Host unusual disk write rate
Disk is probably writing too much data (> 50 MB/s)
[copy]
# Sums the 2-minute disk write byte rate per instance, converts it to MiB/s
# (/ 1024 / 1024) and fires above 50 for 2 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualDiskWriteRate
  expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk write rate (instance {{ $labels.instance }})
    description: " Disk is probably writing too much data (> 50 MB/s) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.8.
Host out of disk space
Disk is almost full (< 10% left)
[copy]
# Exclude pseudo filesystems through the node_exporter parameters, e.g.
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# The same rule written with "node_filesystem_free_bytes" will fire when the
# disk fills for non-root users.
# Fires when available space is below 10% of the filesystem size on a
# filesystem that is not read-only, for 2 minutes. The join with
# node_uname_info attaches the `nodename` label.
- alert: HostOutOfDiskSpace
  expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of disk space (instance {{ $labels.instance }})
    description: " Disk is almost full (< 10% left) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.9.
Host disk will fill in 24 hours
Filesystem is predicted to run out of space within the next 24 hours at current write rate
[copy]
# Exclude pseudo filesystems through the node_exporter parameters, e.g.
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# The same rule written with "node_filesystem_free_bytes" will fire when the
# disk fills for non-root users.
# Requires all three conditions on the same instance/device/mountpoint:
# under 10% space available, a linear prediction over the last hour (tmpfs
# excluded) that goes below 0 within 24 * 3600 seconds, and a filesystem that
# is not read-only. The join with node_uname_info attaches `nodename`.
- alert: HostDiskWillFillIn24Hours
  expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
    description: " Filesystem is predicted to run out of space within the next 24 hours at current write rate \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.10.
Host out of inodes
Disk is almost running out of available inodes (< 10% left)
[copy]
# Fires when free inodes are below 10% of total inodes (msdosfs excluded) on
# a filesystem that is not read-only, for 2 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostOutOfInodes
  expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host out of inodes (instance {{ $labels.instance }})
    description: " Disk is almost running out of available inodes (< 10% left) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.11.
Host filesystem device error
{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem
[copy]
# Fires when node_filesystem_device_error equals 1 for 2 minutes.
- alert: HostFilesystemDeviceError
  expr: 'node_filesystem_device_error == 1'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Host filesystem device error (instance {{ $labels.instance }})
    description: " {{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.12.
Host inodes will fill in 24 hours
Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
[copy]
# Requires under 10% free inodes, a linear prediction over the last hour that
# goes below 0 within 24 * 3600 seconds, and a filesystem that is not
# read-only (msdosfs excluded throughout).
# The join with node_uname_info attaches the `nodename` label.
- alert: HostInodesWillFillIn24Hours
  expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
    description: " Filesystem is predicted to run out of inodes within the next 24 hours at current write rate \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.13.
Host unusual disk read latency
Disk latency is growing (read operations > 100ms)
[copy]
# Average read latency = read time rate / completed reads rate (1-minute
# windows). Fires above 0.1 s while reads are actually happening, for
# 2 minutes. The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualDiskReadLatency
  expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk read latency (instance {{ $labels.instance }})
    description: " Disk latency is growing (read operations > 100ms) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.14.
Host unusual disk write latency
Disk latency is growing (write operations > 100ms)
[copy]
# Average write latency = write time rate / completed writes rate (1-minute
# windows). Fires above 0.1 s while writes are actually happening, for
# 2 minutes. The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualDiskWriteLatency
  expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk write latency (instance {{ $labels.instance }})
    description: " Disk latency is growing (write operations > 100ms) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.15.
Host high CPU load
CPU load is > 80%
[copy]
# Averages the 2-minute rate of every non-idle CPU mode per instance, sums
# those averages, and fires when the total exceeds 0.8 for 10 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostHighCpuLoad
  expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Host high CPU load (instance {{ $labels.instance }})
    description: " CPU load is > 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.16.
Host CPU is underutilized
CPU load is < 20% for 1 week. Consider reducing the number of CPUs.
[copy]
# Consider raising the Alertmanager 'repeat_interval' to daily or weekly for
# this kind of alert.
# Fires when 100 minus the 30-minute idle rate (as a percentage) stays under
# 20 for one week. The join with node_uname_info attaches `nodename`.
- alert: HostCpuIsUnderutilized
  expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 1w
  labels:
    severity: info
  annotations:
    summary: Host CPU is underutilized (instance {{ $labels.instance }})
    description: " CPU load is < 20% for 1 week. Consider reducing the number of CPUs. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.17.
Host CPU steal noisy neighbor
CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
[copy]
# Fires when the per-instance average 5-minute rate of mode="steal" CPU time
# exceeds 10%. The join with node_uname_info attaches the `nodename` label.
- alert: HostCpuStealNoisyNeighbor
  expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
    description: " CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.18.
Host CPU high iowait
CPU iowait > 10%. A high iowait means that you are disk or network bound.
[copy]
# Fires when the per-instance average 5-minute rate of mode="iowait" CPU time
# exceeds 10%. The join with node_uname_info attaches the `nodename` label.
- alert: HostCpuHighIowait
  expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host CPU high iowait (instance {{ $labels.instance }})
    description: " CPU iowait > 10%. A high iowait means that you are disk or network bound. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.19.
Host unusual disk IO
Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.
[copy]
# Fires when the 1-minute rate of node_disk_io_time_seconds_total exceeds 0.5
# for 5 minutes. The join with node_uname_info attaches the `nodename` label.
- alert: HostUnusualDiskIo
  expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host unusual disk IO (instance {{ $labels.instance }})
    description: " Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.20.
Host context switching high
Context switching is growing on the node (twice the daily average during the last 15m)
[copy]
# The x2 factor for context switches is arbitrary; the right threshold
# depends on the nature of the application.
# See: https://github.com/samber/awesome-prometheus-alerts/issues/58
# Divides the 15-minute per-CPU context-switch rate by the 1-day per-CPU
# rate and fires when the ratio exceeds 2. The folded scalar joins the three
# lines with single spaces into one expression.
- alert: HostContextSwitchingHigh
  expr: >-
    (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
    /
    (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host context switching high (instance {{ $labels.instance }})
    description: " Context switching is growing on the node (twice the daily average during the last 15m) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.21.
Host swap is filling up
Swap is filling up (>80%)
[copy]
# Fires when used swap (1 - SwapFree / SwapTotal) exceeds 80% for 2 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostSwapIsFillingUp
  expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host swap is filling up (instance {{ $labels.instance }})
    description: " Swap is filling up (>80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.22.
Host systemd service crashed
systemd service crashed
[copy]
# Fires for every systemd unit whose state="failed" series equals 1.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostSystemdServiceCrashed
  expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host systemd service crashed (instance {{ $labels.instance }})
    description: " systemd service crashed \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.23.
Host physical component too hot
Physical hardware component too hot
[copy]
# Joins node_hwmon_temp_celsius with node_hwmon_sensor_label (sensors labelled
# "tctl" excluded) and fires when the result exceeds 75 for 5 minutes.
# The outer join with node_uname_info attaches the `nodename` label.
- alert: HostPhysicalComponentTooHot
  expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Host physical component too hot (instance {{ $labels.instance }})
    description: " Physical hardware component too hot \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.24.
Host node overtemperature alarm
Physical node temperature alarm triggered
[copy]
# Fires when either node_hwmon_temp_crit_alarm_celsius or
# node_hwmon_temp_alarm equals 1.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostNodeOvertemperatureAlarm
  expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Host node overtemperature alarm (instance {{ $labels.instance }})
    description: " Physical node temperature alarm triggered \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.25.
Host RAID array got inactive
RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.
[copy]
# Fires when node_md_state{state="inactive"} is above 0 for any array.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostRaidArrayGotInactive
  expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Host RAID array got inactive (instance {{ $labels.instance }})
    description: " RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.26.
Host RAID disk failure
At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
[copy]
# Fires when node_md_disks{state="failed"} is above 0 for 2 minutes.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostRaidDiskFailure
  expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host RAID disk failure (instance {{ $labels.instance }})
    description: " At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.27.
Host kernel version deviations
Different kernel versions are running
[copy]
# Extracts the leading "x.y.z" of each host's kernel release into a `kernel`
# label, counts the distinct values, and fires when more than one kernel
# version has been present for 6 hours.
# The un-grouped count() yields a single series with no `instance` label, so
# the former `* on(instance) group_left (nodename) node_uname_info` join could
# never find a match and the alert never fired; the join is removed and the
# summary no longer templates the missing instance label.
- alert: HostKernelVersionDeviations
  expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
  for: 6h
  labels:
    severity: warning
  annotations:
    summary: Host kernel version deviations
    description: " Different kernel versions are running \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.28.
Host OOM kill detected
OOM kill detected
[copy]
# Fires on any increase of node_vmstat_oom_kill within the last minute.
# The join with node_uname_info attaches the `nodename` label.
- alert: HostOomKillDetected
  expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host OOM kill detected (instance {{ $labels.instance }})
    description: " OOM kill detected \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.29.
Host EDAC Correctable Errors detected
Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.
[copy]
# Fires on any increase of node_edac_correctable_errors_total within the last
# minute. The join with node_uname_info attaches the `nodename` label.
# The description now states the 1-minute window the expression really uses
# (it previously claimed 5 minutes), and the printf format is "%.0f" without
# the stray surrounding spaces.
- alert: HostEdacCorrectableErrorsDetected
  expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description: " Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.30.
Host EDAC Uncorrectable Errors detected
Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset.
[copy]
# Fires immediately while node_edac_uncorrectable_errors_total is above 0;
# nodename is attached via the join with node_uname_info.
# The expression reads the raw counter (no range window), so $value is the
# running total rather than a "last 5 minutes" figure. The description was
# corrected to say so, and the printf format no longer pads the number.
- alert: HostEdacUncorrectableErrorsDetected
  expr: (node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
    description: " Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC since the counter was last reset. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.31.
Host Network Receive Errors
Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.
[copy]
# Fires when, for 2m, the ratio of receive errors to received packets (both as
# 2m rates) exceeds 0.01; nodename is attached via the join with
# node_uname_info.
# $value is that ratio, not an error count: formatting it with "%.0f" printed
# "0" or "1" labelled as "receive errors". The description now reports the
# ratio as a percentage via the humanizePercentage template function.
- alert: HostNetworkReceiveErrors
  expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
    description: " Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.32.
Host Network Transmit Errors
Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.
[copy]
# Fires when, for 2m, the ratio of transmit errors to transmitted packets (both
# as 2m rates) exceeds 0.01; nodename is attached via the join with
# node_uname_info.
# $value is that ratio, not an error count: formatting it with "%.0f" printed
# "0" or "1" labelled as "transmit errors". The description now reports the
# ratio as a percentage via the humanizePercentage template function.
- alert: HostNetworkTransmitErrors
  expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
    description: " Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.33.
Host Network Interface Saturated
The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
[copy]
# Fires when, for 1m, (receive + transmit byte rate) / node_network_speed_bytes
# is above 0.8 and below 10000, ignoring devices whose name starts with tap,
# vnet, veth or tun; nodename is attached via the join with node_uname_info.
# The description previously rendered the device and instance with stray
# spaces inside the quotes (" eth0 "); the quotes now hug the values.
- alert: HostNetworkInterfaceSaturated
  expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Host Network Interface Saturated (instance {{ $labels.instance }})"
    description: " The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.34.
Host Network Bond Degraded
Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
[copy]
# Fires when node_bonding_active differs from node_bonding_slaves for 2m;
# nodename is attached via the join with node_uname_info.
# The description previously rendered the values with stray spaces inside the
# quotes and before the final period; both are removed.
# NOTE(review): the description reads $labels.device - confirm the bonding
# metrics actually expose a "device" label.
- alert: HostNetworkBondDegraded
  expr: ((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host Network Bond Degraded (instance {{ $labels.instance }})"
    description: " Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\". \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.35.
Host conntrack limit
The number of conntrack is approaching limit
[copy]
# Fires when node_nf_conntrack_entries exceeds 80% of
# node_nf_conntrack_entries_limit for 5m; nodename comes from node_uname_info.
- alert: HostConntrackLimit
  expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Host conntrack limit (instance {{ $labels.instance }})"
    description: " The number of conntrack is approaching limit \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.36.
Host clock skew
Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.
[copy]
# Fires after 10m when node_timex_offset_seconds is beyond +/-0.05 and its 5m
# derivative is not moving back towards zero (>= 0 for a positive offset,
# <= 0 for a negative one); nodename comes from node_uname_info.
- alert: HostClockSkew
  expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Host clock skew (instance {{ $labels.instance }})"
    description: " Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.37.
Host clock not synchronising
Clock not synchronising. Ensure NTP is configured on this host.
[copy]
# Fires after 2m when the 1m minimum of node_timex_sync_status is 0 and
# node_timex_maxerror_seconds is at least 16; nodename comes from
# node_uname_info.
- alert: HostClockNotSynchronising
  expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host clock not synchronising (instance {{ $labels.instance }})"
    description: " Clock not synchronising. Ensure NTP is configured on this host. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.2.38.
Host requires reboot
{{ $labels.instance }} requires a reboot.
[copy]
# Fires when node_reboot_required has been above 0 for 4h; nodename comes from
# node_uname_info.
- alert: HostRequiresReboot
  expr: (node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  for: 4h
  labels:
    severity: info
  annotations:
    summary: "Host requires reboot (instance {{ $labels.instance }})"
    description: " {{ $labels.instance }} requires a reboot. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.3.
S.M.A.R.T Device Monitoring
:
smartctl-exporter
(5 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
#
1.3.1.
Smart device temperature warning
Device temperature warning (instance {{ $labels.instance }})
[copy]
# Warning level: smartctl_device_temperature above 60 for 2m.
- alert: SmartDeviceTemperatureWarning
  expr: smartctl_device_temperature > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Smart device temperature warning (instance {{ $labels.instance }})"
    description: " Device temperature warning (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.3.2.
Smart device temperature critical
Device temperature critical (instance {{ $labels.instance }})
[copy]
# Critical level: smartctl_device_temperature above 80 for 2m.
- alert: SmartDeviceTemperatureCritical
  expr: smartctl_device_temperature > 80
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Smart device temperature critical (instance {{ $labels.instance }})"
    description: " Device temperature critical (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.3.3.
Smart critical warning
device has critical warning (instance {{ $labels.instance }})
[copy]
# Fires when smartctl_device_critical_warning has been above 0 for 15m.
- alert: SmartCriticalWarning
  expr: smartctl_device_critical_warning > 0
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "Smart critical warning (instance {{ $labels.instance }})"
    description: " device has critical warning (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.3.4.
Smart media errors
device has media errors (instance {{ $labels.instance }})
[copy]
# Fires when smartctl_device_media_errors has been above 0 for 15m.
- alert: SmartMediaErrors
  expr: smartctl_device_media_errors > 0
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "Smart media errors (instance {{ $labels.instance }})"
    description: " device has media errors (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.3.5.
Smart NVME Wearout Indicator
NVMe device is wearing out (instance {{ $labels.instance }})
[copy]
# Fires when, for devices matching nvme.*, smartctl_device_available_spare has
# been below smartctl_device_available_spare_threshold for 15m.
- alert: SmartNvmeWearoutIndicator
  expr: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: "Smart NVME Wearout Indicator (instance {{ $labels.instance }})"
    description: " NVMe device is wearing out (instance {{ $labels.instance }}) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.
Docker containers
:
google/cAdvisor
(8 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/docker-containers/google-cadvisor.yml
#
1.4.1.
Container killed
A container has disappeared
[copy]
# Fires immediately when container_last_seen is more than 60 seconds older
# than the evaluation time.
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerKilled
  expr: time() - container_last_seen > 60
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Container killed (instance {{ $labels.instance }})"
    description: " A container has disappeared \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.2.
Container absent
A container is absent for 5 min
[copy]
# Fires when no container_last_seen series exists at all for 5m.
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerAbsent
  expr: absent(container_last_seen)
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Container absent (instance {{ $labels.instance }})"
    description: " A container is absent for 5 min \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.3.
Container High CPU utilization
Container CPU utilization is above 80%
[copy]
# Per (pod, container): 5m CPU usage rate divided by quota/period, as a
# percentage; fires when it stays above 80 for 2m.
- alert: ContainerHighCpuUtilization
  expr: (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Container High CPU utilization (instance {{ $labels.instance }})"
    description: " Container CPU utilization is above 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.4.
Container High Memory usage
Container Memory usage is above 80%
[copy]
# Per (instance, name): working-set bytes divided by the (non-zero) memory
# limit, as a percentage; fires when it stays above 80 for 2m.
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage
  expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Container High Memory usage (instance {{ $labels.instance }})"
    description: " Container Memory usage is above 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.5.
Container Volume usage
Container Volume usage is above 80%
[copy]
# Per instance: share of used filesystem inodes (1 - free/total) as a
# percentage; fires when it stays above 80 for 2m.
# The numerator was restricted to series with name!="" while the denominator
# summed every series, so free and total covered different sets and the ratio
# was skewed. Both sides now use the same selector.
# Note: the expression measures inode usage, not bytes.
- alert: ContainerVolumeUsage
  expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Container Volume usage (instance {{ $labels.instance }})"
    description: " Container Volume usage is above 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.6.
Container high throttle rate
Container is being throttled
[copy]
# Per (container, pod, namespace): throttled CFS periods divided by all CFS
# periods over 5m; fires when the ratio stays above 25/100 for 5m.
- alert: ContainerHighThrottleRate
  expr: sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Container high throttle rate (instance {{ $labels.instance }})"
    description: " Container is being throttled \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.7.
Container Low CPU utilization
Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
[copy]
# Per (pod, container): 5m CPU usage rate divided by quota/period, as a
# percentage; fires when it stays below 20 for 7d.
- alert: ContainerLowCpuUtilization
  expr: (sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20
  for: 7d
  labels:
    severity: info
  annotations:
    summary: "Container Low CPU utilization (instance {{ $labels.instance }})"
    description: " Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.4.8.
Container Low Memory usage
Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
[copy]
# Per (instance, name): working-set bytes divided by the (non-zero) memory
# limit, as a percentage; fires when it stays below 20 for 7d.
- alert: ContainerLowMemoryUsage
  expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20
  for: 7d
  labels:
    severity: info
  annotations:
    summary: "Container Low Memory usage (instance {{ $labels.instance }})"
    description: " Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/blackbox/blackbox-exporter.yml
#
1.5.1.
Blackbox probe failed
Probe failed
[copy]
# Fires immediately when probe_success equals 0.
- alert: BlackboxProbeFailed
  expr: probe_success == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Blackbox probe failed (instance {{ $labels.instance }})"
    description: " Probe failed \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.2.
Blackbox configuration reload failure
Blackbox configuration reload failure
[copy]
# Fires immediately when blackbox_exporter_config_last_reload_successful is
# anything other than 1.
- alert: BlackboxConfigurationReloadFailure
  expr: blackbox_exporter_config_last_reload_successful != 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Blackbox configuration reload failure (instance {{ $labels.instance }})"
    description: " Blackbox configuration reload failure \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.3.
Blackbox slow probe
Blackbox probe took more than 1s to complete
[copy]
# Fires when the 1m average of probe_duration_seconds stays above 1 for 1m.
- alert: BlackboxSlowProbe
  expr: avg_over_time(probe_duration_seconds[1m]) > 1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Blackbox slow probe (instance {{ $labels.instance }})"
    description: " Blackbox probe took more than 1s to complete \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.4.
Blackbox probe HTTP failure
HTTP status code is not 200-399
[copy]
# Fires immediately when probe_http_status_code is <= 199 or >= 400.
- alert: BlackboxProbeHttpFailure
  expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Blackbox probe HTTP failure (instance {{ $labels.instance }})"
    description: " HTTP status code is not 200-399 \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.5.
Blackbox SSL certificate will expire soon
SSL certificate expires in less than 20 days
[copy]
# Warning tier: days until probe_ssl_earliest_cert_expiry (latest sample within
# 10m, rounded to 0.1 day) is at least 3 and below 20.
- alert: BlackboxSslCertificateWillExpireSoon
  expr: 3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
    description: " SSL certificate expires in less than 20 days \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.6.
Blackbox SSL certificate will expire soon
SSL certificate expires in less than 3 days
[copy]
# Critical tier (same alert name as the warning tier, different severity):
# days until probe_ssl_earliest_cert_expiry (latest sample within 10m, rounded
# to 0.1 day) is at least 0 and below 3.
- alert: BlackboxSslCertificateWillExpireSoon
  expr: 0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})"
    description: " SSL certificate expires in less than 3 days \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.7.
Blackbox SSL certificate expired
SSL certificate has expired already
[copy]
# Fires immediately when the days until probe_ssl_earliest_cert_expiry (latest
# sample within 10m, rounded to 0.1 day) is negative.
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
# need to enable insecure_skip_verify. Note that this will disable
# certificate validation.
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- alert: BlackboxSslCertificateExpired
  expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Blackbox SSL certificate expired (instance {{ $labels.instance }})"
    description: " SSL certificate has expired already \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.8.
Blackbox probe slow HTTP
HTTP request took more than 1s
[copy]
# Fires when the 1m average of probe_http_duration_seconds stays above 1 for 1m.
- alert: BlackboxProbeSlowHttp
  expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Blackbox probe slow HTTP (instance {{ $labels.instance }})"
    description: " HTTP request took more than 1s \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.5.9.
Blackbox probe slow ping
Blackbox ping took more than 1s
[copy]
# Fires when the 1m average of probe_icmp_duration_seconds stays above 1 for 1m.
- alert: BlackboxProbeSlowPing
  expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Blackbox probe slow ping (instance {{ $labels.instance }})"
    description: " Blackbox ping took more than 1s \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/windows-server/windows-exporter.yml
#
1.6.1.
Windows Server collector Error
Collector {{ $labels.collector }} was not successful
[copy]
# Fires immediately when windows_exporter_collector_success equals 0.
- alert: WindowsServerCollectorError
  expr: windows_exporter_collector_success == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Windows Server collector Error (instance {{ $labels.instance }})"
    description: " Collector {{ $labels.collector }} was not successful \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.6.2.
Windows Server service Status
Windows Service state is not OK
[copy]
# Fires when windows_service_status{status="ok"} is not 1 for 1m.
- alert: WindowsServerServiceStatus
  expr: windows_service_status{status="ok"} != 1
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Windows Server service Status (instance {{ $labels.instance }})"
    description: " Windows Service state is not OK \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.6.3.
Windows Server CPU Usage
CPU Usage is more than 80%
[copy]
# Per instance: 100 minus the average 2m idle-mode rate (as a percentage);
# fires immediately when the result exceeds 80.
- alert: WindowsServerCpuUsage
  expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Windows Server CPU Usage (instance {{ $labels.instance }})"
    description: " CPU Usage is more than 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.6.4.
Windows Server memory Usage
Memory usage is more than 90%
[copy]
# 100 minus free/total physical memory (as a percentage); fires when the
# result stays above 90 for 2m.
- alert: WindowsServerMemoryUsage
  expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Windows Server memory Usage (instance {{ $labels.instance }})"
    description: " Memory usage is more than 90% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.6.5.
Windows Server disk Space Usage
Disk usage is more than 80%
[copy]
# 100 minus free/size of the logical disk (as a percentage; both operands are
# scaled by the same 1024*1024 factor); fires when above 80 for 2m.
- alert: WindowsServerDiskSpaceUsage
  expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Windows Server disk Space Usage (instance {{ $labels.instance }})"
    description: " Disk usage is more than 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/vmware/pryorda-vmware-exporter.yml
#
1.7.1.
Virtual Machine Memory Warning
High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%
[copy]
# Warning tier: vmware_vm_mem_usage_average / 100 is at least 80 and below 90
# for 5m.
# The printf format string previously contained stray spaces (" %.2f "), which
# rendered as " 85.00 %"; it is now "%.2f" so the value reads "85.00%".
- alert: VirtualMachineMemoryWarning
  expr: vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Virtual Machine Memory Warning (instance {{ $labels.instance }})"
    description: " High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\" }}% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.7.2.
Virtual Machine Memory Critical
High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%
[copy]
# Critical tier: vmware_vm_mem_usage_average / 100 is at least 90 for 1m.
# The printf format string previously contained stray spaces (" %.2f "), which
# rendered as " 95.00 %"; it is now "%.2f" so the value reads "95.00%".
- alert: VirtualMachineMemoryCritical
  expr: vmware_vm_mem_usage_average / 100 >= 90
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Virtual Machine Memory Critical (instance {{ $labels.instance }})"
    description: " High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\" }}% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.7.3.
High Number of Snapshots
High snapshots number on {{ $labels.instance }}: {{ $value }}
[copy]
# Fires when vmware_vm_snapshots has been above 3 for 30m.
- alert: HighNumberOfSnapshots
  expr: vmware_vm_snapshots > 3
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "High Number of Snapshots (instance {{ $labels.instance }})"
    description: " High snapshots number on {{ $labels.instance }}: {{ $value }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.7.4.
Outdated Snapshots
Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days
[copy]
# Fires when the age of vmware_vm_snapshot_timestamp_seconds, converted from
# seconds to days, has been at least 3 for 5m.
# The printf format string previously contained stray spaces (" %.0f "), which
# padded the rendered number; it is now "%.0f".
- alert: OutdatedSnapshots
  expr: (time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Outdated Snapshots (instance {{ $labels.instance }})"
    description: " Outdated snapshots on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }} days \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.
Netdata
:
Embedded exporter
(9 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/netdata/embedded-exporter.yml
#
1.8.1.
Netdata high cpu usage
Netdata high CPU usage (> 80%)
[copy]
# Fires when CPU usage, computed as 100 minus the 1m average of the "idle"
# dimension, stays above 80 for 5m.
# Previous expression was rate(...{dimension="idle"}[1m]) > 80, which (a)
# applied rate() to what is presumably a percentage gauge, and (b) compared
# the *idle* share against the "high usage" threshold, i.e. inverted the
# condition stated in the description.
# NOTE(review): assumes netdata_cpu_cpu_percentage_average is a gauge holding
# a 0-100 percentage, as its name suggests - confirm against the exporter.
- alert: NetdataHighCpuUsage
  expr: 100 - avg_over_time(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m]) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Netdata high cpu usage (instance {{ $labels.instance }})"
    description: " Netdata high CPU usage (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.2.
Host CPU steal noisy neighbor
CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
[copy]
# Fires when the 1m average of the "steal" dimension stays above 10 for 5m.
# Previous expression wrapped the metric in rate(), which yields the per-second
# change of the value rather than the value itself, so the "> 10" threshold
# did not correspond to the "> 10%" stated in the description.
# NOTE(review): assumes netdata_cpu_cpu_percentage_average is a gauge holding
# a 0-100 percentage, as its name suggests - confirm against the exporter.
- alert: HostCpuStealNoisyNeighbor
  expr: avg_over_time(netdata_cpu_cpu_percentage_average{dimension="steal"}[1m]) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Host CPU steal noisy neighbor (instance {{ $labels.instance }})"
    description: " CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.3.
Netdata high memory usage
Netdata high memory usage (> 80%)
[copy]
# Fires when the "free" + "cached" dimensions make up less than 20% of the sum
# of all dimensions of netdata_system_ram_MB_average for 5m.
# Previous expression was `100 / X * X{dimension=~"free|cached"}`: binary
# operators match series one-to-one on identical label sets, so every series
# was divided by itself, the result was always 100 and the alert never fired.
# Aggregating away the dimension label on both sides makes the ratio real.
- alert: NetdataHighMemoryUsage
  expr: 100 * sum without (dimension) (netdata_system_ram_MB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MB_average) < 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Netdata high memory usage (instance {{ $labels.instance }})"
    description: " Netdata high memory usage (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.4.
Netdata low disk space
Netdata low disk space (> 80%)
[copy]
# Fires when the "avail" + "cached" dimensions make up less than 20% of the
# sum of all dimensions of netdata_disk_space_GB_average for 5m.
# Previous expression was `100 / X * X{dimension=~"avail|cached"}`: binary
# operators match series one-to-one on identical label sets, so every series
# was divided by itself, the result was always 100 and the alert never fired.
# Aggregating away only the dimension label keeps all other labels, so the
# ratio is still computed separately for each remaining label combination.
- alert: NetdataLowDiskSpace
  expr: 100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Netdata low disk space (instance {{ $labels.instance }})"
    description: " Netdata low disk space (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.5.
Netdata predicted disk full
Netdata predicted disk full in 24 hours
[copy]
# Fires immediately when a linear extrapolation of the last 3h of the
# avail/cached dimensions predicts a value below 0 in 24 * 3600 seconds.
- alert: NetdataPredictedDiskFull
  expr: predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Netdata predicted disk full (instance {{ $labels.instance }})"
    description: " Netdata predicted disk full in 24 hours \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.6.
Netdata MD mismatch cnt unsynchronized blocks
RAID Array have unsynchronized blocks
[copy]
# Fires when netdata_md_mismatch_cnt_unsynchronized_blocks_average has been
# above 1024 for 2m.
- alert: NetdataMdMismatchCntUnsynchronizedBlocks
  expr: netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Netdata MD mismatch cnt unsynchronized blocks (instance {{ $labels.instance }})"
    description: " RAID Array have unsynchronized blocks \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.7.
Netdata disk reallocated sectors
Reallocated sectors on disk
[copy]
# Fires immediately when the reallocated-sectors metric increased within the
# last minute.
- alert: NetdataDiskReallocatedSectors
  expr: increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "Netdata disk reallocated sectors (instance {{ $labels.instance }})"
    description: " Reallocated sectors on disk \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.8.
Netdata disk current pending sector
Disk current pending sector
[copy]
# Fires immediately while the current-pending-sector metric is above 0.
- alert: NetdataDiskCurrentPendingSector
  expr: netdata_smartd_log_current_pending_sector_count_sectors_average > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Netdata disk current pending sector (instance {{ $labels.instance }})"
    description: " Disk current pending sector \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
1.8.9.
Netdata reported uncorrectable disk sectors
Reported uncorrectable disk sectors
[copy]
# Fires immediately when the offline-uncorrectable-sector metric increased
# within the last 2 minutes.
- alert: NetdataReportedUncorrectableDiskSectors
  expr: increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})"
    description: " Reported uncorrectable disk sectors \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/mysql/mysqld-exporter.yml
#
2.1.1.
MySQL down
MySQL instance is down on {{ $labels.instance }}
[copy]
# Fires immediately when mysql_up equals 0.
- alert: MysqlDown
  expr: mysql_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "MySQL down (instance {{ $labels.instance }})"
    description: " MySQL instance is down on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.2.
MySQL too many connections (> 80%)
More than 80% of MySQL connections are in use on {{ $labels.instance }}
[copy]
# Fires when the 1m peak of threads_connected, as a percentage of
# max_connections, stays above 80 for 2m.
- alert: MysqlTooManyConnections(>80%)
  expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "MySQL too many connections (> 80%) (instance {{ $labels.instance }})"
    description: " More than 80% of MySQL connections are in use on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.3.
MySQL high prepared statements utilization (> 80%)
High utilization of prepared statements (>80%) on {{ $labels.instance }}
[copy]
# Fires when the 1m peak of prepared_stmt_count, as a percentage of
# max_prepared_stmt_count, stays above 80 for 2m.
- alert: MysqlHighPreparedStatementsUtilization(>80%)
  expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})"
    description: " High utilization of prepared statements (>80%) on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.4.
MySQL high threads running
More than 60% of MySQL connections are in running state on {{ $labels.instance }}
[copy]
# Fires when the 1m peak of threads_running, as a percentage of
# max_connections, stays above 60 for 2m.
- alert: MysqlHighThreadsRunning
  expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "MySQL high threads running (instance {{ $labels.instance }})"
    description: " More than 60% of MySQL connections are in running state on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.5.
MySQL Slave IO thread not running
MySQL Slave IO thread not running on {{ $labels.instance }}
[copy]
# Fires immediately when slave_io_running equals 0, considering only instances
# whose mysql_slave_status_master_server_id is above 0.
- alert: MysqlSlaveIoThreadNotRunning
  expr: ( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})"
    description: " MySQL Slave IO thread not running on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.6.
MySQL Slave SQL thread not running
MySQL Slave SQL thread not running on {{ $labels.instance }}
[copy]
# Fires immediately when slave_sql_running equals 0, considering only
# instances whose mysql_slave_status_master_server_id is above 0.
- alert: MysqlSlaveSqlThreadNotRunning
  expr: ( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})"
    description: " MySQL Slave SQL thread not running on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.7.
MySQL Slave replication lag
MySQL replication lag on {{ $labels.instance }}
[copy]
# Fires when seconds_behind_master minus sql_delay stays above 30 for 1m,
# considering only instances whose master_server_id is above 0.
- alert: MysqlSlaveReplicationLag
  expr: ( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "MySQL Slave replication lag (instance {{ $labels.instance }})"
    description: " MySQL replication lag on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.8.
MySQL slow queries
MySQL server mysql has some new slow query.
[copy]
# Fires when mysql_global_status_slow_queries keeps increasing (1m window)
# for 2m.
- alert: MysqlSlowQueries
  expr: increase(mysql_global_status_slow_queries[1m]) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "MySQL slow queries (instance {{ $labels.instance }})"
    description: " MySQL server mysql has some new slow query. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.9.
MySQL InnoDB log waits
MySQL innodb log writes stalling
[copy]
# Fires immediately when the 15m per-second rate of innodb_log_waits exceeds 10.
- alert: MysqlInnodbLogWaits
  expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "MySQL InnoDB log waits (instance {{ $labels.instance }})"
    description: " MySQL innodb log writes stalling \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.1.10.
MySQL restarted
MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
[copy]
# Fires immediately while mysql_global_status_uptime is below 60.
- alert: MysqlRestarted
  expr: mysql_global_status_uptime < 60
  for: 0m
  labels:
    severity: info
  annotations:
    summary: "MySQL restarted (instance {{ $labels.instance }})"
    description: " MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/postgresql/postgres-exporter.yml
#
2.2.1.
Postgresql down
Postgresql instance is down
[copy]
# Fires immediately when pg_up equals 0.
- alert: PostgresqlDown
  expr: pg_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Postgresql down (instance {{ $labels.instance }})"
    description: " Postgresql instance is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.2.
Postgresql restarted
Postgresql restarted
[copy]
# Fires immediately while pg_postmaster_start_time_seconds is less than 60
# seconds before the evaluation time.
- alert: PostgresqlRestarted
  expr: time() - pg_postmaster_start_time_seconds < 60
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Postgresql restarted (instance {{ $labels.instance }})"
    description: " Postgresql restarted \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.3.
Postgresql exporter error
Postgresql exporter is showing errors. A query may be buggy in query.yaml
[copy]
# Fires immediately while pg_exporter_last_scrape_error is above 0.
- alert: PostgresqlExporterError
  expr: pg_exporter_last_scrape_error > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Postgresql exporter error (instance {{ $labels.instance }})"
    description: " Postgresql exporter is showing errors. A query may be buggy in query.yaml \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.4.
Postgresql table not auto vacuumed
Table {{ $labels.relname }} has not been auto vacuumed for 10 days
[copy]
# Fires immediately for tables whose last_autovacuum timestamp is set (> 0)
# and lies more than 60 * 60 * 24 * 10 seconds in the past.
- alert: PostgresqlTableNotAutoVacuumed
  expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Postgresql table not auto vacuumed (instance {{ $labels.instance }})"
    description: " Table {{ $labels.relname }} has not been auto vacuumed for 10 days \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.5.
Postgresql table not auto analyzed
Table {{ $labels.relname }} has not been auto analyzed for 10 days
[copy]
- alert : PostgresqlTableNotAutoAnalyzed
expr : (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10
for : 0m
labels :
severity : warning
annotations :
summary : Postgresql table not auto analyzed (instance {{ $labels.instance }})
description : " Table {{ $labels.relname }} has not been auto analyzed for 10 days \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.6.
Postgresql too many connections
PostgreSQL instance has too many connections (> 80%).
[copy]
- alert : PostgresqlTooManyConnections
expr : sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)
for : 2m
labels :
severity : warning
annotations :
summary : Postgresql too many connections (instance {{ $labels.instance }})
description : " PostgreSQL instance has too many connections (> 80%). \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.7.
Postgresql not enough connections
PostgreSQL instance should have more connections (> 5)
[copy]
- alert : PostgresqlNotEnoughConnections
expr : sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
for : 2m
labels :
severity : warning
annotations :
summary : Postgresql not enough connections (instance {{ $labels.instance }})
description : " PostgreSQL instance should have more connections (> 5) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.8.
Postgresql dead locks
PostgreSQL has dead-locks
[copy]
- alert : PostgresqlDeadLocks
expr : increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
for : 0m
labels :
severity : warning
annotations :
summary : Postgresql dead locks (instance {{ $labels.instance }})
description : " PostgreSQL has dead-locks \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.9.
Postgresql high rollback rate
Ratio of transactions being aborted compared to committed is > 2 %
[copy]
# Fires when more than 2% of transactions are rolled back, per (namespace, datname).
# Rollback and commit rates are aggregated first and divided afterwards: summing
# per-series ratios would add the ratios of several instances together and
# overstate the value.
- alert: PostgresqlHighRollbackRate
  expr: >-
    sum by (namespace, datname) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
    / (
    sum by (namespace, datname) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m]))
    + sum by (namespace, datname) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))
    ) > 0.02
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Postgresql high rollback rate (instance {{ $labels.instance }})
    description: " Ratio of transactions being aborted compared to committed is > 2 % \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.10.
Postgresql commit rate low
Postgresql seems to be processing very few transactions
[copy]
# Fires when a database commits fewer than 10 transactions per second for 2 minutes.
# The selector carries the same datname/datid filter as the other per-database
# rules in this section, so idle template/maintenance databases do not trip a
# critical alert on their own.
- alert: PostgresqlCommitRateLow
  expr: rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 10
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Postgresql commit rate low (instance {{ $labels.instance }})
    description: " Postgresql seems to be processing very few transactions \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.11.
Postgresql low XID consumption
Postgresql seems to be consuming transaction IDs very slowly
[copy]
- alert : PostgresqlLowXidConsumption
expr : rate(pg_txid_current[1m]) < 5
for : 2m
labels :
severity : warning
annotations :
summary : Postgresql low XID consumption (instance {{ $labels.instance }})
description : " Postgresql seems to be consuming transaction IDs very slowly \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.12.
Postgresql high rate statement timeout
Postgres transactions showing high rate of statement timeouts
[copy]
- alert : PostgresqlHighRateStatementTimeout
expr : rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
for : 0m
labels :
severity : critical
annotations :
summary : Postgresql high rate statement timeout (instance {{ $labels.instance }})
description : " Postgres transactions showing high rate of statement timeouts \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.13.
Postgresql high rate deadlock
Postgres detected deadlocks
[copy]
- alert : PostgresqlHighRateDeadlock
expr : increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
for : 0m
labels :
severity : critical
annotations :
summary : Postgresql high rate deadlock (instance {{ $labels.instance }})
description : " Postgres detected deadlocks \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.14.
Postgresql unused replication slot
Unused Replication Slots
[copy]
- alert : PostgresqlUnusedReplicationSlot
expr : pg_replication_slots_active == 0
for : 1m
labels :
severity : warning
annotations :
summary : Postgresql unused replication slot (instance {{ $labels.instance }})
description : " Unused Replication Slots \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.15.
Postgresql too many dead tuples
PostgreSQL dead tuples is too large
[copy]
- alert : PostgresqlTooManyDeadTuples
expr : ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1
for : 2m
labels :
severity : warning
annotations :
summary : Postgresql too many dead tuples (instance {{ $labels.instance }})
description : " PostgreSQL dead tuples is too large \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.16.
Postgresql configuration changed
Postgres Database configuration change has occurred
[copy]
# Fires when any pg_settings_* value differs from its value 5 minutes earlier
# (pg_settings_transaction_read_only is excluded by the right-hand regex).
# Series are matched on metric name plus instance/job/server: matching on
# __name__ alone is ambiguous as soon as more than one server is scraped.
# The expression is a block scalar because a plain YAML value may not start with "{".
- alert: PostgresqlConfigurationChanged
  expr: >-
    {__name__=~"pg_settings_.*"}
    != ON(__name__, instance, job, server)
    {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Postgresql configuration changed (instance {{ $labels.instance }})
    description: " Postgres Database configuration change has occurred \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.17.
Postgresql SSL compression active
Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
[copy]
- alert : PostgresqlSslCompressionActive
expr : sum(pg_stat_ssl_compression) > 0
for : 0m
labels :
severity : critical
annotations :
summary : Postgresql SSL compression active (instance {{ $labels.instance }})
description : " Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.18.
Postgresql too many locks acquired
Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
[copy]
# Fires when the locks held on a server exceed 20% of
# max_locks_per_transaction * max_connections for 2 minutes.
# The lock count is summed per (instance, job, server) and joined on those
# labels: a label-less sum() has no labels in common with the pg_settings_*
# series, so the division would return nothing and the alert could never fire.
- alert: PostgresqlTooManyLocksAcquired
  expr: >-
    sum by (instance, job, server) (pg_locks_count)
    / on (instance, job, server)
    (pg_settings_max_locks_per_transaction * pg_settings_max_connections)
    > 0.20
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
    description: " Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.19.
Postgresql bloat index high (> 80%)
The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
[copy]
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert : PostgresqlBloatIndexHigh(>80%)
expr : pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)
for : 1h
labels :
severity : warning
annotations :
summary : Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description : " The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};` \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.20.
Postgresql bloat table high (> 80%)
The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`
[copy]
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert : PostgresqlBloatTableHigh(>80%)
expr : pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)
for : 1h
labels :
severity : warning
annotations :
summary : Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description : " The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};` \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.2.21.
Postgresql invalid index
The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`
[copy]
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert : PostgresqlInvalidIndex
expr : pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}
for : 6h
labels :
severity : warning
annotations :
summary : Postgresql invalid index (instance {{ $labels.instance }})
description : " The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};` \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/sql-server/ozarklake-mssql-exporter.yml
#
2.3.1.
SQL Server down
SQL server instance is down
[copy]
- alert : SqlServerDown
expr : mssql_up == 0
for : 0m
labels :
severity : critical
annotations :
summary : SQL Server down (instance {{ $labels.instance }})
description : " SQL server instance is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.3.2.
SQL Server deadlock
SQL Server is having some deadlock.
[copy]
- alert : SqlServerDeadlock
expr : increase(mssql_deadlocks[1m]) > 5
for : 0m
labels :
severity : warning
annotations :
summary : SQL Server deadlock (instance {{ $labels.instance }})
description : " SQL Server is having some deadlock. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/patroni/embedded-exporter-patroni.yml
#
2.4.1.
Patroni has no Leader
A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}
[copy]
- alert : PatroniHasNoLeader
expr : (max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)
for : 0m
labels :
severity : critical
annotations :
summary : Patroni has no Leader (instance {{ $labels.instance }})
description : " A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml
#
2.5.1.
PGBouncer active connections
PGBouncer pools are filling up
[copy]
- alert : PgbouncerActiveConnections
expr : pgbouncer_pools_server_active_connections > 200
for : 2m
labels :
severity : warning
annotations :
summary : PGBouncer active connections (instance {{ $labels.instance }})
description : " PGBouncer pools are filling up \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.5.2.
PGBouncer errors
PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
[copy]
# Fires when more than 10 PgBouncer errors are counted within one minute,
# ignoring the "server conn crashed?" message.
- alert: PgbouncerErrors
  expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: PGBouncer errors (instance {{ $labels.instance }})
    description: " PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.5.3.
PGBouncer max connections
The number of PGBouncer client connections has reached max_client_conn.
[copy]
# Fires as soon as PgBouncer reports a "no more connections allowed (max_client_conn)" error.
# The range is 1m, matching the PgbouncerErrors rule: increase() needs at least
# two samples inside the range, which a 30s window does not provide unless the
# scrape interval is 15s or shorter.
# NOTE(review): widen the range further if your scrape interval is 1m or more.
- alert: PgbouncerMaxConnections
  expr: increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: PGBouncer max connections (instance {{ $labels.instance }})
    description: " The number of PGBouncer client connections has reached max_client_conn. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/redis/oliver006-redis-exporter.yml
#
2.6.1.
Redis down
Redis instance is down
[copy]
- alert : RedisDown
expr : redis_up == 0
for : 0m
labels :
severity : critical
annotations :
summary : Redis down (instance {{ $labels.instance }})
description : " Redis instance is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.2.
Redis missing master
Redis cluster has no node marked as master.
[copy]
- alert : RedisMissingMaster
expr : (count(redis_instance_info{role="master"}) or vector(0)) < 1
for : 0m
labels :
severity : critical
annotations :
summary : Redis missing master (instance {{ $labels.instance }})
description : " Redis cluster has no node marked as master. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.3.
Redis too many masters
Redis cluster has too many nodes marked as master.
[copy]
- alert : RedisTooManyMasters
expr : count(redis_instance_info{role="master"}) > 1
for : 0m
labels :
severity : critical
annotations :
summary : Redis too many masters (instance {{ $labels.instance }})
description : " Redis cluster has too many nodes marked as master. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.4.
Redis disconnected slaves
Redis not replicating for all slaves. Consider reviewing the redis replication status.
[copy]
- alert : RedisDisconnectedSlaves
expr : count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0
for : 0m
labels :
severity : critical
annotations :
summary : Redis disconnected slaves (instance {{ $labels.instance }})
description : " Redis not replicating for all slaves. Consider reviewing the redis replication status. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.5.
Redis replication broken
Redis instance lost a slave
[copy]
- alert : RedisReplicationBroken
expr : delta(redis_connected_slaves[1m]) < 0
for : 0m
labels :
severity : critical
annotations :
summary : Redis replication broken (instance {{ $labels.instance }})
description : " Redis instance lost a slave \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.6.
Redis cluster flapping
Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
[copy]
- alert : RedisClusterFlapping
expr : changes(redis_connected_slaves[1m]) > 1
for : 2m
labels :
severity : critical
annotations :
summary : Redis cluster flapping (instance {{ $labels.instance }})
description : " Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping). \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.7.
Redis missing backup
Redis has not been backed up for 24 hours
[copy]
# Fires when the last RDB save timestamp is more than 24 hours (60 * 60 * 24 seconds) old.
- alert: RedisMissingBackup
  expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Redis missing backup (instance {{ $labels.instance }})
    description: " Redis has not been backed up for 24 hours \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.8.
Redis out of system memory
Redis is running out of system memory (> 90%)
[copy]
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- alert : RedisOutOfSystemMemory
expr : redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for : 2m
labels :
severity : warning
annotations :
summary : Redis out of system memory (instance {{ $labels.instance }})
description : " Redis is running out of system memory (> 90%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.9.
Redis out of configured maxmemory
Redis is running out of configured maxmemory (> 90%)
[copy]
- alert : RedisOutOfConfiguredMaxmemory
expr : redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0
for : 2m
labels :
severity : warning
annotations :
summary : Redis out of configured maxmemory (instance {{ $labels.instance }})
description : " Redis is running out of configured maxmemory (> 90%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.10.
Redis too many connections
Redis is running out of connections (> 90% used)
[copy]
- alert : RedisTooManyConnections
expr : redis_connected_clients / redis_config_maxclients * 100 > 90
for : 2m
labels :
severity : warning
annotations :
summary : Redis too many connections (instance {{ $labels.instance }})
description : " Redis is running out of connections (> 90% used) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.11.
Redis not enough connections
Redis instance should have more connections (> 5)
[copy]
- alert : RedisNotEnoughConnections
expr : redis_connected_clients < 5
for : 2m
labels :
severity : warning
annotations :
summary : Redis not enough connections (instance {{ $labels.instance }})
description : " Redis instance should have more connections (> 5) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.6.12.
Redis rejected connections
Some connections to Redis have been rejected
[copy]
# Fires when the rejected-connections counter increased at all during the last minute.
- alert: RedisRejectedConnections
  expr: increase(redis_rejected_connections_total[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Redis rejected connections (instance {{ $labels.instance }})
    description: " Some connections to Redis have been rejected \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/mongodb/percona-mongodb-exporter.yml
#
2.7.1.1.
MongoDB Down
MongoDB instance is down
[copy]
- alert : MongodbDown
expr : mongodb_up == 0
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB Down (instance {{ $labels.instance }})
description : " MongoDB instance is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.2.
Mongodb replica member unhealthy
MongoDB replica member is not healthy
[copy]
- alert : MongodbReplicaMemberUnhealthy
expr : mongodb_rs_members_health == 0
for : 0m
labels :
severity : critical
annotations :
summary : Mongodb replica member unhealthy (instance {{ $labels.instance }})
description : " MongoDB replica member is not healthy \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.3.
MongoDB replication lag
Mongodb replication lag is more than 10s
[copy]
- alert : MongodbReplicationLag
expr : (mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication lag (instance {{ $labels.instance }})
description : " Mongodb replication lag is more than 10s \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.4.
MongoDB replication headroom
MongoDB replication headroom is <= 0
[copy]
- alert : MongodbReplicationHeadroom
expr : sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication headroom (instance {{ $labels.instance }})
description : " MongoDB replication headroom is <= 0 \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.5.
MongoDB number cursors open
Too many cursors opened by MongoDB for clients (> 10k)
[copy]
- alert : MongodbNumberCursorsOpen
expr : mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000
for : 2m
labels :
severity : warning
annotations :
summary : MongoDB number cursors open (instance {{ $labels.instance }})
description : " Too many cursors opened by MongoDB for clients (> 10k) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.6.
MongoDB cursors timeouts
Too many cursors are timing out
[copy]
- alert : MongodbCursorsTimeouts
expr : increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100
for : 2m
labels :
severity : warning
annotations :
summary : MongoDB cursors timeouts (instance {{ $labels.instance }})
description : " Too many cursors are timing out \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.1.7.
MongoDB too many connections
Too many connections (> 80%)
[copy]
# Fires when more than 80% of the connection capacity of an instance is in use for 2 minutes.
# The current connection count is compared directly: it is a point-in-time
# value, so rate() over it measures how fast the count changes, not how many
# connections are open.
# Capacity is taken as current + available connections.
# NOTE(review): assumes the exporter exposes conn_type="available" alongside
# conn_type="current" — confirm against your exporter version.
- alert: MongodbTooManyConnections
  expr: >-
    sum by (instance) (mongodb_ss_connections{conn_type="current"})
    / sum by (instance) (mongodb_ss_connections{conn_type=~"current|available"})
    * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: MongoDB too many connections (instance {{ $labels.instance }})
    description: " Too many connections (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.
MongoDB
:
dcu/mongodb_exporter
(10 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/mongodb/dcu-mongodb-exporter.yml
#
2.7.2.1.
MongoDB replication lag
Mongodb replication lag is more than 10s
[copy]
- alert : MongodbReplicationLag
expr : avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication lag (instance {{ $labels.instance }})
description : " Mongodb replication lag is more than 10s \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.2.
MongoDB replication Status 3
MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
[copy]
- alert : MongodbReplicationStatus3
expr : mongodb_replset_member_state == 3
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication Status 3 (instance {{ $labels.instance }})
description : " MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.3.
MongoDB replication Status 6
MongoDB Replication set member as seen from another member of the set, is not yet known
[copy]
- alert : MongodbReplicationStatus6
expr : mongodb_replset_member_state == 6
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication Status 6 (instance {{ $labels.instance }})
description : " MongoDB Replication set member as seen from another member of the set, is not yet known \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.4.
MongoDB replication Status 8
MongoDB Replication set member as seen from another member of the set, is unreachable
[copy]
- alert : MongodbReplicationStatus8
expr : mongodb_replset_member_state == 8
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication Status 8 (instance {{ $labels.instance }})
description : " MongoDB Replication set member as seen from another member of the set, is unreachable \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.5.
MongoDB replication Status 9
MongoDB Replication set member is actively performing a rollback. Data is not available for reads
[copy]
- alert : MongodbReplicationStatus9
expr : mongodb_replset_member_state == 9
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication Status 9 (instance {{ $labels.instance }})
description : " MongoDB Replication set member is actively performing a rollback. Data is not available for reads \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.6.
MongoDB replication Status 10
MongoDB Replication set member was once in a replica set but was subsequently removed
[copy]
- alert : MongodbReplicationStatus10
expr : mongodb_replset_member_state == 10
for : 0m
labels :
severity : critical
annotations :
summary : MongoDB replication Status 10 (instance {{ $labels.instance }})
description : " MongoDB Replication set member was once in a replica set but was subsequently removed \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.7.
MongoDB number cursors open
Too many cursors opened by MongoDB for clients (> 10k)
[copy]
- alert : MongodbNumberCursorsOpen
expr : mongodb_metrics_cursor_open{state="total_open"} > 10000
for : 2m
labels :
severity : warning
annotations :
summary : MongoDB number cursors open (instance {{ $labels.instance }})
description : " Too many cursors opened by MongoDB for clients (> 10k) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.8.
MongoDB cursors timeouts
Too many cursors are timing out
[copy]
- alert : MongodbCursorsTimeouts
expr : increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100
for : 2m
labels :
severity : warning
annotations :
summary : MongoDB cursors timeouts (instance {{ $labels.instance }})
description : " Too many cursors are timing out \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.9.
MongoDB too many connections
Too many connections (> 80%)
[copy]
# Fires when more than 80% of the connection capacity of an instance is in use for 2 minutes.
# The current connection count is compared directly: it is a point-in-time
# value, so rate() over it measures how fast the count changes, not how many
# connections are open.
# Capacity is taken as current + available connections.
# NOTE(review): assumes the exporter exposes state="available" alongside
# state="current" — confirm against your exporter version.
- alert: MongodbTooManyConnections
  expr: >-
    sum by (instance) (mongodb_connections{state="current"})
    / sum by (instance) (mongodb_connections{state=~"current|available"})
    * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: MongoDB too many connections (instance {{ $labels.instance }})
    description: " Too many connections (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.2.10.
MongoDB virtual memory usage
High memory usage
[copy]
- alert : MongodbVirtualMemoryUsage
expr : (sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3
for : 2m
labels :
severity : warning
annotations :
summary : MongoDB virtual memory usage (instance {{ $labels.instance }})
description : " High memory usage \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.7.3.
MongoDB
:
stefanprodan/mgob
(1 rule)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/mongodb/stefanprodan-mgob-exporter.yml
#
2.7.3.1.
Mgob backup failed
MongoDB backup has failed
[copy]
- alert : MgobBackupFailed
expr : changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0
for : 0m
labels :
severity : critical
annotations :
summary : Mgob backup failed (instance {{ $labels.instance }})
description : " MongoDB backup has failed \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/rabbitmq/rabbitmq-exporter.yml
#
2.8.1.1.
RabbitMQ node down
Less than 3 nodes running in RabbitMQ cluster
[copy]
- alert : RabbitmqNodeDown
expr : sum(rabbitmq_build_info) < 3
for : 0m
labels :
severity : critical
annotations :
summary : RabbitMQ node down (instance {{ $labels.instance }})
description : " Less than 3 nodes running in RabbitMQ cluster \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.2.
RabbitMQ node not distributed
Distribution link state is not 'up'
[copy]
- alert : RabbitmqNodeNotDistributed
expr : erlang_vm_dist_node_state < 3
for : 0m
labels :
severity : critical
annotations :
summary : RabbitMQ node not distributed (instance {{ $labels.instance }})
description : " Distribution link state is not 'up' \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.3.
RabbitMQ instances different versions
Running different versions of RabbitMQ in the same cluster can lead to failure.
[copy]
# Fires when rabbitmq_build_info reports more than one distinct rabbitmq_version
# for a full hour (the inner count groups by version, the outer count counts the groups).
- alert: RabbitmqInstancesDifferentVersions
  expr: count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
    description: " Running different versions of RabbitMQ in the same cluster can lead to failure. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.4.
RabbitMQ memory high
A node uses more than 90% of allocated RAM
[copy]
# Fires when a node's resident memory stays above 90% of its resident memory limit for 2 minutes.
- alert: RabbitmqMemoryHigh
  expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ memory high (instance {{ $labels.instance }})
    description: " A node uses more than 90% of allocated RAM \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.5.
RabbitMQ file descriptors usage
A node uses more than 90% of file descriptors
[copy]
# Fires when a node's open file descriptors stay above 90% of its maximum for 2 minutes.
- alert: RabbitmqFileDescriptorsUsage
  expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
    description: " A node uses more than 90% of file descriptors \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.6.
RabbitMQ too many unack messages
Too many unacknowledged messages
[copy]
- alert : RabbitmqTooManyUnackMessages
expr : sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000
for : 1m
labels :
severity : warning
annotations :
summary : RabbitMQ too many unack messages (instance {{ $labels.instance }})
description : " Too many unacknowledged messages \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.7.
RabbitMQ too many connections
The total connections of a node is too high
[copy]
- alert : RabbitmqTooManyConnections
expr : rabbitmq_connections > 1000
for : 2m
labels :
severity : warning
annotations :
summary : RabbitMQ too many connections (instance {{ $labels.instance }})
description : " The total connections of a node is too high \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.8.
RabbitMQ no queue consumer
A queue has less than 1 consumer
[copy]
- alert : RabbitmqNoQueueConsumer
expr : rabbitmq_queue_consumers < 1
for : 1m
labels :
severity : warning
annotations :
summary : RabbitMQ no queue consumer (instance {{ $labels.instance }})
description : " A queue has less than 1 consumer \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.1.9.
RabbitMQ unroutable messages
A queue has unroutable messages
[copy]
- alert : RabbitmqUnroutableMessages
expr : increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0
for : 2m
labels :
severity : warning
annotations :
summary : RabbitMQ unroutable messages (instance {{ $labels.instance }})
description : " A queue has unroutable messages \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
#
2.8.2.1.
RabbitMQ down
RabbitMQ node down
[copy]
- alert : RabbitmqDown
expr : rabbitmq_up == 0
for : 0m
labels :
severity : critical
annotations :
summary : RabbitMQ down (instance {{ $labels.instance }})
description : " RabbitMQ node down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.2.
RabbitMQ cluster down
Less than 3 nodes running in RabbitMQ cluster
[copy]
- alert : RabbitmqClusterDown
expr : sum(rabbitmq_running) < 3
for : 0m
labels :
severity : critical
annotations :
summary : RabbitMQ cluster down (instance {{ $labels.instance }})
description : " Less than 3 nodes running in RabbitMQ cluster \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.3.
RabbitMQ cluster partition
Cluster partition
[copy]
- alert : RabbitmqClusterPartition
expr : rabbitmq_partitions > 0
for : 0m
labels :
severity : critical
annotations :
summary : RabbitMQ cluster partition (instance {{ $labels.instance }})
description : " Cluster partition \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.4.
RabbitMQ out of memory
Memory available for RabbitMQ is low (< 10%)
[copy]
# Warns when used memory exceeds 90% of the node memory limit for two
# minutes (i.e. less than 10% of the limit remains).
- alert: RabbitmqOutOfMemory
  expr: rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ out of memory (instance {{ $labels.instance }})
    description: " Memory available for RabbitMQ is low (< 10%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.5.
RabbitMQ too many connections
RabbitMQ instance has too many connections (> 1000)
[copy]
# Warns when rabbitmq_connectionsTotal stays above 1000 for two minutes.
- alert: RabbitmqTooManyConnections
  expr: rabbitmq_connectionsTotal > 1000
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ too many connections (instance {{ $labels.instance }})
    description: " RabbitMQ instance has too many connections (> 1000) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.6.
RabbitMQ dead letter queue filling up
Dead letter queue is filling up (> 10 msgs)
[copy]
# Replace the placeholder in the `queue` matcher with your dead letter queue.
# Warns when that queue holds more than 10 messages for one minute.
- alert: RabbitmqDeadLetterQueueFillingUp
  expr: rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
    description: " Dead letter queue is filling up (> 10 msgs) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.7.
RabbitMQ too many messages in queue
Queue is filling up (> 1000 msgs)
[copy]
# Replace the placeholder in the `queue` matcher with the queue to watch.
# Warns when more than 1000 messages are ready in it for two minutes.
- alert: RabbitmqTooManyMessagesInQueue
  expr: rabbitmq_queue_messages_ready{queue="my-queue"} > 1000
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
    description: " Queue is filling up (> 1000 msgs) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.8.
RabbitMQ slow queue consuming
Queue messages are consumed slowly (> 60s)
[copy]
# Replace the placeholder in the `queue` matcher with the queue to watch.
# Warns when the head message timestamp lags the current time by more than
# 60 seconds for two minutes.
- alert: RabbitmqSlowQueueConsuming
  expr: time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
    description: " Queue messages are consumed slowly (> 60s) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.9.
RabbitMQ no consumer
Queue has no consumer
[copy]
# Critical when any queue reports exactly zero consumers for one minute.
- alert: RabbitmqNoConsumer
  expr: rabbitmq_queue_consumers == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: RabbitMQ no consumer (instance {{ $labels.instance }})
    description: " Queue has no consumer \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.10.
RabbitMQ too many consumers
Queue should have only 1 consumer
[copy]
# Replace the placeholder in the `queue` matcher with the queue to watch.
# Critical immediately when that queue has more than one consumer.
- alert: RabbitmqTooManyConsumers
  expr: rabbitmq_queue_consumers{queue="my-queue"} > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
    description: " Queue should have only 1 consumer \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.8.2.11.
RabbitMQ unactive exchange
Exchange receives less than 5 msgs per second
[copy]
# Replace the placeholder in the `exchange` matcher with the exchange to watch.
# Warns when its 1-minute publish-in rate stays under 5 per second for 2m.
- alert: RabbitmqUnactiveExchange
  expr: rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
    description: " Exchange receive less than 5 msgs per second \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
#
2.9.1.
Elasticsearch Heap Usage Too High
The heap usage is over 90%
[copy]
# Critical when used heap exceeds 90% of max heap for two minutes.
- alert: ElasticsearchHeapUsageTooHigh
  expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})
    description: " The heap usage is over 90% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.2.
Elasticsearch Heap Usage warning
The heap usage is over 80%
[copy]
# Warns when used heap exceeds 80% of max heap for two minutes.
- alert: ElasticsearchHeapUsageWarning
  expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch Heap Usage warning (instance {{ $labels.instance }})
    description: " The heap usage is over 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.3.
Elasticsearch disk out of space
The disk usage is over 90%
[copy]
# Critical immediately when available data-filesystem space falls below 10%
# of its size (equivalently, usage above 90%).
- alert: ElasticsearchDiskOutOfSpace
  expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch disk out of space (instance {{ $labels.instance }})
    description: " The disk usage is over 90% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.4.
Elasticsearch disk space low
The disk usage is over 80%
[copy]
# Warns when available data-filesystem space stays below 20% of its size
# for two minutes (equivalently, usage above 80%).
- alert: ElasticsearchDiskSpaceLow
  expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch disk space low (instance {{ $labels.instance }})
    description: " The disk usage is over 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.5.
Elasticsearch Cluster Red
Elastic Cluster Red status
[copy]
# Critical as soon as the health-status series with color="red" equals 1.
- alert: ElasticsearchClusterRed
  expr: elasticsearch_cluster_health_status{color="red"} == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
    description: " Elastic Cluster Red status \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.6.
Elasticsearch Cluster Yellow
Elastic Cluster Yellow status
[copy]
# Warns as soon as the health-status series with color="yellow" equals 1.
- alert: ElasticsearchClusterYellow
  expr: elasticsearch_cluster_health_status{color="yellow"} == 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
    description: " Elastic Cluster Yellow status \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.7.
Elasticsearch Healthy Nodes
Missing node in Elasticsearch cluster
[copy]
# Critical when the reported node count drops below 3; adjust the threshold
# to the expected size of your cluster.
- alert: ElasticsearchHealthyNodes
  expr: elasticsearch_cluster_health_number_of_nodes < 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
    description: " Missing node in Elasticsearch cluster \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.8.
Elasticsearch Healthy Data Nodes
Missing data node in Elasticsearch cluster
[copy]
# Critical when the reported data-node count drops below 3; adjust the
# threshold to the expected number of data nodes.
- alert: ElasticsearchHealthyDataNodes
  expr: elasticsearch_cluster_health_number_of_data_nodes < 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})
    description: " Missing data node in Elasticsearch cluster \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.9.
Elasticsearch relocating shards
Elasticsearch is relocating shards
[copy]
# Informational: fires immediately while any shard is relocating.
- alert: ElasticsearchRelocatingShards
  expr: elasticsearch_cluster_health_relocating_shards > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
    description: " Elasticsearch is relocating shards \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.10.
Elasticsearch relocating shards too long
Elasticsearch has been relocating shards for 15min
[copy]
# Warns when shards have been relocating continuously for 15 minutes.
- alert: ElasticsearchRelocatingShardsTooLong
  expr: elasticsearch_cluster_health_relocating_shards > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch relocating shards too long (instance {{ $labels.instance }})
    description: " Elasticsearch has been relocating shards for 15min \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.11.
Elasticsearch initializing shards
Elasticsearch is initializing shards
[copy]
# Informational: fires immediately while any shard is initializing.
- alert: ElasticsearchInitializingShards
  expr: elasticsearch_cluster_health_initializing_shards > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Elasticsearch initializing shards (instance {{ $labels.instance }})
    description: " Elasticsearch is initializing shards \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.12.
Elasticsearch initializing shards too long
Elasticsearch has been initializing shards for 15 min
[copy]
# Warns when shards have been initializing continuously for 15 minutes.
- alert: ElasticsearchInitializingShardsTooLong
  expr: elasticsearch_cluster_health_initializing_shards > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch initializing shards too long (instance {{ $labels.instance }})
    description: " Elasticsearch has been initializing shards for 15 min \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.13.
Elasticsearch unassigned shards
Elasticsearch has unassigned shards
[copy]
# Critical immediately when any shard is unassigned.
- alert: ElasticsearchUnassignedShards
  expr: elasticsearch_cluster_health_unassigned_shards > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
    description: " Elasticsearch has unassigned shards \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.14.
Elasticsearch pending tasks
Elasticsearch has pending tasks. Cluster works slowly.
[copy]
# Warns when the pending-task count has been above zero for 15 minutes.
- alert: ElasticsearchPendingTasks
  expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
    description: " Elasticsearch has pending tasks. Cluster works slowly. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.15.
Elasticsearch no new documents
No new documents for 10 min!
[copy]
# Warns when the indexing counter on data nodes (es_data_node="true") grew
# by less than 1 over the last 10 minutes.
- alert: ElasticsearchNoNewDocuments
  expr: increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch no new documents (instance {{ $labels.instance }})
    description: " No new documents for 10 min! \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.16.
Elasticsearch High Indexing Latency
The indexing latency on Elasticsearch cluster is higher than the threshold.
[copy]
# Warns when the mean time per indexing operation exceeds 0.0005s for 10m.
# Both operands are cumulative totals, so the ratio is taken over their
# 5-minute rates: dividing the raw counters would yield an average since
# process start that hardly moves when latency degrades.
# When no documents are indexed the ratio is NaN and the alert stays silent.
- alert: ElasticsearchHighIndexingLatency
  expr: rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.0005
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
    description: " The indexing latency on Elasticsearch cluster is higher than the threshold. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.17.
Elasticsearch High Indexing Rate
The indexing rate on Elasticsearch cluster is higher than the threshold.
[copy]
# Warns when the summed 1-minute indexing rate stays above 10000/s for 5m.
# NOTE(review): sum() without a by-clause discards every label, so the
# instance label referenced in the summary will render empty.
- alert: ElasticsearchHighIndexingRate
  expr: sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
    description: " The indexing rate on Elasticsearch cluster is higher than the threshold. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.18.
Elasticsearch High Query Rate
The query rate on Elasticsearch cluster is higher than the threshold.
[copy]
# Warns when the summed 1-minute search-query rate stays above 100/s for 5m.
# NOTE(review): sum() without a by-clause discards every label, so the
# instance label referenced in the summary will render empty.
- alert: ElasticsearchHighQueryRate
  expr: sum(rate(elasticsearch_indices_search_query_total[1m])) > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
    description: " The query rate on Elasticsearch cluster is higher than the threshold. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.9.19.
Elasticsearch High Query Latency
The query latency on Elasticsearch cluster is higher than the threshold.
[copy]
# Warns when the mean time per search fetch exceeds 1s for five minutes.
# The ratio is taken over 5-minute rates of the two cumulative series;
# dividing the raw values would yield an average since process start that
# hardly moves when latency degrades.
# NOTE(review): this measures the fetch phase only. If query-phase latency
# is intended, the exporter presumably exposes matching
# elasticsearch_indices_search_query_* series - confirm before switching.
- alert: ElasticsearchHighQueryLatency
  expr: rate(elasticsearch_indices_search_fetch_time_seconds[5m]) / rate(elasticsearch_indices_search_fetch_total[5m]) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
    description: " The query latency on Elasticsearch cluster is higher than the threshold. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.10.
Meilisearch
:
Embedded exporter
(2 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/meilisearch/embedded-exporter.yml
#
2.10.1.
Meilisearch index is empty
Meilisearch index contains no documents
[copy]
# Warns as soon as an index reports a document count of zero.
# The description states the empty-index condition the expression detects
# (it does not check whether the instance is up).
- alert: MeilisearchIndexIsEmpty
  expr: meilisearch_index_docs_count == 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Meilisearch index is empty (instance {{ $labels.instance }})
    description: " Meilisearch index contains no documents \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.10.2.
Meilisearch http response time
Meilisearch http response time is too high
[copy]
# Warns immediately when the reported HTTP response time exceeds 0.5s.
- alert: MeilisearchHttpResponseTime
  expr: meilisearch_http_response_time_seconds > 0.5
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Meilisearch http response time (instance {{ $labels.instance }})
    description: " Meilisearch http response time is too high \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/cassandra/instaclustr-cassandra-exporter.yml
#
2.11.1.1.
Cassandra Node is unavailable
Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}
[copy]
# Critical when the active-endpoint sum, grouped by cluster, instance and
# exported endpoint, falls below 1.
- alert: CassandraNodeIsUnavailable
  expr: sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra Node is unavailable (instance {{ $labels.instance }})
    description: " Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.2.
Cassandra many compaction tasks are pending
Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}
[copy]
# Warns immediately when a table's estimated pending compactions exceed 100.
- alert: CassandraManyCompactionTasksArePending
  expr: cassandra_table_estimated_pending_compactions > 100
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }})
    description: " Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.3.
Cassandra commitlog pending tasks
Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}
[copy]
# Warns when commit log pending tasks stay above 15 for two minutes.
- alert: CassandraCommitlogPendingTasks
  expr: cassandra_commit_log_pending_tasks > 15
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
    description: " Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.4.
Cassandra compaction executor blocked tasks
Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}
[copy]
# Warns when blocked tasks in the CompactionExecutor pool stay above 15
# for two minutes.
- alert: CassandraCompactionExecutorBlockedTasks
  expr: cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
    description: " Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.5.
Cassandra flush writer blocked tasks
Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}
[copy]
# Warns when blocked tasks in the MemtableFlushWriter pool stay above 15
# for two minutes.
- alert: CassandraFlushWriterBlockedTasks
  expr: cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
    description: " Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.6.
Cassandra connection timeouts total
Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the per-cluster, per-instance average of the client request
# timeout total stays above 5 for two minutes.
# NOTE(review): the expression compares the raw total rather than a rate;
# if the metric is a cumulative counter this never clears once exceeded -
# confirm intent.
- alert: CassandraConnectionTimeoutsTotal
  expr: avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
    description: " Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.7.
Cassandra storage exceptions
Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the storage exception total changed more than once within
# the last minute.
- alert: CassandraStorageExceptions
  expr: changes(cassandra_storage_exceptions_total[1m]) > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra storage exceptions (instance {{ $labels.instance }})
    description: " Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.8.
Cassandra tombstone dump
Cassandra tombstone dump - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the 0.99-quantile of tombstones scanned, averaged per
# instance, cluster and keyspace, stays above 100 for two minutes.
- alert: CassandraTombstoneDump
  expr: avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra tombstone dump (instance {{ $labels.instance }})
    description: " Cassandra tombstone dump - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.9.
Cassandra client request unavailable write
Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the write-side unavailable-exception total keeps changing
# within 1-minute windows for two minutes.
- alert: CassandraClientRequestUnavailableWrite
  expr: changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
    description: " Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.10.
Cassandra client request unavailable read
Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the read-side unavailable-exception total keeps changing
# within 1-minute windows for two minutes.
- alert: CassandraClientRequestUnavailableRead
  expr: changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
    description: " Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.11.
Cassandra client request write failure
Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the write failure total keeps increasing within 1-minute
# windows for two minutes. The expression filters on operation="write",
# so the description reports write failures.
- alert: CassandraClientRequestWriteFailure
  expr: increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request write failure (instance {{ $labels.instance }})
    description: " Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.1.12.
Cassandra client request read failure
Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}
[copy]
# Critical when the read failure total keeps increasing within 1-minute
# windows for two minutes.
- alert: CassandraClientRequestReadFailure
  expr: increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request read failure (instance {{ $labels.instance }})
    description: " Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/cassandra/criteo-cassandra-exporter.yml
#
2.11.2.1.
Cassandra hints count
Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
[copy]
# Critical when the total-hints count changed more than 3 times within the
# last minute.
- alert: CassandraHintsCount
  expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra hints count (instance {{ $labels.instance }})
    description: " Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.2.
Cassandra compaction task pending
Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
[copy]
# Warns when the 1-minute average of pending compaction tasks stays above
# 100 for two minutes.
- alert: CassandraCompactionTaskPending
  expr: avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra compaction task pending (instance {{ $labels.instance }})
    description: " Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.3.
Cassandra viewwrite latency
High viewwrite latency on {{ $labels.instance }} cassandra node
[copy]
# Warns when the 99th-percentile view-write latency for service="cas"
# stays above 100000 for two minutes.
- alert: CassandraViewwriteLatency
  expr: cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
    description: " High viewwrite latency on {{ $labels.instance }} cassandra node \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.4.
Cassandra bad hacker
Increase of Cassandra authentication failures
[copy]
# Warns when the 1-minute rate of authentication failures stays above 5/s
# for two minutes.
- alert: CassandraBadHacker
  expr: rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra bad hacker (instance {{ $labels.instance }})
    description: " Increase of Cassandra authentication failures \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.5.
Cassandra node down
Cassandra node down
[copy]
# Critical when the failure detector's down-endpoint count, summed per
# service, group, cluster and env, is above zero.
# NOTE(review): `instance` is not in the by-clause, so the instance label
# referenced in the summary will render empty.
- alert: CassandraNodeDown
  expr: sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra node down (instance {{ $labels.instance }})
    description: " Cassandra node down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.6.
Cassandra commitlog pending tasks
Unexpected number of Cassandra commitlog pending tasks
[copy]
# Warns when commit log pending tasks stay above 15 for two minutes.
- alert: CassandraCommitlogPendingTasks
  expr: cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
    description: " Unexpected number of Cassandra commitlog pending tasks \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.7.
Cassandra compaction executor blocked tasks
Some Cassandra compaction executor tasks are blocked
[copy]
# Warns when the compaction executor reports any currently blocked task
# for two minutes.
- alert: CassandraCompactionExecutorBlockedTasks
  expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
    description: " Some Cassandra compaction executor tasks are blocked \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.8.
Cassandra flush writer blocked tasks
Some Cassandra flush writer tasks are blocked
[copy]
# Warns when the memtable flush writer reports any currently blocked task
# for two minutes.
- alert: CassandraFlushWriterBlockedTasks
  expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
    description: " Some Cassandra flush writer tasks are blocked \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.9.
Cassandra repair pending tasks
Some Cassandra repair tasks are pending
[copy]
# Warns when the anti-entropy stage has more than 2 pending tasks for two
# minutes.
- alert: CassandraRepairPendingTasks
  expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra repair pending tasks (instance {{ $labels.instance }})
    description: " Some Cassandra repair tasks are pending \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.10.
Cassandra repair blocked tasks
Some Cassandra repair tasks are blocked
[copy]
# Warns when the anti-entropy stage reports any currently blocked task for
# two minutes.
- alert: CassandraRepairBlockedTasks
  expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Cassandra repair blocked tasks (instance {{ $labels.instance }})
    description: " Some Cassandra repair tasks are blocked \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.11.
Cassandra connection timeouts total
Some connection between nodes are ending in timeout
[copy]
# Critical when the 1-minute rate of connection timeouts stays above 5/s
# for two minutes.
- alert: CassandraConnectionTimeoutsTotal
  expr: rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
    description: " Some connection between nodes are ending in timeout \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.12.
Cassandra storage exceptions
Something is going wrong with cassandra storage
[copy]
# Critical when the storage exception count changed more than once within
# the last minute.
- alert: CassandraStorageExceptions
  expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra storage exceptions (instance {{ $labels.instance }})
    description: " Something is going wrong with cassandra storage \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.13.
Cassandra tombstone dump
Too many tombstones scanned in queries
[copy]
# Critical immediately when the 99th percentile of the tombstone-scanned
# histogram exceeds 1000.
- alert: CassandraTombstoneDump
  expr: cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra tombstone dump (instance {{ $labels.instance }})
    description: " Too much tombstones scanned in queries \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.14.
Cassandra client request unavailable write
Write failures have occurred because too many nodes are unavailable
[copy]
# Critical immediately when the write "unavailables" count changed within
# the last minute.
- alert: CassandraClientRequestUnavailableWrite
  expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
    description: " Write failures have occurred because too many nodes are unavailable \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.15.
Cassandra client request unavailable read
Read failures have occurred because too many nodes are unavailable
[copy]
# Critical immediately when the read "unavailables" count changed within
# the last minute.
- alert: CassandraClientRequestUnavailableRead
  expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
    description: " Read failures have occurred because too many nodes are unavailable \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.16.
Cassandra client request write failure
A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
[copy]
# Critical immediately when increase() over the write-failure
# one-minute-rate series is positive for the last minute.
# NOTE(review): the series name ends in "oneminuterate", which looks like
# an already-computed rate; increase() is meant for counters - confirm.
- alert: CassandraClientRequestWriteFailure
  expr: increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request write failure (instance {{ $labels.instance }})
    description: " A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.17.
Cassandra client request read failure
A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
[copy]
# Critical immediately when increase() over the read-failure
# one-minute-rate series is positive for the last minute.
# NOTE(review): the series name ends in "oneminuterate", which looks like
# an already-computed rate; increase() is meant for counters - confirm.
- alert: CassandraClientRequestReadFailure
  expr: increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Cassandra client request read failure (instance {{ $labels.instance }})
    description: " A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.11.2.18.
Cassandra cache hit rate key cache
Key cache hit rate is below 85%
[copy]
# Critical when the key cache hit rate stays below 0.85 for two minutes.
- alert: CassandraCacheHitRateKeyCache
  expr: cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }})
    description: " Key cache hit rate is below 85% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.
Clickhouse
:
Embedded Exporter
(14 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/clickhouse/embedded-exporter.yml
#
2.12.1.
ClickHouse Memory Usage Critical
Memory usage is critically high, over 90%.
[copy]
# Critical when cgroup memory used exceeds 90% of cgroup memory total for
# five minutes.
- alert: ClickhouseMemoryUsageCritical
  expr: ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
    description: " Memory usage is critically high, over 90%. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.2.
ClickHouse Memory Usage Warning
Memory usage is over 80%.
[copy]
# Warns when cgroup memory used exceeds 80% of cgroup memory total for
# five minutes.
- alert: ClickhouseMemoryUsageWarning
  expr: ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
    description: " Memory usage is over 80%. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.3.
ClickHouse Disk Space Low on Default
Disk space on default is below 20%.
[copy]
# Warns when the available share of the "default" disk (available divided
# by available plus used) stays below 20% for two minutes.
- alert: ClickhouseDiskSpaceLowOnDefault
  expr: ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
    description: " Disk space on default is below 20%. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.4.
ClickHouse Disk Space Critical on Default
Disk space on default disk is critically low, below 10%.
[copy]
# Critical when the available share of the "default" disk (available
# divided by available plus used) stays below 10% for two minutes.
- alert: ClickhouseDiskSpaceCriticalOnDefault
  expr: ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
    description: " Disk space on default disk is critically low, below 10%. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.5.
ClickHouse Disk Space Low on Backups
Disk space on backups is below 20%.
[copy]
# Warning when available / (available + used) on the "backups" disk stays below 20% for 2 minutes.
- alert: ClickhouseDiskSpaceLowOnBackups
  expr: ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
    description: " Disk space on backups is below 20%. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.6.
ClickHouse Replica Errors
Critical replica errors detected, either all replicas are stale or lost.
[copy]
# Fires immediately when either the ALL_REPLICAS_ARE_STALE or the ALL_REPLICAS_LOST
# error metric equals 1.
# NOTE(review): if these metrics are cumulative error counts, `== 1` only matches
# the first occurrence - confirm against the exporter.
- alert: ClickhouseReplicaErrors
  expr: ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
    description: " Critical replica errors detected, either all replicas are stale or lost. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.7.
ClickHouse No Available Replicas
No available replicas in ClickHouse.
[copy]
# Fires immediately when the NO_AVAILABLE_REPLICA error metric equals 1.
# NOTE(review): if this metric is a cumulative error count, `== 1` only matches
# the first occurrence - confirm against the exporter.
- alert: ClickhouseNoAvailableReplicas
  expr: ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
    description: " No available replicas in ClickHouse. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.8.
ClickHouse No Live Replicas
There are too few live replicas available, risking data loss and service disruption.
[copy]
# Fires immediately when the TOO_FEW_LIVE_REPLICAS error metric equals 1.
# NOTE(review): if this metric is a cumulative error count, `== 1` only matches
# the first occurrence - confirm against the exporter.
- alert: ClickhouseNoLiveReplicas
  expr: ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
    description: " There are too few live replicas available, risking data loss and service disruption. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.9.
ClickHouse High Network Traffic
Network traffic is unusually high, may affect cluster performance.
[copy]
# Please replace the threshold with an appropriate value
# Warning when either NetworkSend or NetworkReceive stays above 250 for 5 minutes.
- alert: ClickhouseHighNetworkTraffic
  expr: ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
    description: " Network traffic is unusually high, may affect cluster performance. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.10.
ClickHouse High TCP Connections
High number of TCP connections, indicating heavy client or inter-cluster communication.
[copy]
# Please replace the threshold with an appropriate value
# Warning when TCPConnection stays above 400 for 5 minutes.
- alert: ClickhouseHighTcpConnections
  expr: ClickHouseMetrics_TCPConnection > 400
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
    description: " High number of TCP connections, indicating heavy client or inter-cluster communication. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.11.
ClickHouse Interserver Connection Issues
An increase in interserver connections may indicate replication or distributed query handling issues.
[copy]
# Warning when the number of interserver connections has grown over the last
# 5 minutes and that condition holds for 1 minute.
# ClickHouseMetrics_* series are current-value gauges, so delta() is used here:
# increase() is meant for counters and would treat every drop in the connection
# count as a counter reset, reporting growth that never happened.
- alert: ClickhouseInterserverConnectionIssues
  expr: delta(ClickHouseMetrics_InterserverConnection[5m]) > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
    description: " An increase in interserver connections may indicate replication or distributed query handling issues. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.12.
ClickHouse ZooKeeper Connection Issues
ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.
[copy]
# Warning when the average ZooKeeperSession value across all series differs from 1
# for 3 minutes. avg() without a grouping clause drops every label, so
# {{ $labels.instance }} renders empty for this alert.
- alert: ClickhouseZookeeperConnectionIssues
  expr: avg(ClickHouseMetrics_ZooKeeperSession) != 1
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
    description: " ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.13.
ClickHouse Authentication Failures
Authentication failures detected, indicating potential security issues or misconfiguration.
[copy]
# Informational: fires immediately when AUTHENTICATION_FAILED has increased within the last 5 minutes.
- alert: ClickhouseAuthenticationFailures
  expr: increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
    description: " Authentication failures detected, indicating potential security issues or misconfiguration. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.12.14.
ClickHouse Access Denied Errors
Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.
[copy]
# Informational: fires immediately when RESOURCE_ACCESS_DENIED has increased within the last 5 minutes.
- alert: ClickhouseAccessDeniedErrors
  expr: increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
    description: " Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
// @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml
#
2.13.2.1.
Zookeeper Down
Zookeeper down on instance {{ $labels.instance }}
[copy]
# Critical as soon as zk_up reports 0 for an instance.
- alert: ZookeeperDown
  expr: zk_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Zookeeper Down (instance {{ $labels.instance }})
    description: " Zookeeper down on instance {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.13.2.2.
Zookeeper missing leader
Zookeeper cluster has no node marked as leader
[copy]
# Critical when zk_server_leader summed over every series is 0.
# sum() without grouping drops all labels, so {{ $labels.instance }} renders empty.
- alert: ZookeeperMissingLeader
  expr: sum(zk_server_leader) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Zookeeper missing leader (instance {{ $labels.instance }})
    description: " Zookeeper cluster has no node marked as leader \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.13.2.3.
Zookeeper Too Many Leaders
Zookeeper cluster has too many nodes marked as leader
[copy]
# Critical when zk_server_leader summed over every series exceeds 1.
# sum() without grouping drops all labels, so {{ $labels.instance }} renders empty.
- alert: ZookeeperTooManyLeaders
  expr: sum(zk_server_leader) > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Zookeeper Too Many Leaders (instance {{ $labels.instance }})
    description: " Zookeeper cluster has too many nodes marked as leader \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.13.2.4.
Zookeeper Not Ok
Zookeeper instance is not ok
[copy]
# Warning when zk_ruok stays at 0 for 3 minutes.
- alert: ZookeeperNotOk
  expr: zk_ruok == 0
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Zookeeper Not Ok (instance {{ $labels.instance }})
    description: " Zookeeper instance is not ok \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/kafka/danielqsj-kafka-exporter.yml
#
2.14.1.1.
Kafka topics replicas
Kafka topic in-sync partition
[copy]
# Critical when the in-sync replica count summed per topic is below 3.
# Only the `topic` label survives the aggregation.
- alert: KafkaTopicsReplicas
  expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Kafka topics replicas (instance {{ $labels.instance }})
    description: " Kafka topic in-sync partition \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.14.1.2.
Kafka consumers group
Kafka consumers group
[copy]
# Critical when the lag summed per consumer group stays above 50 for 1 minute.
# Only the `consumergroup` label survives the aggregation.
- alert: KafkaConsumersGroup
  expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Kafka consumers group (instance {{ $labels.instance }})
    description: " Kafka consumers group \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.14.2.
Kafka
:
linkedin/Burrow
(2 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/kafka/linkedin-kafka-exporter.yml
#
2.14.2.1.
Kafka topic offset decreased
Kafka topic offset has decreased
[copy]
# Warning as soon as a partition's current offset is lower than it was one minute earlier.
- alert: KafkaTopicOffsetDecreased
  expr: delta(kafka_burrow_partition_current_offset[1m]) < 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Kafka topic offset decreased (instance {{ $labels.instance }})
    description: " Kafka topic offset has decreased \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.14.2.2.
Kafka consumer lag
Kafka consumer has a 30 minutes and increasing lag
[copy]
# Lag is computed as topic offset minus consumer current offset, matched on
# (partition, cluster, topic). Warning when that lag is at least what it was
# 15 minutes ago AND is still positive, with the condition held for 15 minutes.
- alert: KafkaConsumerLag
  expr: kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Kafka consumer lag (instance {{ $labels.instance }})
    description: " Kafka consumer has a 30 minutes and increasing lag \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.
Pulsar
:
embedded exporter
(10 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/pulsar/embedded-exporter.yml
#
2.15.1.
Pulsar subscription high number of backlog entries
The number of subscription backlog entries is over 5k
[copy]
# Warning when the backlog summed per subscription stays above 5000 entries for 1 hour.
- alert: PulsarSubscriptionHighNumberOfBacklogEntries
  expr: sum(pulsar_subscription_back_log) by (subscription) > 5000
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Pulsar subscription high number of backlog entries (instance {{ $labels.instance }})
    description: " The number of subscription backlog entries is over 5k \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.2.
Pulsar subscription very high number of backlog entries
The number of subscription backlog entries is over 100k
[copy]
# Critical when the backlog summed per subscription stays above 100000 entries for 1 hour.
- alert: PulsarSubscriptionVeryHighNumberOfBacklogEntries
  expr: sum(pulsar_subscription_back_log) by (subscription) > 100000
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Pulsar subscription very high number of backlog entries (instance {{ $labels.instance }})
    description: " The number of subscription backlog entries is over 100k \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.3.
Pulsar topic large backlog storage size
The topic backlog storage size is over 5 GB
[copy]
# Warning when, for 1 hour, at least one pulsar_storage_size series of a topic exceeds 5 GiB.
# The filter is applied per series before the per-topic sum.
- alert: PulsarTopicLargeBacklogStorageSize
  expr: sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Pulsar topic large backlog storage size (instance {{ $labels.instance }})
    description: " The topic backlog storage size is over 5 GB \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.4.
Pulsar topic very large backlog storage size
The topic backlog storage size is over 20 GB
[copy]
# Critical when, for 1 hour, at least one pulsar_storage_size series of a topic exceeds 20 GiB.
# The filter is applied per series before the per-topic sum.
- alert: PulsarTopicVeryLargeBacklogStorageSize
  expr: sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Pulsar topic very large backlog storage size (instance {{ $labels.instance }})
    description: " The topic backlog storage size is over 20 GB \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.5.
Pulsar high write latency
Messages cannot be written in a timely fashion
[copy]
# Critical when a topic has a non-zero write-latency overflow value for 1 hour.
- alert: PulsarHighWriteLatency
  expr: sum(pulsar_storage_write_latency_overflow > 0) by (topic)
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Pulsar high write latency (instance {{ $labels.instance }})
    description: " Messages cannot be written in a timely fashion \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.6.
Pulsar large message payload
Observing large message payload (> 1MB)
[copy]
# Warning when a topic has a non-zero entry-size overflow value for 1 hour.
- alert: PulsarLargeMessagePayload
  expr: sum(pulsar_entry_size_overflow > 0) by (topic)
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Pulsar large message payload (instance {{ $labels.instance }})
    description: " Observing large message payload (> 1MB) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.7.
Pulsar high ledger disk usage
Observing Ledger Disk Usage (> 75%)
[copy]
# Critical when ledger directory usage summed per pod stays above 75 for 1 hour.
# Only the `kubernetes_pod_name` label survives the aggregation.
- alert: PulsarHighLedgerDiskUsage
  expr: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75
  for: 1h
  labels:
    severity: critical
  annotations:
    summary: Pulsar high ledger disk usage (instance {{ $labels.instance }})
    description: " Observing Ledger Disk Usage (> 75%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.8.
Pulsar read only bookies
Observing Readonly Bookies
[copy]
# Critical when, for 5 minutes, a pod has at least one bookie_SERVER_STATUS series equal to 0.
- alert: PulsarReadOnlyBookies
  expr: count(bookie_SERVER_STATUS{} == 0) by (pod)
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Pulsar read only bookies (instance {{ $labels.instance }})
    description: " Observing Readonly Bookies \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.9.
Pulsar high number of function errors
Observing more than 10 Function errors per minute
[copy]
# Critical when a function accumulates more than 10 user + system exceptions
# within one minute, held for 1 minute.
# increase(...[1m]) yields errors per minute, matching the documented threshold;
# rate() would yield errors per *second*, i.e. a threshold of 600 per minute.
- alert: PulsarHighNumberOfFunctionErrors
  expr: sum((increase(pulsar_function_user_exceptions_total{}[1m]) + increase(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Pulsar high number of function errors (instance {{ $labels.instance }})
    description: " Observing more than 10 Function errors per minute \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.15.10.
Pulsar high number of sink errors
Observing more than 10 Sink errors per minute
[copy]
# Critical when a sink accumulates more than 10 exceptions within one minute,
# held for 1 minute.
# increase(...[1m]) yields errors per minute, matching the documented threshold;
# rate() would yield errors per *second*, i.e. a threshold of 600 per minute.
- alert: PulsarHighNumberOfSinkErrors
  expr: sum(increase(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Pulsar high number of sink errors (instance {{ $labels.instance }})
    description: " Observing more than 10 Sink errors per minute \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/nats/nats-exporter.yml
#
2.16.1.
Nats high connection count
High number of NATS connections ({{ $value }}) for {{ $labels.instance }}
[copy]
# Warning when gnatsd_varz_connections stays above 100 for 3 minutes.
- alert: NatsHighConnectionCount
  expr: gnatsd_varz_connections > 100
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Nats high connection count (instance {{ $labels.instance }})
    description: " High number of NATS connections ({{ $value }}) for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.2.
Nats high pending bytes
High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}
[copy]
# Warning when gnatsd_connz_pending_bytes stays above 100000 for 3 minutes.
# NOTE(review): a second rule named NatsHighPendingBytes with the same expr and
# `for: 5m` exists later in this section.
- alert: NatsHighPendingBytes
  expr: gnatsd_connz_pending_bytes > 100000
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Nats high pending bytes (instance {{ $labels.instance }})
    description: " High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.3.
Nats high subscriptions count
High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}
[copy]
# Warning when gnatsd_connz_subscriptions stays above 50 for 3 minutes.
- alert: NatsHighSubscriptionsCount
  expr: gnatsd_connz_subscriptions > 50
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Nats high subscriptions count (instance {{ $labels.instance }})
    description: " High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.4.
Nats high routes count
High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
[copy]
# Warning when gnatsd_varz_routes stays above 10 for 3 minutes.
- alert: NatsHighRoutesCount
  expr: gnatsd_varz_routes > 10
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: Nats high routes count (instance {{ $labels.instance }})
    description: " High number of NATS routes ({{ $value }}) for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.5.
Nats high memory usage
NATS server memory usage is above 200MB for {{ $labels.instance }}
[copy]
# Warning when gnatsd_varz_mem stays above 200 * 1024 * 1024 (200 MiB) for 5 minutes.
- alert: NatsHighMemoryUsage
  expr: gnatsd_varz_mem > 200 * 1024 * 1024
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high memory usage (instance {{ $labels.instance }})
    description: " NATS server memory usage is above 200MB for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.6.
Nats slow consumers
There are slow consumers in NATS for {{ $labels.instance }}
[copy]
# Critical when gnatsd_varz_slow_consumers stays above 0 for 3 minutes.
- alert: NatsSlowConsumers
  expr: gnatsd_varz_slow_consumers > 0
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: Nats slow consumers (instance {{ $labels.instance }})
    description: " There are slow consumers in NATS for {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.7.
Nats server down
NATS server has been down for more than 5 minutes
[copy]
# Critical when no `up` series with job="nats" exists at all for 5 minutes.
# absent() only detects a missing series; a target that is scraped but reports
# up == 0 does not match this expression.
- alert: NatsServerDown
  expr: absent(up{job="nats"})
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Nats server down (instance {{ $labels.instance }})
    description: " NATS server has been down for more than 5 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.8.
Nats high CPU usage
NATS server is using more than 80% CPU for the last 5 minutes
[copy]
# Warning when the 5-minute average of gnatsd_varz_cpu exceeds 80, held for 5 minutes.
# The previous expr applied rate() to this series and compared against 0.8;
# rate() is only meaningful on counters, so on a gauge it measures how fast the
# reading changes rather than the CPU load itself.
# NOTE(review): assumes gnatsd_varz_cpu is a CPU percentage gauge (0-100 per core)
# as reported by the NATS /varz endpoint - confirm against your exporter version.
- alert: NatsHighCpuUsage
  expr: avg_over_time(gnatsd_varz_cpu[5m]) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high CPU usage (instance {{ $labels.instance }})
    description: " NATS server is using more than 80% CPU for the last 5 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.9.
Nats high number of connections
NATS server has more than 1000 active connections
[copy]
# Warning when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
- alert: NatsHighNumberOfConnections
  expr: gnatsd_connz_num_connections > 1000
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high number of connections (instance {{ $labels.instance }})
    description: " NATS server has more than 1000 active connections \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.10.
Nats high JetStream store usage
JetStream store usage is over 80%
[copy]
# Warning when JetStream storage used / configured max storage stays above 0.8 for 5 minutes.
- alert: NatsHighJetstreamStoreUsage
  expr: gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high JetStream store usage (instance {{ $labels.instance }})
    description: " JetStream store usage is over 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.11.
Nats high JetStream memory usage
JetStream memory usage is over 80%
[copy]
# Warning when JetStream memory used / configured max memory stays above 0.8 for 5 minutes.
- alert: NatsHighJetstreamMemoryUsage
  expr: gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
    description: " JetStream memory usage is over 80% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.12.
Nats high number of subscriptions
NATS server has more than 1000 active subscriptions
[copy]
# Warning when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
- alert: NatsHighNumberOfSubscriptions
  expr: gnatsd_connz_subscriptions > 1000
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high number of subscriptions (instance {{ $labels.instance }})
    description: " NATS server has more than 1000 active subscriptions \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.13.
Nats high pending bytes
NATS server has more than 100,000 pending bytes
[copy]
# Warning when gnatsd_connz_pending_bytes stays above 100000 for 5 minutes.
# NOTE(review): an earlier rule in this section has the same alert name and expr
# with `for: 3m`; both will fire under the same name.
- alert: NatsHighPendingBytes
  expr: gnatsd_connz_pending_bytes > 100000
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats high pending bytes (instance {{ $labels.instance }})
    description: " NATS server has more than 100,000 pending bytes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.14.
Nats too many errors
NATS server has encountered errors in the last 5 minutes
[copy]
# Warning when the JetStream API error count has kept increasing over 5-minute
# windows for 5 minutes. Only JetStream API errors are covered by this metric name.
- alert: NatsTooManyErrors
  expr: increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats too many errors (instance {{ $labels.instance }})
    description: " NATS server has encountered errors in the last 5 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.15.
Nats JetStream consumers exceeded
JetStream has more than 100 active consumers
[copy]
# Warning when gnatsd_varz_jetstream_stats_accounts summed over all series stays above 100 for 5 minutes.
# NOTE(review): the metric name refers to accounts while the alert text refers to
# consumers - confirm which quantity is intended.
# sum() without grouping drops all labels, so {{ $labels.instance }} renders empty.
- alert: NatsJetstreamConsumersExceeded
  expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
    description: " JetStream has more than 100 active consumers \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.16.
Nats frequent authentication timeouts
There have been more than 5 authentication timeouts in the last 5 minutes
[copy]
# Warning when increase(gnatsd_varz_auth_timeout[5m]) stays above 5 for 5 minutes.
# NOTE(review): presumably this series is a running count of timeouts; if it is
# the configured auth timeout value instead, increase() never exceeds 0 - confirm.
- alert: NatsFrequentAuthenticationTimeouts
  expr: increase(gnatsd_varz_auth_timeout[5m]) > 5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
    description: " There have been more than 5 authentication timeouts in the last 5 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.17.
Nats max payload size exceeded
The max payload size allowed by NATS has been exceeded (1MB)
[copy]
# Critical when the largest gnatsd_varz_max_payload value stays above 1024 * 1024 for 5 minutes.
# NOTE(review): the metric name suggests a configured limit rather than an
# observed payload size - confirm what the exporter reports.
# max() without grouping drops all labels, so {{ $labels.instance }} renders empty.
- alert: NatsMaxPayloadSizeExceeded
  expr: max(gnatsd_varz_max_payload) > 1024 * 1024
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Nats max payload size exceeded (instance {{ $labels.instance }})
    description: " The max payload size allowed by NATS has been exceeded (1MB) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.18.
Nats leaf node connection issue
No leaf node connections have been established in the last 5 minutes
[copy]
# Critical when increase(gnatsd_varz_leafnodes[5m]) stays at exactly 0 for 5 minutes.
# NOTE(review): if gnatsd_varz_leafnodes is the current number of leaf nodes,
# this matches any server whose leaf-node count is simply stable - confirm intent.
- alert: NatsLeafNodeConnectionIssue
  expr: increase(gnatsd_varz_leafnodes[5m]) == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Nats leaf node connection issue (instance {{ $labels.instance }})
    description: " No leaf node connections have been established in the last 5 minutes \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.19.
Nats max ping operations exceeded
The maximum number of ping operations in NATS has exceeded 50
[copy]
# Warning when gnatsd_varz_ping_max stays above 50 for 5 minutes.
- alert: NatsMaxPingOperationsExceeded
  expr: gnatsd_varz_ping_max > 50
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
    description: " The maximum number of ping operations in NATS has exceeded 50 \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.16.20.
Nats write deadline exceeded
The write deadline has been exceeded in NATS, indicating potential message delivery issues
[copy]
# Critical when gnatsd_varz_write_deadline stays above 10 for 5 minutes.
# NOTE(review): the unit of this series and whether it is a configured value or
# an observed one are not visible here - confirm the threshold of 10.
- alert: NatsWriteDeadlineExceeded
  expr: gnatsd_varz_write_deadline > 10
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Nats write deadline exceeded (instance {{ $labels.instance }})
    description: " The write deadline has been exceeded in NATS, indicating potential message delivery issues \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.17.
Solr
:
embedded exporter
(4 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/solr/embedded-exporter.yml
#
2.17.1.
Solr update errors
Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
[copy]
# Critical as soon as the update-handler error count rises by more than 1 within one minute.
- alert: SolrUpdateErrors
  expr: increase(solr_metrics_core_update_handler_errors_total[1m]) > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Solr update errors (instance {{ $labels.instance }})
    description: " Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.17.2.
Solr query errors
Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
[copy]
# Warning when the QUERY-category error count keeps rising by more than 1 per minute for 5 minutes.
- alert: SolrQueryErrors
  expr: increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Solr query errors (instance {{ $labels.instance }})
    description: " Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.17.3.
Solr replication errors
Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
[copy]
# Critical as soon as the REPLICATION-category error count rises by more than 1
# within one minute.
# The description names replication errors; it previously carried the
# "failed updates" wording of the update-errors alert.
- alert: SolrReplicationErrors
  expr: increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Solr replication errors (instance {{ $labels.instance }})
    description: " Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.17.4.
Solr low live node count
Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
[copy]
# Critical as soon as solr_collections_live_nodes drops below 2.
- alert: SolrLowLiveNodeCount
  expr: solr_collections_live_nodes < 2
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Solr low live node count (instance {{ $labels.instance }})
    description: " Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.
Hadoop
:
hadoop/jmx_exporter
(10 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/hadoop/jmx_exporter.yml
#
2.18.1.
Hadoop Name Node Down
The Hadoop NameNode service is unavailable.
[copy]
# Critical when a target of job "hadoop-namenode" reports up == 0 for 5 minutes.
- alert: HadoopNameNodeDown
  expr: up{job="hadoop-namenode"} == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Hadoop Name Node Down (instance {{ $labels.instance }})
    description: " The Hadoop NameNode service is unavailable. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.2.
Hadoop Resource Manager Down
The Hadoop ResourceManager service is unavailable.
[copy]
# Critical when a target of job "hadoop-resourcemanager" reports up == 0 for 5 minutes.
- alert: HadoopResourceManagerDown
  expr: up{job="hadoop-resourcemanager"} == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
    description: " The Hadoop ResourceManager service is unavailable. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.3.
Hadoop Data Node Out Of Service
The Hadoop DataNode is not sending heartbeats.
[copy]
# Warning when hadoop_datanode_last_heartbeat stays at 0 for 10 minutes.
- alert: HadoopDataNodeOutOfService
  expr: hadoop_datanode_last_heartbeat == 0
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
    description: " The Hadoop DataNode is not sending heartbeats. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.4.
Hadoop HDFS Disk Space Low
Available HDFS disk space is running low.
[copy]
# Warning when (total - used) / total HDFS bytes stays below 0.1 for 15 minutes.
- alert: HadoopHdfsDiskSpaceLow
  expr: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
    description: " Available HDFS disk space is running low. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.5.
Hadoop Map Reduce Task Failures
There is an unusually high number of MapReduce task failures.
[copy]
# Critical when hadoop_mapreduce_task_failures_total stays above 100 for 10 minutes.
# NOTE(review): this compares the raw *_total value; if the series is a
# cumulative counter the alert stays active until the process restarts - confirm.
- alert: HadoopMapReduceTaskFailures
  expr: hadoop_mapreduce_task_failures_total > 100
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
    description: " There is an unusually high number of MapReduce task failures. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.6.
Hadoop Resource Manager Memory High
The Hadoop ResourceManager is approaching its memory limit.
[copy]
# Warning when ResourceManager memory bytes / max memory bytes stays above 0.8 for 15 minutes.
- alert: HadoopResourceManagerMemoryHigh
  expr: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
    description: " The Hadoop ResourceManager is approaching its memory limit. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.7.
Hadoop YARN Container Allocation Failures
There is a significant number of YARN container allocation failures.
[copy]
# Warning when hadoop_yarn_container_allocation_failures_total stays above 10 for 10 minutes.
# NOTE(review): this compares the raw *_total value; if the series is a
# cumulative counter the alert stays active until the process restarts - confirm.
- alert: HadoopYarnContainerAllocationFailures
  expr: hadoop_yarn_container_allocation_failures_total > 10
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
    description: " There is a significant number of YARN container allocation failures. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.8.
Hadoop HBase Region Count High
The HBase cluster has an unusually high number of regions.
[copy]
# Warning when hadoop_hbase_region_count stays above 5000 for 15 minutes.
- alert: HadoopHbaseRegionCountHigh
  expr: hadoop_hbase_region_count > 5000
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
    description: " The HBase cluster has an unusually high number of regions. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.9.
Hadoop HBase Region Server Heap Low
HBase Region Servers are running low on heap space.
[copy]
# Critical when region-server heap bytes / max heap bytes stays below 0.2 for 10 minutes.
# NOTE(review): if hadoop_hbase_region_server_heap_bytes is *used* heap, a ratio
# below 0.2 means low usage rather than low free space - confirm metric semantics.
- alert: HadoopHbaseRegionServerHeapLow
  expr: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
    description: " HBase Region Servers are running low on heap space. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
2.18.10.
Hadoop HBase Write Requests Latency High
HBase Write Requests are experiencing high latency.
[copy]
# Warning when hadoop_hbase_write_requests_latency_seconds stays above 0.5 for 10 minutes.
- alert: HadoopHbaseWriteRequestsLatencyHigh
  expr: hadoop_hbase_write_requests_latency_seconds > 0.5
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
    description: " HBase Write Requests are experiencing high latency. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/nginx/knyar-nginx-exporter.yml
#
3.1.1.
Nginx high HTTP 4xx error rate
Too many HTTP requests with status 4xx (> 5%)
[copy]
# Critical when 4xx responses exceed 5% of the total request rate (1m window) for 1 minute.
# Both sums aggregate over every series, so the result carries no labels and
# {{ $labels.instance }} renders empty.
- alert: NginxHighHttp4xxErrorRate
  expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 4xx (> 5%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.1.2.
Nginx high HTTP 5xx error rate
Too many HTTP requests with status 5xx (> 5%)
[copy]
# Critical when 5xx responses exceed 5% of the total request rate (1m window) for 1 minute.
# Both sums aggregate over every series, so the result carries no labels and
# {{ $labels.instance }} renders empty.
- alert: NginxHighHttp5xxErrorRate
  expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 5xx (> 5%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.1.3.
Nginx latency high
Nginx p99 latency is higher than 3 seconds
[copy]
# Warning when the 99th-percentile request duration, computed per (host, node)
# from 2-minute bucket rates, stays above 3 for 2 minutes.
- alert: NginxLatencyHigh
  expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Nginx latency high (instance {{ $labels.instance }})
    description: " Nginx p99 latency is higher than 3 seconds \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/apache/lusitaniae-apache-exporter.yml
#
3.2.1.
Apache down
Apache down
[copy]
# Critical as soon as apache_up reports 0 (no hold period).
- alert: ApacheDown
  expr: apache_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Apache down (instance {{ $labels.instance }})
    description: " Apache down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.2.2.
Apache workers load
Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
[copy]
# Warns when, per instance, busy apache_workers divided by the summed
# apache_scoreboard exceeds 80% for 2 minutes.
- alert: ApacheWorkersLoad
  expr: (sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Apache workers load (instance {{ $labels.instance }})
    description: " Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.2.3.
Apache restart
Apache has just been restarted.
[copy]
# Warns while apache_uptime_seconds_total, divided by 60, is below 1.
- alert: ApacheRestart
  expr: apache_uptime_seconds_total / 60 < 1
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Apache restart (instance {{ $labels.instance }})
    description: " Apache has just been restarted. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/haproxy/embedded-exporter-v2.yml
#
3.3.1.1.
HAProxy high HTTP 4xx error rate backend
Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
[copy]
# Critical when, per proxy, 4xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
# The expression aggregates `by (proxy)`, so `fqdn` and `backend` are not
# present on the result; the description therefore names the backend via
# `proxy`, as the other rules in this section do.
# NOTE(review): `instance` is aggregated away as well, so the summary's
# instance placeholder renders empty; left as-is to keep the shared
# summary format.
- alert: HaproxyHighHttp4xxErrorRateBackend
  expr: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.2.
HAProxy high HTTP 5xx error rate backend
Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
[copy]
# Critical when, per proxy, 5xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
# The expression aggregates `by (proxy)`, so `fqdn` and `backend` are not
# present on the result; the description therefore names the backend via
# `proxy`, as the other rules in this section do.
# NOTE(review): `instance` is aggregated away as well, so the summary's
# instance placeholder renders empty; left as-is to keep the shared
# summary format.
- alert: HaproxyHighHttp5xxErrorRateBackend
  expr: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.3.
HAProxy high HTTP 4xx error rate server
Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
[copy]
# Critical when, per server, 4xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
- alert: HaproxyHighHttp4xxErrorRateServer
  expr: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.4.
HAProxy high HTTP 5xx error rate server
Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
[copy]
# Critical when, per server, 5xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
- alert: HaproxyHighHttp5xxErrorRateServer
  expr: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.5.
HAProxy server response errors
Too many response errors to {{ $labels.server }} server (> 5%).
[copy]
# Critical when, per server, the response-error rate exceeds 5% of the
# HTTP response rate (both over 1m) for 1 minute.
- alert: HaproxyServerResponseErrors
  expr: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy server response errors (instance {{ $labels.instance }})
    description: " Too many response errors to {{ $labels.server }} server (> 5%). \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.6.
HAProxy backend connection errors
Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
[copy]
# Critical when the per-proxy rate of backend connection errors (1m)
# exceeds 100 for 1 minute.
# The expression aggregates `by (proxy)`, so `fqdn` and `backend` are not
# present on the result; the description therefore names the backend via
# `proxy`, as the other rules in this section do.
# NOTE(review): `instance` is aggregated away as well, so the summary's
# instance placeholder renders empty; left as-is to keep the shared
# summary format.
- alert: HaproxyBackendConnectionErrors
  expr: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy backend connection errors (instance {{ $labels.instance }})
    description: " Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.7.
HAProxy server connection errors
Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
[copy]
# Critical as soon as the rate of server connection errors (1m) exceeds 100.
# Grouped by (proxy, server): the description reports `server`, which a
# plain `by (proxy)` aggregation would strip from the result, leaving the
# placeholder empty. Keeping `proxy` in the grouping distinguishes servers
# that share a name across proxies.
- alert: HaproxyServerConnectionErrors
  expr: (sum by (proxy, server) (rate(haproxy_server_connection_errors_total[1m]))) > 100
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAProxy server connection errors (instance {{ $labels.instance }})
    description: " Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.8.
HAProxy backend max active session > 80%
Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}%
[copy]
# Warns when haproxy_server_max_sessions is above 80% of
# haproxy_server_limit_sessions for 2 minutes. The `> 0` filters drop
# series with a zero value on either side before dividing.
# The printf format is "%.2f" with no padding spaces, so the value renders
# directly before the trailing percent sign.
# Alert name kept unchanged for compatibility with existing routing.
- alert: HaproxyBackendMaxActiveSession>80%
  expr: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
    description: " Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\" }}% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.9.
HAProxy pending requests
Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
[copy]
# Warns when a proxy has queued requests for 2 minutes.
# haproxy_backend_current_queue is a point-in-time queue length, so it is
# compared directly; rate() is meant for counters and would only react
# while the queue is growing.
# The printf format is "%.2f" with no padding spaces.
- alert: HaproxyPendingRequests
  expr: sum by (proxy) (haproxy_backend_current_queue) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy pending requests (instance {{ $labels.instance }})
    description: " Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\" }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.10.
HAProxy HTTP slowing down
Average request time is increasing - {{ $value | printf "%.2f"}}
[copy]
# Warns when haproxy_backend_max_total_time_seconds, averaged per
# (instance, proxy), stays above 1 for 1 minute.
# The printf format is "%.2f" with no padding spaces.
- alert: HaproxyHttpSlowingDown
  expr: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
    description: " Average request time is increasing - {{ $value | printf \"%.2f\" }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.11.
HAProxy retry high
High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
[copy]
# Warns when the per-proxy rate of retry warnings (1m) stays above 10 for
# 2 minutes.
# The printf format is "%.2f" with no padding spaces.
- alert: HaproxyRetryHigh
  expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy retry high (instance {{ $labels.instance }})
    description: " High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\" }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.12.
HAproxy has no alive backends
HAProxy has no alive active or backup backends for {{ $labels.proxy }}
[copy]
# Critical as soon as active plus backup servers for a backend add up to 0.
- alert: HaproxyHasNoAliveBackends
  expr: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAproxy has no alive backends (instance {{ $labels.instance }})
    description: " HAProxy has no alive active or backup backends for {{ $labels.proxy }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.13.
HAProxy frontend security blocked requests
HAProxy is blocking requests for security reason
[copy]
# Warns when the per-proxy rate of denied frontend connections (2m) stays
# above 10 for 2 minutes.
- alert: HaproxyFrontendSecurityBlockedRequests
  expr: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
    description: " HAProxy is blocking requests for security reason \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.1.14.
HAProxy server healthcheck failure
Some server healthcheck are failing on {{ $labels.server }}
[copy]
# Warns when haproxy_server_check_failures_total keeps increasing within
# 1m windows for 1 minute.
- alert: HaproxyServerHealthcheckFailure
  expr: increase(haproxy_server_check_failures_total[1m]) > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
    description: " Some server healthcheck are failing on {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/haproxy/haproxy-exporter-v1.yml
#
3.3.2.1.
HAProxy down
HAProxy down
[copy]
# Critical as soon as haproxy_up reports 0 (no hold period).
- alert: HaproxyDown
  expr: haproxy_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAProxy down (instance {{ $labels.instance }})
    description: " HAProxy down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.2.
HAProxy high HTTP 4xx error rate backend
Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
[copy]
# Critical when, per backend, 4xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
# The ratio is scaled by 100 before the comparison: an unscaled ratio lies
# in [0, 1] and could never exceed the threshold of 5, so the alert would
# never fire.
- alert: HaproxyHighHttp4xxErrorRateBackend
  expr: (sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.3.
HAProxy high HTTP 5xx error rate backend
Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
[copy]
# Critical when, per backend, 5xx responses exceed 5% of all responses
# (1m rates) for 1 minute.
# The ratio is scaled by 100 before the comparison: an unscaled ratio lies
# in [0, 1] and could never exceed the threshold of 5, so the alert would
# never fire.
- alert: HaproxyHighHttp5xxErrorRateBackend
  expr: (sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.4.
HAProxy high HTTP 4xx error rate server
Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
[copy]
# Critical when, per server, 4xx responses (scaled by 100) exceed 5% of
# all responses (1m rates) for 1 minute.
- alert: HaproxyHighHttp4xxErrorRateServer
  expr: sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.5.
HAProxy high HTTP 5xx error rate server
Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
[copy]
# Critical when, per server, 5xx responses (scaled by 100) exceed 5% of
# all responses (1m rates) for 1 minute.
- alert: HaproxyHighHttp5xxErrorRateServer
  expr: sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
    description: " Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.6.
HAProxy server response errors
Too many response errors to {{ $labels.server }} server (> 5%).
[copy]
# Critical when, per server, the response-error rate (scaled by 100)
# exceeds 5% of the HTTP response rate (both over 1m) for 1 minute.
- alert: HaproxyServerResponseErrors
  expr: sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy server response errors (instance {{ $labels.instance }})
    description: " Too many response errors to {{ $labels.server }} server (> 5%). \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.7.
HAProxy backend connection errors
Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
[copy]
# Critical when the per-backend rate of connection errors (1m) exceeds 100
# for 1 minute.
- alert: HaproxyBackendConnectionErrors
  expr: sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: HAProxy backend connection errors (instance {{ $labels.instance }})
    description: " Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.8.
HAProxy server connection errors
Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
[copy]
# Critical as soon as the per-server rate of connection errors (1m)
# exceeds 100.
- alert: HaproxyServerConnectionErrors
  expr: sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAProxy server connection errors (instance {{ $labels.instance }})
    description: " Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.9.
HAProxy backend max active session
HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
[copy]
# Warns when, per backend, the 2m average of current sessions exceeds 80%
# of the 2m average of the session limit, for 2 minutes.
- alert: HaproxyBackendMaxActiveSession
  expr: ((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy backend max active session (instance {{ $labels.instance }})
    description: " HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.10.
HAProxy pending requests
Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
[copy]
# Warns when the summed haproxy_backend_current_queue of a backend stays
# above 0 for 2 minutes.
- alert: HaproxyPendingRequests
  expr: sum by (backend) (haproxy_backend_current_queue) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy pending requests (instance {{ $labels.instance }})
    description: " Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.11.
HAProxy HTTP slowing down
Average request time is increasing
[copy]
# Warns when haproxy_backend_http_total_time_average_seconds, averaged per
# backend, stays above 1 for 1 minute.
- alert: HaproxyHttpSlowingDown
  expr: avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
    description: " Average request time is increasing \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.12.
HAProxy retry high
High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
[copy]
# Warns when the per-backend rate of retry warnings (1m) stays above 10
# for 2 minutes.
- alert: HaproxyRetryHigh
  expr: sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy retry high (instance {{ $labels.instance }})
    description: " High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.13.
HAProxy backend down
HAProxy backend is down
[copy]
# Critical as soon as haproxy_backend_up reports 0 (no hold period).
- alert: HaproxyBackendDown
  expr: haproxy_backend_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAProxy backend down (instance {{ $labels.instance }})
    description: " HAProxy backend is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.14.
HAProxy server down
HAProxy server is down
[copy]
# Critical as soon as haproxy_server_up reports 0 (no hold period).
- alert: HaproxyServerDown
  expr: haproxy_server_up == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: HAProxy server down (instance {{ $labels.instance }})
    description: " HAProxy server is down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.15.
HAProxy frontend security blocked requests
HAProxy is blocking requests for security reason
[copy]
# Warns when the per-frontend rate of denied requests (2m) stays above 10
# for 2 minutes.
- alert: HaproxyFrontendSecurityBlockedRequests
  expr: sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
    description: " HAProxy is blocking requests for security reason \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.3.2.16.
HAProxy server healthcheck failure
Some server healthcheck are failing on {{ $labels.server }}
[copy]
# Warns when haproxy_server_check_failures_total keeps increasing within
# 1m windows for 1 minute.
- alert: HaproxyServerHealthcheckFailure
  expr: increase(haproxy_server_check_failures_total[1m]) > 0
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
    description: " Some server healthcheck are failing on {{ $labels.server }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.1.
Traefik
:
Embedded exporter v2
(3 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/traefik/embedded-exporter-v2.yml
#
3.4.1.1.
Traefik service down
All Traefik services are down
[copy]
# Critical as soon as every server of a Traefik service reports down.
# Uses sum() rather than count(): count() returns the number of series and
# is at least 1 whenever any series exists, so `count(...) == 0` can never
# match. Summing the per-server up values yields 0 only when all are 0.
# NOTE(review): assumes traefik_service_server_up is 0 for down and 1 for
# up — confirm against the Traefik metrics reference.
- alert: TraefikServiceDown
  expr: sum(traefik_service_server_up) by (service) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Traefik service down (instance {{ $labels.instance }})
    description: " All Traefik services are down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.1.2.
Traefik high HTTP 4xx error rate service
Traefik service 4xx error rate is above 5%
[copy]
# Critical when, per service, requests whose code matches 4.* exceed 5% of
# all requests (3m rates) for 1 minute.
- alert: TraefikHighHttp4xxErrorRateService
  expr: sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Traefik high HTTP 4xx error rate service (instance {{ $labels.instance }})
    description: " Traefik service 4xx error rate is above 5% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.1.3.
Traefik high HTTP 5xx error rate service
Traefik service 5xx error rate is above 5%
[copy]
# Critical when, per service, requests whose code matches 5.* exceed 5% of
# all requests (3m rates) for 1 minute.
- alert: TraefikHighHttp5xxErrorRateService
  expr: sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Traefik high HTTP 5xx error rate service (instance {{ $labels.instance }})
    description: " Traefik service 5xx error rate is above 5% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.2.
Traefik
:
Embedded exporter v1
(3 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/traefik/embedded-exporter-v1.yml
#
3.4.2.1.
Traefik backend down
All Traefik backends are down
[copy]
# Critical as soon as every server of a Traefik backend reports down.
# Uses sum() rather than count(): count() returns the number of series and
# is at least 1 whenever any series exists, so `count(...) == 0` can never
# match. Summing the per-server up values yields 0 only when all are 0.
# NOTE(review): assumes traefik_backend_server_up is 0 for down and 1 for
# up — confirm against the Traefik metrics reference.
- alert: TraefikBackendDown
  expr: sum(traefik_backend_server_up) by (backend) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Traefik backend down (instance {{ $labels.instance }})
    description: " All Traefik backends are down \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.2.2.
Traefik high HTTP 4xx error rate backend
Traefik backend 4xx error rate is above 5%
[copy]
# Critical when, per backend, requests whose code matches 4.* exceed 5% of
# all requests (3m rates) for 1 minute.
- alert: TraefikHighHttp4xxErrorRateBackend
  expr: sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Traefik high HTTP 4xx error rate backend (instance {{ $labels.instance }})
    description: " Traefik backend 4xx error rate is above 5% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
3.4.2.3.
Traefik high HTTP 5xx error rate backend
Traefik backend 5xx error rate is above 5%
[copy]
# Critical when, per backend, requests whose code matches 5.* exceed 5% of
# all requests (3m rates) for 1 minute.
- alert: TraefikHighHttp5xxErrorRateBackend
  expr: sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Traefik high HTTP 5xx error rate backend (instance {{ $labels.instance }})
    description: " Traefik backend 5xx error rate is above 5% \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/php-fpm/bakins-fpm-exporter.yml
#
4.1.1.
PHP-FPM max-children reached
PHP-FPM reached max children - {{ $labels.instance }}
[copy]
# Warns as soon as the per-instance sum of
# phpfpm_max_children_reached_total is above 0.
# NOTE(review): the `_total` suffix suggests a cumulative counter; if so,
# this condition stays true after the first occurrence until the counter
# resets — consider comparing increase() over a window instead.
- alert: Php-fpmMax-childrenReached
  expr: sum(phpfpm_max_children_reached_total) by (instance) > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
    description: " PHP-FPM reached max children - {{ $labels.instance }} \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
4.2.
JVM
:
java-client
(1 rule)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/jvm/jvm-exporter.yml
#
4.2.1.
JVM memory filling up
JVM memory is filling up (> 80%)
[copy]
# Warns when, per instance, used heap bytes exceed 80% of max heap bytes
# for 2 minutes.
- alert: JvmMemoryFillingUp
  expr: (sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: JVM memory filling up (instance {{ $labels.instance }})
    description: " JVM memory is filling up (> 80%) \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/sidekiq/strech-sidekiq-exporter.yml
#
4.3.1.
Sidekiq queue size
Sidekiq queue {{ $labels.name }} is growing
[copy]
# Warns when sidekiq_queue_size stays above 100 for 1 minute.
- alert: SidekiqQueueSize
  expr: sidekiq_queue_size > 100
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: Sidekiq queue size (instance {{ $labels.instance }})
    description: " Sidekiq queue {{ $labels.name }} is growing \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
4.3.2.
Sidekiq scheduling latency too high
Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.
[copy]
# Critical as soon as the maximum sidekiq_queue_latency across all series
# exceeds 60.
- alert: SidekiqSchedulingLatencyTooHigh
  expr: max(sidekiq_queue_latency) > 60
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Sidekiq scheduling latency too high (instance {{ $labels.instance }})
    description: " Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing. \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
5.1.
Kubernetes
:
kube-state-metrics
(34 rules)
[copy section]
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/kubernetes/kubestate-exporter.yml
#
5.1.1.
Kubernetes Node not ready
Node {{ $labels.node }} has been unready for a long time
[copy]
# Critical when the Ready/true node condition series stays at 0 for
# 10 minutes.
- alert: KubernetesNodeNotReady
  expr: kube_node_status_condition{condition="Ready",status="true"} == 0
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Node not ready (instance {{ $labels.instance }})
    description: " Node {{ $labels.node }} has been unready for a long time \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
5.1.2.
Kubernetes Node memory pressure
Node {{ $labels.node }} has MemoryPressure condition
[copy]
# Critical when the MemoryPressure/true node condition series stays at 1
# for 2 minutes.
- alert: KubernetesNodeMemoryPressure
  expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
    description: " Node {{ $labels.node }} has MemoryPressure condition \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
5.1.3.
Kubernetes Node disk pressure
Node {{ $labels.node }} has DiskPressure condition
[copy]
# Critical when the DiskPressure/true node condition series stays at 1
# for 2 minutes.
- alert: KubernetesNodeDiskPressure
  expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
    description: " Node {{ $labels.node }} has DiskPressure condition \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
5.1.4.
Kubernetes Node network unavailable
Node {{ $labels.node }} has NetworkUnavailable condition
[copy]
# Critical when the NetworkUnavailable/true node condition series stays
# at 1 for 2 minutes.
- alert: KubernetesNodeNetworkUnavailable
  expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
    description: " Node {{ $labels.node }} has NetworkUnavailable condition \n VALUE = {{ $value }} \n LABELS = {{ $labels }}"
#
5.1.5.
Kubernetes Node out of pod capacity
Node {{ $labels.node }} is out of pod capacity
[copy]