Alerts


/etc/prometheus/blackbox.rules.yml > blackbox.rules
ATS down (0 active)
# Fires when probe_success for a target in job "ATS" has been 0
# continuously for 5 minutes.
alert: ATS down
expr: probe_success{job=~"ATS"} == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} АТС не отвечает больше 5 минут'
Blackbox Slow Probe (0 active)
# Fires when the 5-minute average of probe_duration_seconds stays above 10
# for 5 minutes. The summary reports the instance and the measured value.
alert: Blackbox Slow Probe
expr: avg_over_time(probe_duration_seconds[5m]) > 10
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Blackbox очень долго обрабатывает сайт {{ $labels.instance }}. Время обработки: {{ $value }}'
Radioactive Down (0 active)
# Fires when probe_success for a target in job "radioactive" has been 0
# for 5 minutes.
# NOTE(review): this is the only rule with severity "radioactive" instead of
# "critical"/"warning" - presumably a dedicated routing key; confirm against
# the Alertmanager configuration.
alert: Radioactive Down
expr: probe_success{job=~"radioactive"} == 0
for: 5m
labels:
  severity: radioactive
annotations:
  summary: Радиоактивная комната {{ $labels.instance }} не отвечает больше 5 минут
Slow Http (0 active)
# Fires when the 5-minute average of probe_http_duration_seconds stays
# above 5 for 5 minutes.
alert: Slow Http
expr: avg_over_time(probe_http_duration_seconds[5m]) > 5
for: 5m
labels:
  severity: critical
annotations:
  summary: Очень медленная обработка HTTP-запросов на {{ $labels.instance }}
Ssl Certificate Expired (0 active)
# Fires when probe_ssl_earliest_cert_expiry is at or before the current
# time (difference <= 0) and has stayed so for 1 day.
# NOTE(review): `for: 1d` delays the notification until a full day after
# expiry - confirm that this delay is intended.
alert: Ssl Certificate Expired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 1d
labels:
  severity: critical
annotations:
  summary: SSL certificate has expired already on {{ $labels.instance }}
Ssl Certificate Has Expired Soon (0 active)
# Fires when fewer than 25 days (86400 * 25 seconds) remain until
# probe_ssl_earliest_cert_expiry, sustained for 1 day.
# The comparison has no lower bound, so it also matches certificates that
# have already expired.
alert: Ssl Certificate Has Expired Soon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 25
for: 1d
labels:
  severity: critical
annotations:
  summary: SSL сертификат на {{ $labels.instance }} закончится через 25 дней
/etc/prometheus/custom.rules.yml > custom.rules
Omada device down (3 active)
# Fires when omada_device_uptime_seconds has been exactly 0 for 5 minutes.
# The summary is built from the device_type, device and site labels.
alert: Omada device down
expr: omada_device_uptime_seconds == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Device {{ $labels.device_type }} {{ $labels.device }} from site {{ $labels.site }} is down
Labels State Active Since Value
alertname="Omada device down" device="b9f1r108-tplink-test" device_type="switch" instance="localhost:14274" ip="10.0.1.64" job="omada" mac="34-60-F9-DA-E1-F3" model="SG3428" severity="critical" site="BuildTest" site_id="6719ddf04982f17271fd0565" version="2.30.0" firing 2024-11-26 13:55:19.196947433 +0000 UTC 0
alertname="Omada device down" device="b9f1r108-tplinksfp1" device_type="switch" instance="localhost:14274" ip="10.0.1.56" job="omada" mac="34-60-F9-24-72-42" model="SG3452X" severity="critical" site="BuildTest" site_id="6719ddf04982f17271fd0565" version="1.20.1" firing 2024-11-26 13:55:19.196947433 +0000 UTC 0
alertname="Omada device down" device="9C-A2-F4-71-74-16" device_type="switch" instance="localhost:14274" ip="10.0.1.66" job="omada" mac="9C-A2-F4-71-74-16" model="SG3452XP" severity="critical" site="BuildTest" site_id="6719ddf04982f17271fd0565" version="2.20.3" firing 2024-11-26 13:55:19.196947433 +0000 UTC 0
Switch Down (11 active)
# Fires when a scrape target of job "upTime" or "dmc1002" has reported
# up == 0 for 1 minute.
alert: Switch Down
expr: up{job=~"upTime|dmc1002"} == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} not responding for more than 1 minutes'
Labels State Active Since Value
alertname="Switch Down" building="9 корпус" instance="b19f1r111-hp" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="1 корпус" instance="b9f1r108-qtech2" job="upTime" severity="critical" firing 2024-11-26 05:23:49.196947433 +0000 UTC 0
alertname="Switch Down" building="14 корпус" instance="swimingpool-dlink" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="12 корпус" instance="b12f1pool8" job="upTime" severity="critical" firing 2024-11-26 22:25:19.196947433 +0000 UTC 0
alertname="Switch Down" building="12 корпус" instance="b12f1pool6" job="upTime" severity="critical" firing 2024-11-25 07:24:49.196947433 +0000 UTC 0
alertname="Switch Down" building="9 корпус" instance="b9f3pool2b" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="9 корпус" instance="b9f3r310" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="9 корпус" instance="edgecore-01" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="12 корпус" instance="b12f2r225-cluster" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="9 корпус" instance="b9f1r108-tplink-test" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
alertname="Switch Down" building="1 корпус" instance="mikrotik-9-1" job="upTime" severity="critical" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
Unifi device down (1 active)
# Fires when unifipoller_device_uptime_seconds has been exactly 0 for
# 5 minutes. The summary is built from the type, name and site_name labels.
alert: Unifi device down
expr: unifipoller_device_uptime_seconds == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Device {{ $labels.type }} {{ $labels.name }} from site {{ $labels.site_name }} is down
Labels State Active Since Value
alertname="Unifi device down" instance="localhost:9130" job="unifi" name="ap8f0w3" severity="critical" site_name="build8 (bi39um96)" source="https://unifi.sgu.ru:8443" type="uap" firing 2024-11-25 05:57:19.196947433 +0000 UTC 0
Bind Server Down (0 active)
# Fires when bind_up has been 0 for 5 minutes.
alert: Bind Server Down
expr: bind_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.job }} on {{ $labels.instance }} not responding for more than 5 minutes'
Borg Backup Missing (0 active)
# Fires when time() minus borgbackup_last_modified exceeds 8 days
# (86400 * 8 seconds) for 30 minutes.
# The summary states the same 8-day window as the expression so the
# notification matches the condition that actually triggers it.
alert: Borg Backup Missing
expr: time() - borgbackup_last_modified > 86400 * 8
for: 30m
labels:
  severity: critical
annotations:
  summary: Backup on {{ $labels.instance }} not created in the last 8 days
Certificate expired (0 active)
# Fires when fewer than 25 days (86400 * 25 seconds) remain until
# nginx_cert_exporter_file_expired, sustained for 12 hours.
# Despite the alert name, this is an "expires soon" condition; the name is
# kept unchanged because routing may depend on it.
alert: Certificate expired
expr: nginx_cert_exporter_file_expired - time() < 86400 * 25
for: 12h
labels:
  severity: critical
annotations:
  summary: Локальный сертификат {{ $labels.name }} на {{ $labels.instance }} будет просрочен в течение 25 дней
Cisco Netflow Cache High (0 active)
# Fires when the netflow_active_entries:ratio recording rule stays above
# 0.8 for 5 minutes.
# The severity value must be spelled "critical" to match the other rules,
# and a summary annotation is provided like on every other alert.
alert: Cisco Netflow Cache High
expr: netflow_active_entries:ratio > 0.8
for: 5m
labels:
  severity: critical
annotations:
  summary: 'NetFlow active entries ratio on {{ $labels.instance }} is above 0.8: {{ $value }}'
DDOS (0 active)
# Fires when the 5-minute nginx request rate is more than 5 times the rate
# of the preceding 5-minute window (offset 5m) and is also above 100,
# sustained for 5 minutes.
alert: DDOS
expr: (rate(nginx_http_requests_total[5m]) / rate(nginx_http_requests_total[5m] offset 5m) > 5) and rate(nginx_http_requests_total[5m]) > 100
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} находится под DDOS атакой'
Error in dhcpserv config (0 active)
# Fires when the per-second rate of dhcpserv_build_errors over 1 minute
# stays above 1 for 2 minutes. The summary includes the measured rate.
alert: Error in dhcpserv config
expr: rate(dhcpserv_build_errors[1m]) > 1
for: 2m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} have too many error in dhcpserv generate config: {{ $value }}'
Fail2Ban DDOS (0 active)
# Fires when f2b_jail_banned_current, averaged per (instance, jail), stays
# above 20 for 5 minutes.
alert: Fail2Ban DDOS
expr: avg by(instance, jail) (f2b_jail_banned_current) > 20
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Слишком часто срабатывает правило {{ $labels.jail }}: {{ $value }} на сервере {{ $labels.instance }}'
Instance Down (0 active)
# Fires when a scrape target of job squid, wpad, etcd, dhcpd6 or bind has
# reported up == 0 for 5 minutes.
alert: Instance Down
expr: up{job=~"squid|wpad|etcd|dhcpd6|bind"} == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.job }} {{ $labels.instance }} not responding for more than 5 minutes.'
Nginx high active connections (0 active)
# Fires when nginx_connections_active for job "nginx" stays above 300 for
# 5 minutes. The Russian summary reads: "Large number of active connections
# in Nginx on <instance> for 5 minutes".
alert: Nginx high active connections
expr: nginx_connections_active{job="nginx"} > 300
for: 5m
labels:
  severity: critical
annotations:
  summary: Большое количество активных подключений в Nginx на {{ $labels.instance }} в течение 5 минут
Nginx is down (0 active)
# Fires when nginx_up has been 0 for 5 minutes.
alert: Nginx is down
expr: nginx_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Nginx is down on the host {{ $labels.instance }} for more than 5 minutes
Nginx not all connections are handled (0 active)
# Fires when the ratio of the handled-connection rate to the
# accepted-connection rate (both over 5 minutes, job "nginx") stays below 1
# for 3 minutes.
alert: Nginx not all connections are handled
expr: rate(nginx_connections_handled{job="nginx"}[5m]) / rate(nginx_connections_accepted{job="nginx"}[5m]) < 1
for: 3m
labels:
  severity: critical
annotations:
  summary: Nginx does not handle all accept connections on the host {{ $labels.instance }} for more than 3 minutes
Postfix Queue High (0 active)
# Fires when postfix_showq_message_size_bytes_count stays above 200 for
# 10 minutes. The summary refers to the mail queue (not a "query") and
# includes the measured count.
alert: Postfix Queue High
expr: postfix_showq_message_size_bytes_count > 200
for: 10m
labels:
  severity: critical
annotations:
  summary: Postfix queue overflow. Queue exceeded {{ $value }} mails
Squid Server Down (0 active)
# Fires when squid_server_up has been 0 for 10 minutes.
alert: Squid Server Down
expr: squid_server_up == 0
for: 10m
labels:
  severity: critical
annotations:
  summary: Squid on {{ $labels.instance }} not responding for more than 10 minutes
/etc/prometheus/mysql.rules.yml > mysql.rules
MysqlDown (0 active)
# Fires when mysql_up has been 0 for 1 minute.
alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: MySQL instance is down on {{ $labels.instance }}
  summary: MySQL down (instance {{ $labels.instance }})
MysqlHighThreadsRunning (0 active)
# Fires when running threads exceed 60% of max_connections per instance for
# 5 minutes.
# mysql_global_status_threads_running is a gauge (a current count), so
# rate() must not be applied to it: rate() yields the per-second change of
# the value, not the number of threads. max_over_time() takes the peak of
# the gauge in the 1-minute window instead.
alert: MysqlHighThreadsRunning
expr: avg by(instance) (max_over_time(mysql_global_status_threads_running[1m])) / avg by(instance) (mysql_global_variables_max_connections) * 100 > 60
for: 5m
labels:
  severity: critical
annotations:
  description: More than 60% of MySQL connections are in running state on {{ $labels.instance }}
  summary: MySQL high threads running (instance {{ $labels.instance }})
MysqlInnodbLogWaits (0 active)
# Fires when the per-second rate of mysql_global_status_innodb_log_waits
# over 15 minutes stays above 10 for 5 minutes.
alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 5m
labels:
  severity: critical
annotations:
  description: MySQL innodb log writes stalling {{ $value }}
  summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
MysqlSlowQueries (0 active)
# Fires when mysql_global_status_slow_queries increased by more than 10
# within a 2-minute window, sustained for 5 minutes.
alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[2m]) > 10
for: 5m
labels:
  severity: critical
annotations:
  description: MySQL server mysql has some new slow query. {{ $value }}
  summary: MySQL slow queries (instance {{ $labels.instance }})
MysqlTooManyConnections (0 active)
# Fires when connected threads exceed 60% of max_connections per instance
# for 5 minutes.
# mysql_global_status_threads_connected is a gauge (a current count), so
# rate() must not be applied to it: rate() yields the per-second change of
# the value, not the number of connections. max_over_time() takes the peak
# of the gauge in the 5-minute window instead.
alert: MysqlTooManyConnections
expr: avg by(instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by(instance) (mysql_global_variables_max_connections) * 100 > 60
for: 5m
labels:
  severity: critical
annotations:
  description: More than 60% of MySQL connections are in use on {{ $labels.instance }}
  summary: MySQL too many connections (> 60%) (instance {{ $labels.instance }})
/etc/prometheus/node.rules.yml > node.rules
Node SWAP Out Of Memory (2 active)
# Fires when free swap as a percentage of total swap
# (SwapFree / SwapTotal * 100) stays below 20 for 10 minutes.
# The summary reports the current free percentage.
alert: Node SWAP Out Of Memory
expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes * 100 < 20
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Node {{ $labels.instance }} swap memory is filling up (< 20% left). Free now: {{ $value }}%'
Labels State Active Since Value
alertname="Node SWAP Out Of Memory" instance="proxy.sgu.ru" job="node" severity="critical" firing 2024-11-16 05:51:54 +0000 UTC 0.5260470037684458
alertname="Node SWAP Out Of Memory" instance="walrus.sgu.ru" job="node" severity="critical" firing 2024-09-07 19:43:24 +0000 UTC 0.018548974298941756
Node Unusual Disk WriteLatency (3 active)
# Fires when the average time per completed write over 30 minutes
# (write time rate / completed writes rate) stays above 0.15 for 30 minutes.
# The `and ... > 0` clause restricts the result to disks that completed
# writes in the window.
alert: Node Unusual Disk WriteLatency
expr: rate(node_disk_write_time_seconds_total[30m]) / rate(node_disk_writes_completed_total[30m]) > 0.15 and rate(node_disk_writes_completed_total[30m]) > 0
for: 30m
labels:
  severity: critical
annotations:
  summary: Host unusual disk write latency {{ $labels.instance }} - {{ $value }} s
Labels State Active Since Value
alertname="Node Unusual Disk WriteLatency" device="sdq" instance="mega.sgu.ru" job="node" severity="critical" pending 2024-11-26 22:02:24.92329382 +0000 UTC 0.15500000000000114
alertname="Node Unusual Disk WriteLatency" device="sda" instance="geocol.sgu.ru" job="node" severity="critical" firing 2024-11-19 21:41:24 +0000 UTC 0.27512837837836684
alertname="Node Unusual Disk WriteLatency" device="sdb" instance="geocol.sgu.ru" job="node" severity="critical" firing 2024-11-20 05:38:54 +0000 UTC 0.27309954954955734
Service Down (4 active)
# Fires when a systemd unit other than dnf-makecache.service has
# node_systemd_unit_state == 1 for state "failed" during 5 minutes.
alert: Service Down
expr: node_systemd_unit_state{name!="dnf-makecache.service",state=~"failed"} == 1
for: 5m
labels:
  severity: critical
annotations:
  summary: Service {{ $labels.name }} is not responding for 5m on {{ $labels.instance }} with status {{ $labels.state }}
Labels State Active Since Value
alertname="Service Down" instance="arcane.stingr.net" job="node" name="fwupd-refresh.service" severity="critical" state="failed" type="oneshot" firing 2024-11-26 16:25:24.92329382 +0000 UTC 1
alertname="Service Down" instance="mega.sgu.ru" job="node" name="rngd.service" severity="critical" state="failed" type="simple" firing 2024-11-25 05:57:24.92329382 +0000 UTC 1
alertname="Service Down" instance="mega.sgu.ru" job="node" name="systemd-binfmt.service" severity="critical" state="failed" type="oneshot" firing 2024-11-25 05:57:24.92329382 +0000 UTC 1
alertname="Service Down" instance="vuvuzela.sgu.ru" job="node" name="rabbitmq-server.service" severity="critical" state="failed" type="notify" firing 2024-11-25 05:57:24.92329382 +0000 UTC 1
Relocated Sectors (2 active)
# Fires when smart_attribute_raw for SMART attribute id 197 or 198 stays
# above 0 for 3 hours. The summary lists the device model family, model
# name, device name and serial number labels.
alert: Relocated Sectors
expr: smart_attribute_raw{attribute_id=~"197|198"} > 0
for: 3h
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} имеет {{ $value }} переназначенных секторов. {{ $labels.device_model_family }} {{ $labels.device_model_name }} под именем {{ $labels.device_name }} с серийным номером {{ $labels.device_serial_number }}'
Labels State Active Since Value
alertname="Relocated Sectors" attribute_id="197" attribute_name="Current_Pending_Sector" device_model_family="Western Digital Blue" device_model_name="WDC WD40EZRZ-00WN9B0" device_name="sdp" device_serial_number="WD-WCC4E5JFV95F" device_type="sat" instance="mega.sgu.ru" job="smart" severity="critical" pending 2024-11-26 20:50:54.92329382 +0000 UTC 5
alertname="Relocated Sectors" attribute_id="198" attribute_name="Offline_Uncorrectable" device_model_family="Western Digital Blue" device_model_name="WDC WD40EZRZ-00WN9B0" device_name="sdp" device_serial_number="WD-WCC4E5JFV95F" device_type="sat" instance="mega.sgu.ru" job="smart" severity="critical" pending 2024-11-26 20:50:54.92329382 +0000 UTC 2
Big IOWait (0 active)
# Fires when the per-instance average of irate() of CPU time in mode
# "iowait" (1-minute range), expressed as a percentage, stays above 15 for
# 2 minutes.
alert: Big IOWait
expr: avg by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[1m])) * 100 > 15
for: 2m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} have big iowait query: {{ $value }}%'
Host Oom Kill Detected (0 active)
# Fires when node_vmstat_oom_kill increased within the last minute.
# No `for:` hold period is set; severity is "warning".
alert: Host Oom Kill Detected
expr: increase(node_vmstat_oom_kill[1m]) > 0
labels:
  severity: warning
annotations:
  summary: OOM kill detected on {{ $labels.instance }}
HostClockSkew (0 active)
# Fires when the clock offset is beyond +/-0.05 s and its 5-minute
# derivative shows it is not moving back towards zero (>= 0 for a positive
# offset, <= 0 for a negative one), sustained for 2 minutes.
alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
  severity: warning
annotations:
  summary: Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host {{ $labels.instance }}
Node Clock Not Synchronising (0 active)
# Fires when the 1-minute minimum of node_timex_sync_status is 0 and
# node_timex_maxerror_seconds is at least 16, sustained for 2 minutes.
alert: Node Clock Not Synchronising
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
for: 2m
labels:
  severity: critical
annotations:
  summary: Clock not synchronising. Ensure NTP is configured on this host {{ $labels.instance }}
Node Disk Is Missing (0 active)
# Fires when node_btrfs_device_size_bytes has been 0 for 1 minute.
alert: Node Disk Is Missing
expr: node_btrfs_device_size_bytes == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} has a disk {{ $labels.device }} that is not in the file system'
Node Down (0 active)
# Fires when a scrape target of job "node" has reported up == 0 for
# 1 minute.
alert: Node Down
expr: up{job=~"node"} == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: Node {{ $labels.instance }} not responding for more than 1 minutes.
Node Edac Correctable Errors Detected (0 active)
# Fires when node_edac_correctable_errors_total keeps increasing within
# each 1-minute window for 10 minutes.
alert: Node Edac Correctable Errors Detected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 10m
labels:
  severity: critical
annotations:
  summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) {{ $value }}
Node Edac Uncorrectable Errors Detected (0 active)
# Fires when node_edac_uncorrectable_errors_total is above 0.
# No `for:` hold period is set.
alert: Node Edac Uncorrectable Errors Detected
expr: node_edac_uncorrectable_errors_total > 0
labels:
  severity: critical
annotations:
  summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) {{ $value }}
Node High Cpu Load (0 active)
# Fires when CPU usage, computed as 100 minus the per-instance average idle
# percentage over 15 minutes (job "node"), stays above 80 for 15 minutes.
alert: Node High Cpu Load
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{job="node",mode="idle"}[15m])) * 100) > 80
for: 15m
labels:
  severity: critical
annotations:
  summary: Host high CPU load {{ $labels.instance }} - {{ $value }}%
Node Low Disk Space (0 active)
# Fires when the available-space percentage of a filesystem stays below 10
# for 5 minutes. Filesystems whose mountpoint matches the regex `.mnt.*` are
# excluded, and the result is restricted (matching on instance, device and
# mountpoint) to filesystems with node_filesystem_readonly == 0.
alert: Node Low Disk Space
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes{mountpoint!~".mnt.*"} < 10 and on(instance, device, mountpoint) node_filesystem_readonly == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} low disk space. {{ $value }} percent of the {{ $labels.mountpoint }} is free left'
Node Memory Fill Up Soon (0 active)
# Fires when a linear extrapolation of node_memory_MemAvailable_bytes over
# the last 2 hours predicts a value <= 0 one hour (1 * 3600 s) ahead,
# sustained for 2 hours.
# NOTE(review): the `for: 2h` hold period is longer than the 1-hour
# prediction horizon - confirm that this delay is intended.
alert: Node Memory Fill Up Soon
expr: predict_linear(node_memory_MemAvailable_bytes[2h], 1 * 3600) <= 0
for: 2h
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} memory will fill up soon in 1h'
Node Memory Under Pressure (0 active)
# Fires when available memory is below 20% of total and the 5-minute rate
# of node_vmstat_pgmajfault is above 100, sustained for 5 minutes.
alert: Node Memory Under Pressure
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) < 20 and rate(node_vmstat_pgmajfault[5m]) > 100
for: 5m
labels:
  severity: critical
annotations:
  summary: Node {{ $labels.instance }} is under heavy memory pressure. High rate of major page faults. VALUE = {{ $value }}
Node NVMe Drive TooHot (0 active)
# Fires when node_hwmon_temp_celsius for a chip whose name matches
# `nvme.+` stays above 100 for 10 minutes.
alert: Node NVMe Drive TooHot
expr: node_hwmon_temp_celsius{chip=~"nvme.+"} > 100
for: 10m
labels:
  severity: critical
annotations:
  summary: Host physical component too hot (instance {{ $labels.instance }}) {{ $labels.chip }} - {{ $value }}C
Node Out Of Memory (0 active)
# Fires when available memory as a percentage of total memory stays below
# 20 for 5 minutes. The summary reports the current percentage.
alert: Node Out Of Memory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 20
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Node {{ $labels.instance }} memory is filling up (< 20% left). Free now: {{ $value }}%'
Node Overtemperature Alarm (0 active)
# Fires when node_hwmon_temp_crit_alarm_celsius equals 1.
# No `for:` hold period is set.
alert: Node Overtemperature Alarm
expr: node_hwmon_temp_crit_alarm_celsius == 1
labels:
  severity: critical
annotations:
  summary: Host node overtemperature alarm (instance {{ $labels.instance }}) - {{ $value }}
Node Physical Component TooHot (0 active)
# Fires when node_hwmon_temp_celsius for a chip whose name does not match
# `nvme.+` stays above 75 for 10 minutes.
alert: Node Physical Component TooHot
expr: node_hwmon_temp_celsius{chip!~"nvme.+"} > 75
for: 10m
labels:
  severity: critical
annotations:
  summary: Host physical component too hot (instance {{ $labels.instance }}) {{ $labels.chip }} - {{ $value }}C
Node Raid Array Got Inactive (0 active)
# Fires when node_md_state with state "inactive" stays above 0 for
# 10 minutes.
alert: Node Raid Array Got Inactive
expr: node_md_state{state="inactive"} > 0
for: 10m
labels:
  severity: critical
annotations:
  summary: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically
Node Raid Disk Failure (0 active)
# Fires when node_md_disks with state "failed" stays above 0 for
# 10 minutes.
alert: Node Raid Disk Failure
expr: node_md_disks{state="failed"} > 0
for: 10m
labels:
  severity: critical
annotations:
  summary: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap
Node Unusual Disk Read Rate (0 active)
# Fires when the per-instance sum of disk read throughput over 30 minutes,
# divided by 1024 * 1024 (bytes/s to MiB/s), stays above 200 for 30 minutes.
alert: Node Unusual Disk Read Rate
expr: sum by(instance) (rate(node_disk_read_bytes_total[30m])) / 1024 / 1024 > 200
for: 30m
labels:
  severity: critical
annotations:
  summary: Host unusual disk read rate last 30 minutes {{ $labels.instance }} - {{ $value }} mb/s
Node Unusual Disk ReadLatency (0 active)
# Fires when the average time per completed read over 30 minutes
# (read time rate / completed reads rate) stays above 0.3 for 30 minutes.
# The `and ... > 0` clause restricts the result to disks that completed
# reads in the window.
alert: Node Unusual Disk ReadLatency
expr: rate(node_disk_read_time_seconds_total[30m]) / rate(node_disk_reads_completed_total[30m]) > 0.3 and rate(node_disk_reads_completed_total[30m]) > 0
for: 30m
labels:
  severity: critical
annotations:
  summary: Host unusual disk read latency {{ $labels.instance }} - {{ $value }} s
Node Unusual Disk Write Rate (0 active)
# Fires when the per-instance sum of disk write throughput over 30 minutes,
# divided by 1024 * 1024 (bytes/s to MiB/s), stays above 200 for 30 minutes.
alert: Node Unusual Disk Write Rate
expr: sum by(instance) (rate(node_disk_written_bytes_total[30m])) / 1024 / 1024 > 200
for: 30m
labels:
  severity: critical
annotations:
  summary: Host unusual disk write rate last 30 minutes {{ $labels.instance }} - {{ $value }} mb/s
Node Unusual Network ThroughputIn (0 active)
# Fires when the per-instance sum of received network throughput over
# 30 minutes, divided by 1024 * 1024 (bytes/s to MiB/s), stays above 100
# for 15 minutes.
alert: Node Unusual Network ThroughputIn
expr: sum by(instance) (rate(node_network_receive_bytes_total[30m])) / 1024 / 1024 > 100
for: 15m
labels:
  severity: critical
annotations:
  summary: Host unusual network throughput in {{ $labels.instance }} - {{ $value }}
Node Unusual Network ThroughputOut (0 active)
# Fires when the per-instance sum of transmitted network throughput over
# 30 minutes, divided by 1024 * 1024 (bytes/s to MiB/s), stays above 100
# for 15 minutes.
alert: Node Unusual Network ThroughputOut
expr: sum by(instance) (rate(node_network_transmit_bytes_total[30m])) / 1024 / 1024 > 100
for: 15m
labels:
  severity: critical
annotations:
  summary: Host unusual network throughput out {{ $labels.instance }} - {{ $value }}
/etc/prometheus/self.rules.yml > self.rules
Prometheus Target Empty (1 active)
# Fires when prometheus_sd_discovered_targets equals 0.
# No `for:` hold period is set.
alert: Prometheus Target Empty
expr: prometheus_sd_discovered_targets == 0
labels:
  severity: critical
annotations:
  description: Prometheus has no target in service discovery
  summary: Prometheus target empty (instance {{ $labels.instance }})
Labels State Active Since Value
alertname="Prometheus Target Empty" config="contest_windows" instance="localhost" job="prometheus" name="scrape" severity="critical" firing 2024-11-26 22:02:50.755937521 +0000 UTC 0
AlertManager Configuration Failure (0 active)
# Fires when alertmanager_config_last_reload_successful is not 1.
# No `for:` hold period is set.
# The summary template must reference `$labels.instance`; a misspelled
# label name renders as an empty string in the notification.
alert: AlertManager Configuration Failure
expr: alertmanager_config_last_reload_successful != 1
labels:
  severity: critical
annotations:
  summary: AlertManager configuration reload failure (instance {{ $labels.instance }})
Prometheus Configuration Failure (0 active)
# Fires when prometheus_config_last_reload_successful is not 1.
# No `for:` hold period is set.
alert: Prometheus Configuration Failure
expr: prometheus_config_last_reload_successful != 1
labels:
  severity: critical
annotations:
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
Prometheus Large Scrape (0 active)
# Fires when prometheus_target_scrapes_exceeded_sample_limit_total
# increased by more than 10 within 10 minutes, sustained for 5 minutes.
alert: Prometheus Large Scrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
  severity: critical
annotations:
  description: Prometheus has many scrapes that exceed the sample limit
  summary: Prometheus large scrape (instance {{ $labels.instance }})
Prometheus NotConnected To Alertmanager (0 active)
# Fires when prometheus_notifications_alertmanagers_discovered is below 1.
# No `for:` hold period is set.
alert: Prometheus NotConnected To Alertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
labels:
  severity: critical
annotations:
  summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
Prometheus Rule Evaluation Failures (0 active)
# Fires when prometheus_rule_evaluation_failures_total increased within the
# last 3 minutes. No `for:` hold period is set.
alert: Prometheus Rule Evaluation Failures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
  severity: critical
annotations:
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
Prometheus Rule Evaluation Slow (0 active)
# Fires when a rule group's last evaluation duration exceeds its configured
# interval (last_duration_seconds > interval_seconds) for 5 minutes.
alert: Prometheus Rule Evaluation Slow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: critical
annotations:
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
Prometheus Target Scrape Duplicate (0 active)
# Fires when prometheus_target_scrapes_sample_duplicate_timestamp_total
# increased within the last 5 minutes. No `for:` hold period is set.
alert: Prometheus Target Scrape Duplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
  severity: critical
annotations:
  description: Prometheus has many samples rejected due to duplicate timestamps but different values
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
Prometheus Template Text Expansion Failures (0 active)
# Fires when prometheus_template_text_expansion_failures_total increased
# within the last 3 minutes. No `for:` hold period is set.
alert: Prometheus Template Text Expansion Failures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
labels:
  severity: critical
annotations:
  summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
/etc/prometheus/ups.rules.yml > ups.rules
Low UPS battery capacity (1 active)
# Fires when upsEstimatedChargeRemaining stays below 75 for 5 minutes.
# The summary reports the value as a percentage.
alert: Low UPS battery capacity
expr: upsEstimatedChargeRemaining < 75
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} has low battery capacity: {{ $value }}%'
Labels State Active Since Value
alertname="Low UPS battery capacity" building="12 корпус" instance="b12f3-ups.ups" job="upssnmp" severity="critical" firing 2024-11-25 05:57:30.831221517 +0000 UTC 0
UPS not response (2 active)
# Fires when a scrape target of job "apcsnmp" or "upssnmp" has reported
# up == 0 for 5 minutes.
alert: UPS not response
expr: up{job=~"apcsnmp|upssnmp"} == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: UPS {{ $labels.instance }} not responding for more than 5 minutes.
Labels State Active Since Value
alertname="UPS not response" building="9 корпус" instance="Symmetra.ups" job="apcsnmp" severity="critical" firing 2024-11-25 05:57:30.831221517 +0000 UTC 0
alertname="UPS not response" building="10 корпус" instance="b10f1-ups.ups" job="upssnmp" severity="critical" firing 2024-11-25 05:57:30.831221517 +0000 UTC 0
UPS too hot! (2 active)
# Fires when upsBatteryTemperature stays above 45 for 5 minutes.
# The summary reports the measured value in degrees Celsius.
alert: UPS too hot!
expr: upsBatteryTemperature > 45
for: 5m
labels:
  severity: critical
annotations:
  summary: Too hot in {{ $labels.instance }}. Temperature is {{ $value }}C
Labels State Active Since Value
alertname="UPS too hot!" building="15 корпус" instance="build15-ups.ups" job="upssnmp" severity="critical" firing 2024-11-25 05:57:30.831221517 +0000 UTC 50
alertname="UPS too hot!" building="Бассейн СГУ" instance="swimingpool-ups.ups" job="upssnmp" severity="critical" firing 2024-11-25 05:58:00.831221517 +0000 UTC 50
APC UPS low Input voltage (0 active)
# Fires when upsAdvInputLineVoltage stays below 200 for 5 minutes.
# The summary reports the measured value in volts.
alert: APC UPS low Input voltage
expr: upsAdvInputLineVoltage < 200
for: 5m
labels:
  severity: critical
annotations:
  summary: 'APC UPS {{ $labels.instance }} has low voltage: {{ $value }}V'
APC UPS too hot! (0 active)
# Fires when iemStatusProbeCurrentTemp stays above 30 for 5 minutes.
# The summary reports the measured value in degrees Celsius.
alert: APC UPS too hot!
expr: iemStatusProbeCurrentTemp > 30
for: 5m
labels:
  severity: critical
annotations:
  summary: Too hot in {{ $labels.instance }}. Temperature is {{ $value }}C
Low APC UPS battery capacity (0 active)
# Fires when upsAdvBatteryCapacity stays below 75 for 5 minutes.
# The summary reports the value as a percentage.
alert: Low APC UPS battery capacity
expr: upsAdvBatteryCapacity < 75
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} has low battery capacity: {{ $value }}%'
Server Room In FIRE! (0 active)
# Fires when iemStatusProbeCurrentTemp on instance "Symmetra.ups" stays
# above 24 for 10 minutes. The summary reports the value in degrees Celsius.
alert: Server Room In FIRE!
expr: iemStatusProbeCurrentTemp{instance="Symmetra.ups"} > 24
for: 10m
labels:
  severity: critical
annotations:
  summary: 'Server room too hot: {{ $value }}C'
There is no ELECTRICITY in the Server Room! (0 active)
# Fires when upsBasicInputPhase on instance "Symmetra.ups" stays below 3
# for 5 minutes. The summary reports the value as the number of working
# phases.
alert: There is no ELECTRICITY in the Server Room!
expr: upsBasicInputPhase{instance="Symmetra.ups"} < 3
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Working phases in Symmetra at the moment: {{ $value }}'
UPS big load (0 active)
# Fires when upsOutputPercentLoad for output line index "1" stays above 75
# for 5 minutes. The summary reports the value as a percentage.
alert: UPS big load
expr: upsOutputPercentLoad{upsOutputLineIndex="1"} > 75
for: 5m
labels:
  severity: critical
annotations:
  summary: 'UPS {{ $labels.instance }} has big load: {{ $value }}%'
UPS low Input voltage (0 active)
# Fires when upsInputVoltage stays below 200 for 5 minutes.
# The summary reports the measured value in volts.
alert: UPS low Input voltage
expr: upsInputVoltage < 200
for: 5m
labels:
  severity: critical
annotations:
  summary: 'UPS {{ $labels.instance }} has low input voltage: {{ $value }}V'
UPS low Output voltage (0 active)
# Fires when upsOutputVoltage for output line index "1" stays below 200
# for 5 minutes. The summary reports the measured value in volts.
alert: UPS low Output voltage
expr: upsOutputVoltage{upsOutputLineIndex="1"} < 200
for: 5m
labels:
  severity: critical
annotations:
  summary: 'UPS {{ $labels.instance }} has low output voltage: {{ $value }}V'
/etc/prometheus/windows.rules.yml > windows.rules
Windows Server Collector Error (0 active)
# Fires when windows_exporter_collector_success for job "node" equals 0.
# No `for:` hold period is set.
# NOTE(review): the Windows metrics are selected with job="node" - confirm
# that the Windows exporter targets are really scraped under that job name.
alert: Windows Server Collector Error
expr: windows_exporter_collector_success{job="node"} == 0
labels:
  severity: critical
annotations:
  summary: Windows Server collector Error on {{ $labels.instance }}. Collector {{ $labels.collector }} was not successful
Windows Server Cpu Usage (0 active)
# Fires when CPU usage, computed as 100 minus the per-instance average idle
# percentage over 10 minutes (job "node"), stays above 80 for 10 minutes.
alert: Windows Server Cpu Usage
expr: 100 - (avg by(instance) (rate(windows_cpu_time_total{job="node",mode="idle"}[10m])) * 100) > 80
for: 10m
labels:
  severity: critical
annotations:
  summary: Windows Server CPU Usage on {{ $labels.instance }} is more than 80%
Windows Server Disk Space Usage (0 active)
# Fires when used disk space (100 - 100 * free / size) stays above 90 for
# 10 minutes. Both byte counts are divided by 1024 * 1024, and the
# `> 20000` filter keeps only volumes whose scaled size exceeds 20000.
alert: Windows Server Disk Space Usage
expr: 100 - 100 * ((windows_logical_disk_free_bytes{job="node"} / 1024 / 1024) / ((windows_logical_disk_size_bytes{job="node"} / 1024 / 1024) > 20000)) > 90
for: 10m
labels:
  severity: critical
annotations:
  summary: Windows Server disk Space Usage on {{ $labels.instance }} is more than 90% on {{ $labels.volume }}
Windows Server Memory Usage (0 active)
# Fires when memory usage, computed as 100 minus the free/total physical
# memory percentage (job "node"), stays above 80 for 5 minutes.
alert: Windows Server Memory Usage
expr: 100 - ((windows_os_physical_memory_free_bytes{job="node"} / windows_cs_physical_memory_bytes{job="node"}) * 100) > 80
for: 5m
labels:
  severity: critical
annotations:
  summary: Windows Server memory Usage on {{ $labels.instance }} is more than 80%
Windows Server Service Status (0 active)
# Fires when windows_service_status with status "ok" (job "node") is not 1
# for 1 minute.
alert: Windows Server Service Status
expr: windows_service_status{job="node",status="ok"} != 1
for: 1m
labels:
  severity: critical
annotations:
  summary: Windows Server service Status on {{ $labels.instance }}. {{ $labels.name }} state is not OK