/etc/prometheus/blackbox.rules.yml > blackbox.rules
|
|
|
|
|
|
|
/etc/prometheus/custom.rules.yml > custom.rules
|
Labels |
State |
Active Since |
Value |
alertname="Omada device down"
device="hostel10-ap8-04"
device_type="ap"
instance="localhost:14274"
ip="10.1.51.43"
job="omada"
mac="54-AF-97-8C-96-FA"
model="EAP265 HD"
severity="critical"
site="Hostel10"
site_id="636dd2ddbfb65530449e09d9"
version="5.0.6"
|
firing |
2024-07-08 15:39:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device ap hostel10-ap8-04 from site Hostel10 is down
|
alertname="Omada device down"
device="hostel10-ap3-01"
device_type="ap"
instance="localhost:14274"
ip="10.1.51.11"
job="omada"
mac="54-AF-97-8C-96-D0"
model="EAP265 HD"
severity="critical"
site="Hostel10"
site_id="636dd2ddbfb65530449e09d9"
version="5.0.6"
|
firing |
2024-07-08 15:43:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device ap hostel10-ap3-01 from site Hostel10 is down
|
|
Labels |
State |
Active Since |
Value |
alertname="Switch Down"
instance="lyaf-remote.sgu.ru"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- lyaf-remote.sgu.ru not responding for more than 1 minutes
|
alertname="Switch Down"
building="8 корпус"
instance="b13-cpod3"
job="snmp"
severity="critical"
|
pending |
2024-09-19 05:31:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b13-cpod3 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3pool2b"
job="snmp"
severity="critical"
|
firing |
2024-08-12 12:42:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3pool2b not responding for more than 1 minutes
|
alertname="Switch Down"
building="5 корпус"
instance="b5f1r24-new"
job="snmp"
severity="critical"
|
firing |
2024-09-19 05:28:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b5f1r24-new not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="hostel10-f5"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- hostel10-f5 not responding for more than 1 minutes
|
alertname="Switch Down"
building="Общежитие №1"
instance="hostel1-03"
job="snmp"
severity="critical"
|
firing |
2024-09-03 12:02:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- hostel1-03 not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="b9f1r108-testnekrasov"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f1r108-testnekrasov not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="b12-priem"
job="snmp"
severity="critical"
|
firing |
2024-08-28 07:16:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12-priem not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3r310"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3r310 not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="mikrotik-9-1"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- mikrotik-9-1 not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="b9f1r108-tplink-rad"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f1r108-tplink-rad not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="edgecore-01"
job="snmp"
severity="critical"
|
firing |
2024-06-28 09:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- edgecore-01 not responding for more than 1 minutes
|
|
Labels |
State |
Active Since |
Value |
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap12f3r333"
severity="critical"
site_name="build12 (aedjxheo)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-09-04 04:51:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap12f3r333 from site build12 (aedjxheo) is down
|
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap12-mirantis-wifi-04"
severity="critical"
site_name="build12 (aedjxheo)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-08-23 21:23:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap12-mirantis-wifi-04 from site build12 (aedjxheo) is down
|
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap18-r110"
severity="critical"
site_name="build18 (hl5m5as6)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-08-23 21:23:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap18-r110 from site build18 (hl5m5as6) is down
|
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap8f0w3"
severity="critical"
site_name="build8 (bi39um96)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-08-23 21:23:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap8f0w3 from site build8 (bi39um96) is down
|
|
Labels |
State |
Active Since |
Value |
alertname="Borg Backup Missing"
host="clusternew"
instance="slon.sgu.ru"
job="borgbackup"
repo="all"
severity="critical"
|
pending |
2024-09-19 05:31:49.196947433 +0000 UTC |
900823.1960000992 |
Annotations |
- summary
- Backup on slon.sgu.ru not created last 3 days
|
|
alert: Bind Server Down
expr: bind_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: '{{ $labels.job }} on {{ $labels.instance }} not responding for more than 5 minutes'
|
|
|
|
|
|
|
|
alert: Nginx is down
expr: nginx_up == 0
for: 5m
labels:
severity: critical
annotations:
summary: Nginx is down on the host {{ $labels.instance }} for more than 5 minutes
|
|
|
|
/etc/prometheus/mysql.rules.yml > mysql.rules
|
alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
severity: critical
annotations:
description: MySQL instance is down on {{ $labels.instance }}
summary: MySQL down (instance {{ $labels.instance }})
|
|
|
|
|
/etc/prometheus/node.rules.yml > node.rules
|
Labels |
State |
Active Since |
Value |
alertname="Node Physical Component TooHot"
chip="thermal_thermal_zone3"
instance="walrus.sgu.ru"
job="node"
sensor="temp0"
severity="critical"
|
firing |
2024-09-19 05:03:54.92329382 +0000 UTC |
81 |
Annotations |
- summary
- Host physical component too hot (instance walrus.sgu.ru) thermal_thermal_zone3 - 81C
|
alertname="Node Physical Component TooHot"
chip="thermal_thermal_zone4"
instance="walrus.sgu.ru"
job="node"
sensor="temp1"
severity="critical"
|
firing |
2024-09-19 05:03:54.92329382 +0000 UTC |
81 |
Annotations |
- summary
- Host physical component too hot (instance walrus.sgu.ru) thermal_thermal_zone4 - 81C
|
alertname="Node Physical Component TooHot"
chip="thermal_thermal_zone3"
instance="walrus.sgu.ru"
job="node"
sensor="temp1"
severity="critical"
|
firing |
2024-09-19 05:03:54.92329382 +0000 UTC |
81 |
Annotations |
- summary
- Host physical component too hot (instance walrus.sgu.ru) thermal_thermal_zone3 - 81C
|
alertname="Node Physical Component TooHot"
chip="thermal_thermal_zone4"
instance="walrus.sgu.ru"
job="node"
sensor="temp0"
severity="critical"
|
firing |
2024-09-19 05:03:54.92329382 +0000 UTC |
81 |
Annotations |
- summary
- Host physical component too hot (instance walrus.sgu.ru) thermal_thermal_zone4 - 81C
|
|
Labels |
State |
Active Since |
Value |
alertname="Node SWAP Out Of Memory"
instance="walrus.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-09-07 19:43:24.92329382 +0000 UTC |
0.0018119820651922537 |
Annotations |
- summary
- Node walrus.sgu.ru swap memory is filing up (< 20% left). Free now: 0.0018119820651922537%
|
|
alert: Relocated Sectors
expr: smart_attribute_raw{attribute_id=~"197|198"} > 0
for: 3h
labels:
severity: critical
annotations:
summary: '{{ $labels.instance }} имеет {{ $value }} переназначенных секторов. {{ $labels.device_model_family }} {{ $labels.device_model_name }} под именем {{ $labels.device_name }} с серийным номером {{ $labels.device_serial_number }}'
Labels |
State |
Active Since |
Value |
alertname="Relocated Sectors"
attribute_id="198"
attribute_name="Offline_Uncorrectable"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2024-09-18 22:16:54.92329382 +0000 UTC |
2 |
Annotations |
- summary
- mega.sgu.ru имеет 2 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
alertname="Relocated Sectors"
attribute_id="197"
attribute_name="Current_Pending_Sector"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2024-09-18 22:16:54.92329382 +0000 UTC |
5 |
Annotations |
- summary
- mega.sgu.ru имеет 5 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
|
Labels |
State |
Active Since |
Value |
alertname="Service Down"
instance="arcane.stingr.net"
job="node"
name="fwupd-refresh.service"
severity="critical"
state="failed"
type="oneshot"
|
firing |
2024-09-19 01:49:54.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service fwupd-refresh.service is not responding for 5m on arcane.stingr.net with status failed
|
alertname="Service Down"
instance="mega.sgu.ru"
job="node"
name="rngd.service"
severity="critical"
state="failed"
type="simple"
|
firing |
2024-08-18 10:42:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service rngd.service is not responding for 5m on mega.sgu.ru with status failed
|
|
|
|
|
|
|
alert: Node Down
expr: up{job=~"node"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: Node {{ $labels.instance }} not responding for more than 1 minutes.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/self.rules.yml > self.rules
|
Labels |
State |
Active Since |
Value |
alertname="Prometheus Target Empty"
config="contest_windows"
instance="localhost"
job="prometheus"
name="scrape"
severity="critical"
|
firing |
2024-09-19 05:30:50.755937521 +0000 UTC |
0 |
Annotations |
- description
- Prometheus has no target in service discovery
- summary
- Prometheus target empty (instance localhost)
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/ups.rules.yml > ups.rules
|
Labels |
State |
Active Since |
Value |
alertname="Low UPS battery capacity"
building="12 корпус"
instance="b12f3-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-08-30 09:38:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- b12f3-ups.ups has low battery capacity: 0%
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS not response"
building="9 корпус"
instance="Symmetra.ups"
job="apcsnmp"
severity="critical"
|
firing |
2024-06-28 09:52:00.831221517 +0000 UTC |
0 |
Annotations |
- summary
- UPS Symmetra.ups not responding for more than 5 minutes.
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS too hot!"
building="15 корпус"
instance="build15-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-06-28 09:52:00.831221517 +0000 UTC |
49 |
Annotations |
- summary
- Too hot in build15-ups.ups. Temperatur is 49C
|
alertname="UPS too hot!"
building="Бассейн СГУ"
instance="swimingpool-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-09-18 19:51:00.831221517 +0000 UTC |
47 |
Annotations |
- summary
- Too hot in swimingpool-ups.ups. Temperatur is 47C
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/windows.rules.yml > windows.rules
|
|
|
|
|
|