/etc/prometheus/blackbox.rules.yml > blackbox.rules
|
|
|
|
|
|
|
/etc/prometheus/custom.rules.yml > custom.rules
|
Labels |
State |
Active Since |
Value |
alertname="Switch Down"
building="12 корпус"
instance="b12f1pool7"
job="upTime"
severity="critical"
|
firing |
2025-01-18 01:48:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f1pool7 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3r310"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3r310 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="edgecore-01"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- edgecore-01 not responding for more than 1 minutes
|
alertname="Switch Down"
building="12 корпус"
instance="b12f2r225-cluster"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f2r225-cluster not responding for more than 1 minutes
|
alertname="Switch Down"
building="8 корпус"
instance="b8f1pool4-new"
job="upTime"
severity="critical"
|
firing |
2024-12-12 08:58:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b8f1pool4-new not responding for more than 1 minutes
|
alertname="Switch Down"
building="12 корпус"
instance="b12f1pool6"
job="upTime"
severity="critical"
|
firing |
2024-12-17 15:11:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f1pool6 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3pool2b"
job="upTime"
severity="critical"
|
firing |
2024-12-19 04:37:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3pool2b not responding for more than 1 minutes
|
alertname="Switch Down"
building="Колледж Яблочкова"
instance="b19f1r111"
job="upTime"
severity="critical"
|
firing |
2024-12-16 09:19:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b19f1r111 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f1r108-tplink-test"
job="upTime"
severity="critical"
|
firing |
2024-11-28 09:07:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f1r108-tplink-test not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="mikrotik-9-1"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- mikrotik-9-1 not responding for more than 1 minutes
|
|
Labels |
State |
Active Since |
Value |
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap8f0w3"
severity="critical"
site_name="build8 (bi39um96)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-12-14 08:06:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap8f0w3 from site build8 (bi39um96) is down
|
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap9f1r108-test"
severity="critical"
site_name="test (qhujeoql)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-12-14 08:06:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap9f1r108-test from site test (qhujeoql) is down
|
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap9f1r108-test2"
severity="critical"
site_name="test (qhujeoql)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-12-14 08:06:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap9f1r108-test2 from site test (qhujeoql) is down
|
|
Labels |
State |
Active Since |
Value |
alertname="Squid Server Down"
instance="proxy.sgu.ru"
job="squid"
severity="critical"
|
pending |
2025-01-18 02:03:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Squid on proxy.sgu.ru not responding for more than 10 minutes
|
|
alert: Bind Server Down
expr: bind_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.job }} on {{ $labels.instance }} not responding for more than 5 minutes'
|
|
|
|
|
|
|
|
|
alert: Nginx is down
expr: nginx_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Nginx is down on the host {{ $labels.instance }} for more than 5 minutes
|
|
|
|
/etc/prometheus/mysql.rules.yml > mysql.rules
|
alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: MySQL instance is down on {{ $labels.instance }}
  summary: MySQL down (instance {{ $labels.instance }})
|
|
|
|
|
/etc/prometheus/node.rules.yml > node.rules
|
Labels |
State |
Active Since |
Value |
alertname="Node SWAP Out Of Memory"
instance="proxy.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-11-16 05:51:54 +0000 UTC |
9.836678427066053 |
Annotations |
- summary
- Node proxy.sgu.ru swap memory is filing up (< 20% left). Free now: 9.836678427066053%
|
alertname="Node SWAP Out Of Memory"
instance="walrus.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-12-14 11:29:24.92329382 +0000 UTC |
2.2286425727093566 |
Annotations |
- summary
- Node walrus.sgu.ru swap memory is filing up (< 20% left). Free now: 2.2286425727093566%
|
|
alert: Relocated Sectors
expr: smart_attribute_raw{attribute_id=~"197|198"} > 0
for: 3h
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} имеет {{ $value }} переназначенных секторов. {{ $labels.device_model_family }} {{ $labels.device_model_name }} под именем {{ $labels.device_name }} с серийным номером {{ $labels.device_serial_number }}'
Labels |
State |
Active Since |
Value |
alertname="Relocated Sectors"
attribute_id="198"
attribute_name="Offline_Uncorrectable"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2025-01-17 20:41:54.92329382 +0000 UTC |
3 |
Annotations |
- summary
- mega.sgu.ru имеет 3 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
alertname="Relocated Sectors"
attribute_id="197"
attribute_name="Current_Pending_Sector"
device_model_family="Seagate Desktop HDD.15"
device_model_name="ST4000DM000-1F2168"
device_name="sdq"
device_serial_number="W300GVBF"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2025-01-17 20:41:54.92329382 +0000 UTC |
80 |
Annotations |
- summary
- mega.sgu.ru имеет 80 переназначенных секторов. Seagate Desktop HDD.15 ST4000DM000-1F2168 под именем sdq с серийным номером W300GVBF
|
alertname="Relocated Sectors"
attribute_id="197"
attribute_name="Current_Pending_Sector"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2025-01-17 20:41:54.92329382 +0000 UTC |
6 |
Annotations |
- summary
- mega.sgu.ru имеет 6 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
alertname="Relocated Sectors"
attribute_id="198"
attribute_name="Offline_Uncorrectable"
device_model_family="Seagate Desktop HDD.15"
device_model_name="ST4000DM000-1F2168"
device_name="sdq"
device_serial_number="W300GVBF"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
firing |
2025-01-17 20:41:54.92329382 +0000 UTC |
80 |
Annotations |
- summary
- mega.sgu.ru имеет 80 переназначенных секторов. Seagate Desktop HDD.15 ST4000DM000-1F2168 под именем sdq с серийным номером W300GVBF
|
|
Labels |
State |
Active Since |
Value |
alertname="Service Down"
instance="vuvuzela.sgu.ru"
job="node"
name="rabbitmq-server.service"
severity="critical"
state="failed"
type="notify"
|
firing |
2024-11-25 05:57:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service rabbitmq-server.service is not responding for 5m on vuvuzela.sgu.ru with status failed
|
alertname="Service Down"
instance="arcane.stingr.net"
job="node"
name="fwupd-refresh.service"
severity="critical"
state="failed"
type="oneshot"
|
firing |
2025-01-17 22:30:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service fwupd-refresh.service is not responding for 5m on arcane.stingr.net with status failed
|
alertname="Service Down"
instance="mega.sgu.ru"
job="node"
name="rngd.service"
severity="critical"
state="failed"
type="simple"
|
firing |
2024-12-21 13:55:54.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service rngd.service is not responding for 5m on mega.sgu.ru with status failed
|
alertname="Service Down"
instance="vgw11.sgu.ru"
job="node"
name="borgv-backup@all.service"
severity="critical"
state="failed"
type="notify"
|
firing |
2025-01-17 20:00:54.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service borgv-backup@all.service is not responding for 5m on vgw11.sgu.ru with status failed
|
|
|
|
|
|
|
alert: Node Down
expr: up{job=~"node"} == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: Node {{ $labels.instance }} not responding for more than 1 minutes.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/self.rules.yml > self.rules
|
Labels |
State |
Active Since |
Value |
alertname="Prometheus Target Empty"
config="contest_windows"
instance="localhost"
job="prometheus"
name="scrape"
severity="critical"
|
firing |
2025-01-18 00:02:20.755937521 +0000 UTC |
0 |
Annotations |
- description
- Prometheus has no target in service discovery
- summary
- Prometheus target empty (instance localhost)
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/ups.rules.yml > ups.rules
|
Labels |
State |
Active Since |
Value |
alertname="Low UPS battery capacity"
building="12 корпус"
instance="b12f3-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2025-01-17 06:00:00.831221517 +0000 UTC |
0 |
Annotations |
- summary
- b12f3-ups.ups has low battery capacity: 0%
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS not response"
building="10 корпус"
instance="b10f1-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- UPS b10f1-ups.ups not responding for more than 5 minutes.
|
alertname="UPS not response"
building="9 корпус"
instance="Symmetra.ups"
job="apcsnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- UPS Symmetra.ups not responding for more than 5 minutes.
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS too hot!"
building="15 корпус"
instance="build15-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
51 |
Annotations |
- summary
- Too hot in build15-ups.ups. Temperatur is 51C
|
alertname="UPS too hot!"
building="Бассейн СГУ"
instance="swimingpool-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2025-01-17 02:50:00.831221517 +0000 UTC |
48 |
Annotations |
- summary
- Too hot in swimingpool-ups.ups. Temperatur is 48C
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/windows.rules.yml > windows.rules
|
|
|
|
|
|