/etc/prometheus/blackbox.rules.yml > blackbox.rules
|
|
|
|
|
|
|
/etc/prometheus/custom.rules.yml > custom.rules
|
Labels |
State |
Active Since |
Value |
alertname="Omada device down"
device="b9f1r108-tplink-test"
device_type="switch"
instance="localhost:14274"
ip="10.0.1.64"
job="omada"
mac="34-60-F9-DA-E1-F3"
model="SG3428"
severity="critical"
site="BuildTest"
site_id="6719ddf04982f17271fd0565"
version="2.30.0"
|
firing |
2024-11-26 13:55:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device switch b9f1r108-tplink-test from site BuildTest is down
|
alertname="Omada device down"
device="b9f1r108-tplinksfp1"
device_type="switch"
instance="localhost:14274"
ip="10.0.1.56"
job="omada"
mac="34-60-F9-24-72-42"
model="SG3452X"
severity="critical"
site="BuildTest"
site_id="6719ddf04982f17271fd0565"
version="1.20.1"
|
firing |
2024-11-26 13:55:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device switch b9f1r108-tplinksfp1 from site BuildTest is down
|
alertname="Omada device down"
device="9C-A2-F4-71-74-16"
device_type="switch"
instance="localhost:14274"
ip="10.0.1.66"
job="omada"
mac="9C-A2-F4-71-74-16"
model="SG3452XP"
severity="critical"
site="BuildTest"
site_id="6719ddf04982f17271fd0565"
version="2.20.3"
|
firing |
2024-11-26 13:55:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device switch 9C-A2-F4-71-74-16 from site BuildTest is down
|
|
Labels |
State |
Active Since |
Value |
alertname="Switch Down"
building="9 корпус"
instance="b19f1r111-hp"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b19f1r111-hp not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="b9f1r108-qtech2"
job="upTime"
severity="critical"
|
firing |
2024-11-26 05:23:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f1r108-qtech2 not responding for more than 1 minutes
|
alertname="Switch Down"
building="14 корпус"
instance="swimingpool-dlink"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- swimingpool-dlink not responding for more than 1 minutes
|
alertname="Switch Down"
building="12 корпус"
instance="b12f1pool8"
job="upTime"
severity="critical"
|
firing |
2024-11-26 22:25:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f1pool8 not responding for more than 1 minutes
|
alertname="Switch Down"
building="12 корпус"
instance="b12f1pool6"
job="upTime"
severity="critical"
|
firing |
2024-11-25 07:24:49.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f1pool6 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3pool2b"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3pool2b not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f3r310"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f3r310 not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="edgecore-01"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- edgecore-01 not responding for more than 1 minutes
|
alertname="Switch Down"
building="12 корпус"
instance="b12f2r225-cluster"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b12f2r225-cluster not responding for more than 1 minutes
|
alertname="Switch Down"
building="9 корпус"
instance="b9f1r108-tplink-test"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- b9f1r108-tplink-test not responding for more than 1 minutes
|
alertname="Switch Down"
building="1 корпус"
instance="mikrotik-9-1"
job="upTime"
severity="critical"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- mikrotik-9-1 not responding for more than 1 minutes
|
|
Labels |
State |
Active Since |
Value |
alertname="Unifi device down"
instance="localhost:9130"
job="unifi"
name="ap8f0w3"
severity="critical"
site_name="build8 (bi39um96)"
source="https://unifi.sgu.ru:8443"
type="uap"
|
firing |
2024-11-25 05:57:19.196947433 +0000 UTC |
0 |
Annotations |
- summary
- Device uap ap8f0w3 from site build8 (bi39um96) is down
|
|
alert: Bind Server Down
expr: bind_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: '{{ $labels.job }} on {{ $labels.instance }} not responding for more than 5 minutes'
|
|
|
|
|
|
|
|
|
alert: Nginx is down
expr: nginx_up == 0
for: 5m
labels:
  severity: critical
annotations:
  summary: Nginx is down on the host {{ $labels.instance }} for more than 5 minutes
|
|
|
|
/etc/prometheus/mysql.rules.yml > mysql.rules
|
alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: MySQL instance is down on {{ $labels.instance }}
  summary: MySQL down (instance {{ $labels.instance }})
|
|
|
|
|
/etc/prometheus/node.rules.yml > node.rules
|
Labels |
State |
Active Since |
Value |
alertname="Node SWAP Out Of Memory"
instance="proxy.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-11-16 05:51:54 +0000 UTC |
0.5260470037684458 |
Annotations |
- summary
- Node proxy.sgu.ru swap memory is filing up (< 20% left). Free now: 0.5260470037684458%
|
alertname="Node SWAP Out Of Memory"
instance="walrus.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-09-07 19:43:24 +0000 UTC |
0.018548974298941756 |
Annotations |
- summary
- Node walrus.sgu.ru swap memory is filing up (< 20% left). Free now: 0.018548974298941756%
|
|
Labels |
State |
Active Since |
Value |
alertname="Node Unusual Disk WriteLatency"
device="sdq"
instance="mega.sgu.ru"
job="node"
severity="critical"
|
pending |
2024-11-26 22:02:24.92329382 +0000 UTC |
0.15500000000000114 |
Annotations |
- summary
- Host unusual disk write latency mega.sgu.ru - 0.15500000000000114 s
|
alertname="Node Unusual Disk WriteLatency"
device="sda"
instance="geocol.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-11-19 21:41:24 +0000 UTC |
0.27512837837836684 |
Annotations |
- summary
- Host unusual disk write latency geocol.sgu.ru - 0.27512837837836684 s
|
alertname="Node Unusual Disk WriteLatency"
device="sdb"
instance="geocol.sgu.ru"
job="node"
severity="critical"
|
firing |
2024-11-20 05:38:54 +0000 UTC |
0.27309954954955734 |
Annotations |
- summary
- Host unusual disk write latency geocol.sgu.ru - 0.27309954954955734 s
|
|
Labels |
State |
Active Since |
Value |
alertname="Service Down"
instance="arcane.stingr.net"
job="node"
name="fwupd-refresh.service"
severity="critical"
state="failed"
type="oneshot"
|
firing |
2024-11-26 16:25:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service fwupd-refresh.service is not responding for 5m on arcane.stingr.net with status failed
|
alertname="Service Down"
instance="mega.sgu.ru"
job="node"
name="rngd.service"
severity="critical"
state="failed"
type="simple"
|
firing |
2024-11-25 05:57:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service rngd.service is not responding for 5m on mega.sgu.ru with status failed
|
alertname="Service Down"
instance="mega.sgu.ru"
job="node"
name="systemd-binfmt.service"
severity="critical"
state="failed"
type="oneshot"
|
firing |
2024-11-25 05:57:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service systemd-binfmt.service is not responding for 5m on mega.sgu.ru with status failed
|
alertname="Service Down"
instance="vuvuzela.sgu.ru"
job="node"
name="rabbitmq-server.service"
severity="critical"
state="failed"
type="notify"
|
firing |
2024-11-25 05:57:24.92329382 +0000 UTC |
1 |
Annotations |
- summary
- Service rabbitmq-server.service is not responding for 5m on vuvuzela.sgu.ru with status failed
|
|
alert: Relocated Sectors
expr: smart_attribute_raw{attribute_id=~"197|198"} > 0
for: 3h
labels:
  severity: critical
annotations:
  summary: '{{ $labels.instance }} имеет {{ $value }} переназначенных секторов. {{ $labels.device_model_family }} {{ $labels.device_model_name }} под именем {{ $labels.device_name }} с серийным номером {{ $labels.device_serial_number }}'
Labels |
State |
Active Since |
Value |
alertname="Relocated Sectors"
attribute_id="197"
attribute_name="Current_Pending_Sector"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
pending |
2024-11-26 20:50:54.92329382 +0000 UTC |
5 |
Annotations |
- summary
- mega.sgu.ru имеет 5 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
alertname="Relocated Sectors"
attribute_id="198"
attribute_name="Offline_Uncorrectable"
device_model_family="Western Digital Blue"
device_model_name="WDC WD40EZRZ-00WN9B0"
device_name="sdp"
device_serial_number="WD-WCC4E5JFV95F"
device_type="sat"
instance="mega.sgu.ru"
job="smart"
severity="critical"
|
pending |
2024-11-26 20:50:54.92329382 +0000 UTC |
2 |
Annotations |
- summary
- mega.sgu.ru имеет 2 переназначенных секторов. Western Digital Blue WDC WD40EZRZ-00WN9B0 под именем sdp с серийным номером WD-WCC4E5JFV95F
|
|
|
|
|
|
|
alert: Node Down
expr: up{job=~"node"} == 0
for: 1m
labels:
  severity: critical
annotations:
  summary: Node {{ $labels.instance }} not responding for more than 1 minutes.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/self.rules.yml > self.rules
|
Labels |
State |
Active Since |
Value |
alertname="Prometheus Target Empty"
config="contest_windows"
instance="localhost"
job="prometheus"
name="scrape"
severity="critical"
|
firing |
2024-11-26 22:02:50.755937521 +0000 UTC |
0 |
Annotations |
- description
- Prometheus has no target in service discovery
- summary
- Prometheus target empty (instance localhost)
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/ups.rules.yml > ups.rules
|
Labels |
State |
Active Since |
Value |
alertname="Low UPS battery capacity"
building="12 корпус"
instance="b12f3-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- b12f3-ups.ups has low battery capacity: 0%
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS not response"
building="9 корпус"
instance="Symmetra.ups"
job="apcsnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- UPS Symmetra.ups not responding for more than 5 minutes.
|
alertname="UPS not response"
building="10 корпус"
instance="b10f1-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
0 |
Annotations |
- summary
- UPS b10f1-ups.ups not responding for more than 5 minutes.
|
|
Labels |
State |
Active Since |
Value |
alertname="UPS too hot!"
building="15 корпус"
instance="build15-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:57:30.831221517 +0000 UTC |
50 |
Annotations |
- summary
- Too hot in build15-ups.ups. Temperatur is 50C
|
alertname="UPS too hot!"
building="Бассейн СГУ"
instance="swimingpool-ups.ups"
job="upssnmp"
severity="critical"
|
firing |
2024-11-25 05:58:00.831221517 +0000 UTC |
50 |
Annotations |
- summary
- Too hot in swimingpool-ups.ups. Temperatur is 50C
|
|
|
|
|
|
|
|
|
|
/etc/prometheus/windows.rules.yml > windows.rules
|
|
|
|
|
|