---
# Prometheus alerting rules: instance liveness, host resources (node_exporter),
# container resources (cAdvisor) and Spring Boot HTTP error rate.
groups:
  - name: infrastructure_alerts
    rules:
      # ==========================================
      # 1. Instance liveness (applies to every scraped component)
      # ==========================================
      # Fires when a scrape target has reported up == 0 for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例挂了: {{ $labels.instance }}"
          description: "任务 {{ $labels.job }} 中的实例 {{ $labels.instance }} 已经断开连接超过 1 分钟。"

      # ==========================================
      # 2. Node Exporter alerts (host resources)
      # ==========================================
      # CPU usage = 100 - average idle percentage across all cores of the
      # instance, computed over a 5-minute rate window.
      - alert: HostCpuHighUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "宿主机 CPU 使用率过高: {{ $labels.instance }}"
          description: "主机 CPU 使用率已超过 85%(当前值: {{ $value | printf \"%.2f\" }}%),已持续 5 分钟。"

      # Percentage of free space on the root filesystem (mountpoint="/").
      # NOTE(review): despite "Memory" in the alert name this rule only checks
      # disk space; the name is kept unchanged because routing or silences
      # may match on it - confirm before renaming.
      - alert: HostMemoryDiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "根分区磁盘空间不足: {{ $labels.instance }}"
          description: "根分区 (/) 可用空间低于 10%(当前剩余: {{ $value | printf \"%.2f\" }}%)。"

      # ==========================================
      # 3. cAdvisor alerts (Docker container monitoring)
      # ==========================================
      # NOTE(review): the selectors below rely on a `container` label. That
      # label is presumably added by the kubelet-embedded cAdvisor; standalone
      # cAdvisor on plain Docker may expose `name` instead - verify against
      # the actual scrape output, otherwise these rules match nothing.
      #
      # CPU seconds consumed per second, summed per container, times 100:
      # the value is a percentage of ONE core (it can exceed 100 for
      # multi-threaded containers).
      - alert: ContainerCpuHigh
        expr: sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (container, instance) * 100 > 80
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "容器 CPU 使用率过高: {{ $labels.container }}"
          description: "主机 {{ $labels.instance }} 上的容器 {{ $labels.container }} CPU 使用率超过 80%(当前值: {{ $value | printf \"%.2f\" }}%)。"

      # Working-set memory as a percentage of the container's memory limit.
      # The `> 0` filter on the limit drops containers that have no limit
      # configured (reported as 0); without it the division yields +Inf and
      # every unlimited container would trigger this critical alert.
      - alert: ContainerMemoryOOMRisk
        expr: (container_memory_working_set_bytes{container!=""} / (container_spec_memory_limit_bytes{container!=""} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "容器内存即将耗尽 (OOM 风险): {{ $labels.container }}"
          description: "容器 {{ $labels.container }} 的内存使用率已超过其限制的 90%(当前值: {{ $value | printf \"%.2f\" }}%),有被 OOM KILLED 的风险。"

      # ==========================================
      # 4. Spring Boot application alerts (service-level monitoring)
      # ==========================================
      # Share of requests whose status label starts with "5", per instance,
      # over a 5-minute rate window. With no traffic the ratio is NaN, which
      # does not satisfy `> 5`, so idle instances do not alert.
      - alert: SpringBootHttp5xxErrorHigh
        expr: sum(rate(http_server_requests_seconds_count{status=~"5.*"}[5m])) by (instance) / sum(rate(http_server_requests_seconds_count[5m])) by (instance) * 100 > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Spring Boot 接口 5xx 错误率过高: {{ $labels.instance }}"
          description: "服务 {{ $labels.instance }} 在过去 5 分钟内的 5xx 错误率超过 5%(当前值: {{ $value | printf \"%.2f\" }}%)。"