# Prometheus alerting rules for infrastructure and application health.
# One rule group covering: scrape-target liveness, host resources
# (Node Exporter), container resources (cAdvisor) and Spring Boot HTTP errors.
groups:
  - name: infrastructure_alerts
    rules:
      # ==========================================
      # 1. Instance liveness (applies to every scraped component)
      # ==========================================

      # Fires when a scrape target has reported up == 0 for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "实例挂了: {{ $labels.instance }}"
          description: "任务 {{ $labels.job }} 中的实例 {{ $labels.instance }} 已经断开连接超过 1 分钟。"

      # ==========================================
      # 2. Node Exporter alerts (host resources)
      # ==========================================

      # CPU usage is derived as 100 minus the per-instance average idle
      # percentage over a 5-minute rate window; fires above 85% for 5 minutes.
      - alert: HostCpuHighUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "宿主机 CPU 使用率过高: {{ $labels.instance }}"
          description: "主机 CPU 使用率已超过 85%(当前值: {{ $value | printf \"%.2f\" }}%),已持续 5 分钟。"

      # Fires when the root filesystem ("/") has less than 10% space available
      # for 2 minutes.
      # NOTE(review): despite "Memory" in the alert name, the expression only
      # inspects filesystem metrics. The name is kept unchanged because
      # Alertmanager routes or silences may match on it.
      - alert: HostMemoryDiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "根分区磁盘空间不足: {{ $labels.instance }}"
          description: "根分区 (/) 可用空间低于 10%(当前剩余: {{ $value | printf \"%.2f\" }}%)。"

      # ==========================================
      # 3. cAdvisor alerts (Docker container monitoring)
      # ==========================================
      # NOTE(review): these rules select on the `container` label. Standalone
      # cAdvisor on plain Docker may expose the container name under a
      # different label (e.g. `name`) - confirm against the scraped series.

      # CPU-seconds consumed per second, summed per (container, instance) and
      # scaled by 100, i.e. a percentage of one CPU core; fires above 80 for
      # 3 minutes.
      - alert: ContainerCpuHigh
        expr: sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (container, instance) * 100 > 80
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "容器 CPU 使用率过高: {{ $labels.container }}"
          description: "主机 {{ $labels.instance }} 上的容器 {{ $labels.container }} CPU 使用率超过 80%(当前值: {{ $value | printf \"%.2f\" }}%)。"

      # Working-set memory as a percentage of the container's memory limit;
      # fires above 90% for 5 minutes.
      # The `> 0` filter on the limit drops containers that have no memory
      # limit configured (limit reported as 0). Without it the division yields
      # +Inf, which satisfies `> 90` and makes the alert fire permanently for
      # every unlimited container.
      - alert: ContainerMemoryOOMRisk
        expr: (container_memory_working_set_bytes{container!=""} / (container_spec_memory_limit_bytes{container!=""} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "容器内存即将耗尽 (OOM 风险): {{ $labels.container }}"
          description: "容器 {{ $labels.container }} 的内存使用率已超过其限制的 90%(当前值: {{ $value | printf \"%.2f\" }}%),有被 OOM KILLED 的风险。"

      # ==========================================
      # 4. Spring Boot application alerts (service performance)
      # ==========================================

      # Per-instance ratio of 5xx request rate to total request rate over
      # 5 minutes, as a percentage; fires above 5% for 2 minutes.
      # With no traffic the ratio is NaN, which does not satisfy `> 5`, so the
      # alert stays silent for idle instances.
      - alert: SpringBootHttp5xxErrorHigh
        expr: sum(rate(http_server_requests_seconds_count{status=~"5.*"}[5m])) by (instance) / sum(rate(http_server_requests_seconds_count[5m])) by (instance) * 100 > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Spring Boot 接口 5xx 错误率过高: {{ $labels.instance }}"
          description: "服务 {{ $labels.instance }} 在过去 5 分钟内的 5xx 错误率超过 5%(当前值: {{ $value | printf \"%.2f\" }}%)。"