---
# Prometheus alerting rules: target liveness, host resources (node_exporter),
# container resources (cAdvisor) and Spring Boot HTTP error rate.
#
# Alert names, severities, thresholds and `for` durations are part of the
# contract with Alertmanager routing/silences and are intentionally unchanged.
groups:
  - name: infrastructure_alerts
    rules:
      # ==========================================
      # 1. Instance liveness (applies to every scraped component)
      # ==========================================
      # Fires when a scrape target has reported `up == 0` for one minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '实例挂了: {{ $labels.instance }}'
          description: '任务 {{ $labels.job }} 中的实例 {{ $labels.instance }} 已经断开连接超过 1 分钟。'

      # ==========================================
      # 2. Node Exporter alerts (host resources)
      # ==========================================
      # CPU usage = 100 - average idle percentage across all cores of an
      # instance, measured over a 5-minute rate window.
      - alert: HostCpuHighUsage
        expr: >-
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '宿主机 CPU 使用率过高: {{ $labels.instance }}'
          description: '主机 CPU 使用率已超过 85%(当前值: {{ $value | printf "%.2f" }}%),已持续 5 分钟。'

      # Percentage of free space on the root filesystem ("/").
      # NOTE(review): despite "Memory" in the alert name, the expression only
      # inspects the root filesystem. The name is kept as-is because
      # Alertmanager routes or silences may match on it.
      - alert: HostMemoryDiskSpaceLow
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/"} * 100)
          / node_filesystem_size_bytes{mountpoint="/"}
          < 10
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '根分区磁盘空间不足: {{ $labels.instance }}'
          description: '根分区 (/) 可用空间低于 10%(当前剩余: {{ $value | printf "%.2f" }}%)。'

      # ==========================================
      # 3. cAdvisor alerts (Docker container monitoring)
      # ==========================================
      # NOTE(review): both container rules select on the `container` label.
      # Standalone cAdvisor on Docker normally exposes the container name as
      # `name`; `container` is the kubelet-style label. Confirm the label
      # exists on your series (or is added by relabeling), otherwise these
      # selectors match nothing.

      # CPU seconds consumed per second, summed per container and multiplied
      # by 100, i.e. a percentage of ONE core: a container using several
      # cores can legitimately exceed 100.
      - alert: ContainerCpuHigh
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (container, instance) * 100
          > 80
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: '容器 CPU 使用率过高: {{ $labels.container }}'
          description: '主机 {{ $labels.instance }} 上的容器 {{ $labels.container }} CPU 使用率超过 80%(当前值: {{ $value | printf "%.2f" }}%)。'

      # Working-set memory as a percentage of the configured memory limit.
      # The `> 0` filter on the divisor drops containers that have no limit
      # configured (limit reported as 0); without it the division yields +Inf
      # and the alert fires for every unlimited container.
      - alert: ContainerMemoryOOMRisk
        expr: >-
          (
            container_memory_working_set_bytes{container!=""}
            / (container_spec_memory_limit_bytes{container!=""} > 0)
          ) * 100
          > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '容器内存即将耗尽 (OOM 风险): {{ $labels.container }}'
          description: '容器 {{ $labels.container }} 的内存使用率已超过其限制的 90%(当前值: {{ $value | printf "%.2f" }}%),有被 OOM KILLED 的风险。'

      # ==========================================
      # 4. Spring Boot application alerts (service health)
      # ==========================================
      # Share of requests answered with a 5xx status over the last 5 minutes,
      # per instance. The matcher is `5..` (exactly three characters starting
      # with 5) rather than `5.*`, so only real 5xx status codes are counted.
      - alert: SpringBootHttp5xxErrorHigh
        expr: >-
          sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (instance)
          / sum(rate(http_server_requests_seconds_count[5m])) by (instance)
          * 100
          > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Spring Boot 接口 5xx 错误率过高: {{ $labels.instance }}'
          description: '服务 {{ $labels.instance }} 在过去 5 分钟内的 5xx 错误率超过 5%(当前值: {{ $value | printf "%.2f" }}%)。'