# alert_rules.yml — Prometheus alerting rules (group: infrastructure_alerts).

  1. groups:
  2. - name: infrastructure_alerts
  3. rules:
  # ------------------------------------------------------------------
  # 1. Instance liveness alert (applies to all components)
  # ------------------------------------------------------------------
  # Fires when the `up` series of a target has been 0 continuously for
  # 1 minute. The annotations interpolate the `job` and `instance` labels.
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '实例挂了: {{ $labels.instance }}'
      description: '任务 {{ $labels.job }} 中的实例 {{ $labels.instance }} 已经断开连接超过 1 分钟。'
  # ------------------------------------------------------------------
  # 2. Node Exporter alerts (server host resources)
  # ------------------------------------------------------------------
  # Fires when host CPU usage stays above 85% for 5 minutes.
  # Usage is computed as 100 minus the per-instance average idle rate
  # (5m window) expressed as a percentage.
  - alert: HostCpuHighUsage
    # Folded scalar: the lines below are joined with single spaces, giving
    # one single-line PromQL expression.
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '宿主机 CPU 使用率过高: {{ $labels.instance }}'
      # Single quotes keep the template's inner double quotes literal,
      # so no backslash escaping is needed.
      description: '主机 CPU 使用率已超过 85%(当前值: {{ $value | printf "%.2f" }}%),已持续 5 分钟。'
  # Fires when the available space on the root filesystem (mountpoint "/")
  # stays below 10% of its size for 2 minutes.
  # NOTE(review): despite "Memory" in the alert name, the expression only
  # inspects filesystem space. The name is left unchanged because alert
  # routing or silences may reference it — confirm before renaming.
  - alert: HostMemoryDiskSpaceLow
    # Folded scalar: the lines below are joined with single spaces, giving
    # one single-line PromQL expression.
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"} * 100)
      / node_filesystem_size_bytes{mountpoint="/"}
      < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: '根分区磁盘空间不足: {{ $labels.instance }}'
      # $value is the remaining free percentage, printed with two decimals.
      description: '根分区 (/) 可用空间低于 10%(当前剩余: {{ $value | printf "%.2f" }}%)。'
  # ------------------------------------------------------------------
  # 3. cAdvisor alerts (Docker container monitoring)
  # ------------------------------------------------------------------
  # Fires when the summed CPU usage rate (5m window) of a container,
  # grouped by (container, instance), stays above 80 for 3 minutes.
  # The value is rate * 100, i.e. percent of one CPU core (100 == one
  # full core); it is not relative to any configured CPU limit.
  # NOTE(review): the selector depends on a non-empty `container` label.
  # Standalone cAdvisor usually labels Docker containers with `name`
  # instead — confirm the `container` label exists in this setup.
  - alert: ContainerCpuHigh
    # Folded scalar: the lines below are joined with single spaces, giving
    # one single-line PromQL expression.
    expr: >-
      sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (container, instance)
      * 100 > 80
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: '容器 CPU 使用率过高: {{ $labels.container }}'
      description: '主机 {{ $labels.instance }} 上的容器 {{ $labels.container }} CPU 使用率超过 80%(当前值: {{ $value | printf "%.2f" }}%)。'
  # Fires when a container's working-set memory stays above 90% of its
  # configured memory limit for 5 minutes.
  - alert: ContainerMemoryOOMRisk
    # The `> 0` filter drops series whose limit is 0 (no memory limit
    # configured). Without it the division yields +Inf for such containers,
    # which always satisfies `> 90` and makes the alert fire permanently
    # for every unlimited container.
    # Folded scalar: the lines below are joined with single spaces, giving
    # one single-line PromQL expression.
    expr: >-
      container_memory_working_set_bytes{container!=""}
      / (container_spec_memory_limit_bytes{container!=""} > 0)
      * 100 > 90
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: '容器内存即将耗尽 (OOM 风险): {{ $labels.container }}'
      # $value is the usage percentage of the limit, printed with two decimals.
      description: '容器 {{ $labels.container }} 的内存使用率已超过其限制的 90%(当前值: {{ $value | printf "%.2f" }}%),有被 OOM KILLED 的风险。'
  # ------------------------------------------------------------------
  # 4. Spring Boot application alerts (business performance monitoring)
  # ------------------------------------------------------------------
  # Fires when, per instance, the share of requests whose `status` label
  # matches "5.*" exceeds 5% of all requests (5m rate) for 2 minutes.
  - alert: SpringBootHttp5xxErrorHigh
    # Folded scalar: the lines below are joined with single spaces, giving
    # one single-line PromQL expression.
    expr: >-
      sum(rate(http_server_requests_seconds_count{status=~"5.*"}[5m])) by (instance)
      / sum(rate(http_server_requests_seconds_count[5m])) by (instance)
      * 100 > 5
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Spring Boot 接口 5xx 错误率过高: {{ $labels.instance }}'
      # $value is the 5xx error percentage, printed with two decimals.
      description: '服务 {{ $labels.instance }} 在过去 5 分钟内的 5xx 错误率超过 5%(当前值: {{ $value | printf "%.2f" }}%)。'