Jelajahi Sumber

添加系统监控

reghao 2 minggu lalu
induk
melakukan
607f15593f

+ 25 - 0
monitor/cadvisor/docker-compose.yml

@@ -0,0 +1,25 @@
+# cAdvisor: exposes per-container resource metrics of the local Docker host.
+# Prometheus scrapes it on host port 9099 (see the 'cadvisor' job in
+# monitor/conf/prometheus.yml).
+services:
+  cadvisor:
+    image: registry.cn-chengdu.aliyuncs.com/reghao/google_cadvisor:0.55.1
+    container_name: cadvisor
+    # Bring the exporter back after a daemon restart, host reboot or crash;
+    # matches the restart policy of the services in monitor/docker-compose.yml.
+    restart: always
+    # NOTE(review): presumably required so cAdvisor can read host cgroup and
+    # device information - confirm whether a narrower capability set suffices.
+    privileged: true
+    ports:
+      # host 9099 -> container 8080
+      - "9099:8080"
+    volumes:
+      # Host paths are mounted read-only.
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    devices:
+      - /dev/kmsg
+    # Cap the exporter's own footprint.
+    deploy:
+      resources:
+        limits:
+          cpus: '0.50'
+          memory: 400M
+    # Rotate container logs: at most 2 files of 5 MB each.
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "5m"
+        max-file: "2"

+ 47 - 0
monitor/conf/prometheus.yml

@@ -0,0 +1,47 @@
+# Alerting rule files.
+# Relative paths are resolved against the directory that holds this config
+# file. monitor/docker-compose.yml mounts this file at
+# /etc/prometheus/prometheus.yml and the rules directory at
+# /etc/prometheus/rules, so the rule file lives under rules/.
+rule_files:
+  - "rules/alert_rules.yml"
+
+# Global defaults
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+# Scrape jobs
+scrape_configs:
+  # 1. Prometheus' own metrics
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # 2. Host-level metrics (Node Exporter)
+  - job_name: 'node-exporter'
+    static_configs:
+      - targets:
+          - '192.168.0.181:9100'
+          - '192.168.0.182:9100'
+          - '192.168.0.208:9100'
+          - '192.168.0.209:9100'
+
+  # 3. Docker container metrics (cAdvisor)
+  - job_name: 'cadvisor'
+    # cAdvisor serves metrics on the default /metrics path, so metrics_path
+    # is not set here.
+    static_configs:
+      - targets:
+          - '192.168.0.181:9099'
+          - '192.168.0.182:9099'
+          - '192.168.0.208:9099'
+          - '192.168.0.209:9099'
+        labels:
+          # Optional custom label, handy for filtering in Grafana.
+          cluster: 'docker-swarm'
+
+  # 4. Application services (Spring Boot actuator endpoints)
+
+  # 4.1 Auth service
+  - job_name: 'springboot-auth'
+    # Full metrics path is given explicitly.
+    metrics_path: '/api/auth/actuator/prometheus'
+    static_configs:
+      - targets: ['192.168.0.181:6001']
+        labels:
+          app: 'auth-service'
+
+  # 4.2 User service
+  - job_name: 'springboot-user'
+    # Full metrics path is given explicitly.
+    metrics_path: '/api/user/actuator/prometheus'
+    static_configs:
+      - targets: ['192.168.0.181:6004']
+        labels:
+          app: 'user-service'
+
+  # 4.3 Content service
+  - job_name: 'springboot-content'
+    # Full metrics path is given explicitly.
+    metrics_path: '/api/content/actuator/prometheus'
+    static_configs:
+      - targets: ['192.168.0.181:6005']
+        labels:
+          app: 'content-service'

+ 59 - 0
monitor/docker-compose.yml

@@ -0,0 +1,59 @@
+services:
+  # Prometheus server: scrapes the targets listed in the mounted
+  # prometheus.yml and stores the samples under /prometheus.
+  prometheus:
+    container_name: prometheus
+    image: registry.cn-chengdu.aliyuncs.com/reghao/prometheus:2.54.1
+    restart: always
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      # Author's memory tuning: min and max TSDB block duration both set to 2h.
+      # NOTE(review): equal min/max presumably disables block compaction -
+      # confirm this is the intended trade-off.
+      - "--storage.tsdb.min-block-duration=2h"
+      - "--storage.tsdb.max-block-duration=2h"
+      # Keep only 7 days of history to save resources.
+      - "--storage.tsdb.retention.time=7d"
+      # NOTE(review): the admin and lifecycle HTTP endpoints are reachable by
+      # anyone who can reach published port 9090 - confirm network exposure.
+      - "--web.enable-admin-api"
+      - "--web.enable-lifecycle"
+    ports:
+      - '9090:9090'
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - /opt/docker/prometheus/conf/prometheus.yml:/etc/prometheus/prometheus.yml
+      - /opt/docker/prometheus/rules:/etc/prometheus/rules
+      - /opt/docker/prometheus/data:/prometheus
+    # Log rotation: up to 3 files, 10 MB each.
+    logging:
+      driver: 'json-file'
+      options:
+        max-file: '3'
+        max-size: '10m'
+    # Resource caps.
+    deploy:
+      resources:
+        reservations:
+          # reserved at start-up
+          memory: 128M
+        limits:
+          # hard memory ceiling
+          memory: 512M
+          # at most half a CPU core
+          cpus: "0.5"
+  # Grafana dashboard UI, published on host port 3000.
+  # It is intentionally NOT run with `privileged: true`: no setting in this
+  # service uses host devices or extra kernel capabilities, and privileged
+  # mode would give the container full access to the host.
+  grafana:
+    image: registry.cn-chengdu.aliyuncs.com/reghao/grafana_grafana-oss:11.2.2
+    container_name: grafana
+    # Equivalent to `docker run -u root`.
+    # NOTE(review): presumably needed so Grafana can write to the bind-mounted
+    # /opt/docker/grafana/data directory, which nothing chowns - confirm
+    # before switching to a non-root user.
+    user: root
+    restart: always
+    ports:
+      - "3000:3000"
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      - /opt/docker/grafana/data:/var/lib/grafana
+    # --- Resource limits ---
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 256M
+        reservations:
+          memory: 128M
+    # --- Log rotation: at most 2 files of 10 MB each ---
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "10m"
+        max-file: "2"

+ 26 - 0
monitor/init_monitor.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+###############################################################################
+# 初始化时使用, 只能执行一次
+###############################################################################
+
+echo "init prometheus..."
+base_dir="/opt/docker/prometheus"
+if [ -d ${base_dir} ]; then
+    echo ${base_dir}"已存在, 删除后重新创建..."
+    rm -rf ${base_dir}
+fi
+
+# 准备工作
+mkdir -p ${base_dir}/conf
+mkdir ${base_dir}/rules
+mkdir ${base_dir}/data
+cp conf/prometheus.yml ${base_dir}/conf
+cp rules/* ${base_dir}/rules
+
+chmod -R go+w ${base_dir}
+
+# 启动
+docker compose up -d

+ 19 - 0
monitor/monitor.md

@@ -0,0 +1,19 @@
+## grafana
+默认账号和密码 (用户名 admin, 密码 admin), 首次登录后请立即修改
+```
+admin admin
+```
+
+## prometheus
+prometheus 重新加载配置
+```
+curl -X POST http://localhost:9090/-/reload
+```
+
+## node_exporter
+debian13
+```
+apt update && apt install -y prometheus-node-exporter && systemctl enable prometheus-node-exporter
+```
+
+## cadvisor

+ 69 - 0
monitor/rules/alert_rules.yml

@@ -0,0 +1,69 @@
+# Alerting rules loaded by Prometheus.
+groups:
+  - name: infrastructure_alerts
+    rules:
+
+      # ------------------------------------------
+      # 1. Target liveness (applies to every scraped job)
+      #    Fires when a target has reported up == 0 for one minute.
+      # ------------------------------------------
+      - alert: InstanceDown
+        expr: up == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          description: '任务 {{ $labels.job }} 中的实例 {{ $labels.instance }} 已经断开连接超过 1 分钟。'
+          summary: '实例挂了: {{ $labels.instance }}'
+
+      # ------------------------------------------
+      # 2. Node Exporter alerts (host resources)
+      # ------------------------------------------
+
+      # Non-idle CPU share, averaged per instance over 5m, above 85%.
+      - alert: HostCpuHighUsage
+        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          description: '主机 CPU 使用率已超过 85%(当前值: {{ $value | printf "%.2f" }}%),已持续 5 分钟。'
+          summary: '宿主机 CPU 使用率过高: {{ $labels.instance }}'
+
+      # Less than 10% of the root filesystem is available.
+      # NOTE(review): despite "Memory" in the name, the expression only looks
+      # at filesystem space - consider renaming once no consumer depends on it.
+      - alert: HostMemoryDiskSpaceLow
+        expr: (node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"} < 10
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          description: '根分区 (/) 可用空间低于 10%(当前剩余: {{ $value | printf "%.2f" }}%)。'
+          summary: '根分区磁盘空间不足: {{ $labels.instance }}'
+
+      # ==========================================
+      # 3. cAdvisor alerts (Docker container monitoring)
+      #
+      # Series are selected and grouped by the `name` label (the Docker
+      # container name). The `container` label is only added when cAdvisor
+      # metrics come through the Kubernetes kubelet; a standalone cAdvisor
+      # on plain Docker does not set it, so a `container!=""` matcher would
+      # select nothing and these alerts could never fire.
+      # ==========================================
+
+      # CPU usage above 80% of one core, summed per container and host.
+      - alert: ContainerCpuHigh
+        expr: sum(rate(container_cpu_usage_seconds_total{name!=""}[5m])) by (name, instance) * 100 > 80
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: "容器 CPU 使用率过高: {{ $labels.name }}"
+          description: "主机 {{ $labels.instance }} 上的容器 {{ $labels.name }} CPU 使用率超过 80%(当前值: {{ $value | printf \"%.2f\" }}%)。"
+
+      # Working-set memory above 90% of the configured limit.
+      # The `> 0` filter drops containers without a memory limit: their limit
+      # is reported as 0, and dividing by it would yield +Inf and a permanent
+      # false alert.
+      - alert: ContainerMemoryOOMRisk
+        expr: (container_memory_working_set_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 90
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "容器内存即将耗尽 (OOM 风险): {{ $labels.name }}"
+          description: "容器 {{ $labels.name }} 的内存使用率已超过其限制的 90%(当前值: {{ $value | printf \"%.2f\" }}%),有被 OOM KILLED 的风险。"
+
+      # ------------------------------------------
+      # 4. Spring Boot application alerts
+      #    Share of requests with a 5xx status, per instance over 5m, above 5%.
+      # ------------------------------------------
+      - alert: SpringBootHttp5xxErrorHigh
+        expr: sum(rate(http_server_requests_seconds_count{status=~"5.*"}[5m])) by (instance) / sum(rate(http_server_requests_seconds_count[5m])) by (instance) * 100 > 5
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          description: '服务 {{ $labels.instance }} 在过去 5 分钟内的 5xx 错误率超过 5%(当前值: {{ $value | printf "%.2f" }}%)。'
+          summary: 'Spring Boot 接口 5xx 错误率过高: {{ $labels.instance }}'