|
|
@@ -10,6 +10,7 @@ import cn.reghao.devops.mgr.ops.builder.model.po.AppBuilding;
|
|
|
import cn.reghao.devops.mgr.ops.builder.model.po.AppDeploying;
|
|
|
import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
|
|
|
import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
|
|
|
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusService;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
|
@@ -66,108 +67,124 @@ public class AppConfigTest {
|
|
|
|
|
|
@Autowired
|
|
|
InspectionTaskRepository inspectionTaskRepository;
|
|
|
+ @Autowired
|
|
|
+ PrometheusService prometheusService;
|
|
|
@Test
|
|
|
public void initInspectionTask() {
|
|
|
- List<InspectionTask> list = getAllTasks();
|
|
|
- inspectionTaskRepository.saveAll(list);
|
|
|
- }
|
|
|
-
|
|
|
- public List<InspectionTask> getAllTasks() {
|
|
|
List<InspectionTask> tasks = new ArrayList<>();
|
|
|
|
|
|
- // --- 类别 A: 计算资源与调度 (Compute) ---
|
|
|
- tasks.add(new InspectionTask("cpuThrottled", "计算资源", "容器CPU节流",
|
|
|
- "topk(10, sum(label_replace(increase(container_cpu_cfs_throttled_seconds_total[24h]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")) by (name, instance))",
|
|
|
- 1000.0, "建议:调大容器CPU Limit或优化线程池。"));
|
|
|
+ // --- 类别 A: 计算资源 (Compute) ---
|
|
|
+ // 逻辑:24H 累计增量
|
|
|
+ tasks.add(new InspectionTask("cpuThrottled", "计算资源", "CPU 性能节流", "24H Throttled",
|
|
|
+ "topk(10, sum(increase(container_cpu_cfs_throttled_seconds_total[24h])) by (name, instance))",
|
|
|
+ "INCREASE", "gt", 1000.0, "建议:调大容器CPU Limit或优化线程池。", "el-icon-cpu", "warning-text", "warning", "#e6a23c", "s", false, "无节流限制"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("nodeLoadRisk", "计算资源", "节点CPU负载饱和度",
|
|
|
- "node_load5 / count by (instance) (node_cpu_seconds_total{mode=\"idle\"})",
|
|
|
- 1.2, "建议:Load高于核心数,说明进程正在排队,请检查系统瓶颈。"));
|
|
|
+ // 逻辑:24H 平均负载
|
|
|
+ tasks.add(new InspectionTask("nodeLoadRisk", "计算资源", "节点负载饱和度", "24H Avg Load",
|
|
|
+ "avg_over_time((node_load5 / count by (instance) (node_cpu_seconds_total{mode=\"idle\"}))[24h:5m])",
|
|
|
+ "AVG", "gt", 1.2, "建议:全天平均Load高于核心数,说明进程长期排队,请检查系统瓶颈。", "el-icon-odometer", "warning-text", "warning", "#e6a23c", "", false, "系统负载极低"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("contextSwitchRisk", "计算资源", "上下文切换风险",
|
|
|
- "topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))",
|
|
|
- 50000.0, "建议:减少线程竞争或优化锁逻辑。"));
|
|
|
+ // 逻辑:24H 平均切换频率
|
|
|
+ tasks.add(new InspectionTask("contextSwitchRisk", "计算资源", "上下文切换", "24H Avg CS",
|
|
|
+ "avg_over_time(rate(node_context_switches_total[5m])[24h:1m])",
|
|
|
+ "AVG", "gt", 50000.0, "建议:减少线程竞争或优化锁逻辑。", "el-icon-refresh", "warning-text", "warning", "#e6a23c", "次/s", false, "调度平稳"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("zombieRisk", "计算资源", "僵尸进程风险",
|
|
|
+ // 逻辑:当前瞬时状态
|
|
|
+ tasks.add(new InspectionTask("zombieRisk", "计算资源", "僵尸进程", "Z States",
|
|
|
"node_processes_state{state='Z'}",
|
|
|
- 5.0, "建议:修复父进程回收逻辑,防止PID泄露。"));
|
|
|
+ "INSTANT", "gt", 5.0, "建议:修复父进程回收逻辑,防止PID泄露。", "el-icon-stopwatch", "warning-text", "warning", "#e6a23c", "个", false, "无僵尸进程"));
|
|
|
+
|
|
|
+ // 逻辑:24H 平均 CPU 使用率
|
|
|
+ tasks.add(new InspectionTask("nodeCpuUsage", "计算资源", "节点CPU利用率", "24H Avg Usage",
|
|
|
+ "avg_over_time((100 - (irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * 100))[24h:5m])",
|
|
|
+ "AVG", "gt", 85.0, "建议:全天平均CPU占用过高,请检查是否有异常进程抢占资源。", "el-icon-monitor", "danger-text", "danger", "#f56c6c", "%", true, "利用率正常"));
|
|
|
|
|
|
- // --- 类别 B: 内存与运行时 (Memory) ---
|
|
|
- tasks.add(new InspectionTask("memRisk", "内存指标", "容器内存风险",
|
|
|
- "topk(10, avg by (name, instance) (label_replace((container_memory_working_set_bytes{name!=\"\"} / container_spec_memory_limit_bytes > 0) * 100, \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")))",
|
|
|
- 85.0, "建议:检查内存泄露或增加内存配额。"));
|
|
|
+ // 逻辑:24H 平均 IO 等待
|
|
|
+ tasks.add(new InspectionTask("cpuWaitIo", "计算资源", "CPU I/O等待频率", "24H Avg iowait",
|
|
|
+ "avg_over_time((irate(node_cpu_seconds_total{mode=\"iowait\"}[5m]) * 100)[24h:5m])",
|
|
|
+ "AVG", "gt", 10.0, "建议:全天IO等待比例偏高,请排查高频写日志或慢磁盘问题。", "el-icon-time", "warning-text", "warning", "#e6a23c", "%", true, "磁盘 I/O 无延迟"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("memPsiRisk", "内存指标", "内存紧缩压力(PSI)",
|
|
|
- "rate(node_pressure_memory_some_seconds_total[5m])",
|
|
|
- 0.1, "建议:系统正在频繁页面置换,请检查高内存占用进程。"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("oomEvents", "内存指标", "OOM事件",
|
|
|
+ // --- 类别 B: 内存指标 (Memory) ---
|
|
|
+ // 逻辑:当前水位
|
|
|
+ tasks.add(new InspectionTask("nodeMemUsage", "内存指标", "节点内存利用率", "Physical Mem",
|
|
|
+ "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
|
|
|
+ "INSTANT", "gt", 90.0, "建议:当前物理内存严重不足,可能会触发整机级别的OOM。", "el-icon-set-up", "danger-text", "danger", "#f56c6c", "%", true, "内存充足"));
|
|
|
+
|
|
|
+ tasks.add(new InspectionTask("memRisk", "内存指标", "容器内存水位线", "> 85% Usage",
|
|
|
+ "topk(10, (container_memory_working_set_bytes{name!=\"\"} / container_spec_memory_limit_bytes) * 100)",
|
|
|
+ "INSTANT", "gt", 85.0, "建议:容器接近内存限制,请检查泄露或增加配额。", "el-icon-box", "danger-text", "danger", "#f56c6c", "%", true, "水位正常"));
|
|
|
+
|
|
|
+ // 逻辑:24H 累计增量
|
|
|
+ tasks.add(new InspectionTask("oomEvents", "内存指标", "OOM Kill 事件", "24H Kernel OOM",
|
|
|
"increase(node_vmstat_oom_kill[24h])",
|
|
|
- 1.0, "建议:分析dmesg定位被杀进程,优化内存分配。"));
|
|
|
+ "INCREASE", "gt", 1.0, "建议:过去24小时发生过OOM,请分析dmesg定位被杀进程。", "el-icon-warning", "danger-text", "danger", "#f56c6c", "次", false, "无内存强杀"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("pidLimitRisk", "内存指标", "PID进程数限制",
|
|
|
+ tasks.add(new InspectionTask("pidLimitRisk", "内存指标", "PID 进程数限制", "Forks / Max",
|
|
|
"(node_forks_total / node_processes_max) * 100",
|
|
|
- 80.0, "建议:PID快用完了,请检查是否存在大量短时进程或进程泄露。"));
|
|
|
+ "INSTANT", "gt", 80.0, "建议:当前PID池占用过高,请检查进程泄露。", "el-icon-connection", "primary-text", "primary", "#409eff", "%", true, "池空间充足"));
|
|
|
|
|
|
- // --- 类别 C: 存储与文件系统 (Storage) ---
|
|
|
- tasks.add(new InspectionTask("diskUsageRisk", "存储文件", "磁盘空间风险",
|
|
|
- "topk(10, (1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100)",
|
|
|
- 80.0, "建议:清理日志或扩容磁盘。"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("diskPredictRisk", "存储文件", "磁盘存满预测(24h)",
|
|
|
- "predict_linear(node_filesystem_avail_bytes{mountpoint=\"/\"}[6h], 24 * 3600) < 0",
|
|
|
- 0.0, "建议:磁盘预计在24小时内写满,请立即处理。"));
|
|
|
+ // --- 类别 C: 存储文件 (Storage) ---
|
|
|
+ // 逻辑:当前瞬时水位
|
|
|
+ tasks.add(new InspectionTask("diskUsageRisk", "存储文件", "根分区空间", "( / ) Usage",
|
|
|
+ "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100",
|
|
|
+ "INSTANT", "gt", 80.0, "建议:当前磁盘占用过高,请清理日志或扩容。", "el-icon-pie-chart", "danger-text", "danger", "#f56c6c", "%", true, "空间充沛"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("inodeRisk", "存储文件", "Inode使用率",
|
|
|
- "topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)",
|
|
|
- 80.0, "建议:清理大量极小文件(如临时日志)。"));
|
|
|
+ // 逻辑:区间趋势预测
|
|
|
+ tasks.add(new InspectionTask("diskPredictRisk", "存储文件", "磁盘存满预测", "Predict 24H",
|
|
|
+ "predict_linear(node_filesystem_avail_bytes{mountpoint=\"/\"}[6h], 24 * 3600)",
|
|
|
+ "INSTANT", "lt", 0.0, "建议:根据最近6h趋势,磁盘预计在24小时内写满,请立即处理。", "el-icon-magic-stick", "danger-text", "danger", "#f56c6c", "", false, "无写满风险"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("diskIo", "存储文件", "磁盘IO负载",
|
|
|
- "avg_over_time(label_replace(irate(node_disk_io_time_seconds_total[10m]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")[24h:1m])",
|
|
|
- 1.0, "建议:排查高I/O进程,检查存储后端健康度。"));
|
|
|
+ tasks.add(new InspectionTask("inodeRisk", "存储文件", "inode 使用率", "Index Nodes",
|
|
|
+ "(1 - node_filesystem_files_free / node_filesystem_files) * 100",
|
|
|
+ "INSTANT", "gt", 80.0, "建议:inode即将耗尽,请清理大量小文件。", "el-icon-files", "warning-text", "warning", "#e6a23c", "%", true, "索引节点充足"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("fdRisk", "存储文件", "文件句柄风险",
|
|
|
- "topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)",
|
|
|
- 80.0, "建议:检查FD泄露,必要时调大ulimit。"));
|
|
|
+ // 逻辑:24H 平均响应
|
|
|
+ tasks.add(new InspectionTask("diskIo", "存储文件", "磁盘 IO 响应", "24H Avg Wait",
|
|
|
+ "avg_over_time(irate(node_disk_io_time_seconds_total[10m])[24h:5m])",
|
|
|
+ "AVG", "gt", 1.0, "建议:过去24小时磁盘IO平均响应过慢,检查存储后端健康度。", "el-icon-receiving", "warning-text", "warning", "#e6a23c", "ms", false, "响应极快"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("readOnlyFsRisk", "存储文件", "只读分区风险",
|
|
|
+ // 逻辑:当前硬状态
|
|
|
+ tasks.add(new InspectionTask("readOnlyFsRisk", "存储文件", "文件系统只读", "RO Status",
|
|
|
"node_filesystem_readonly{mountpoint='/'}",
|
|
|
- 1.0, "建议:硬件故障触发只读挂载,请检修硬件。"));
|
|
|
+ "INSTANT", "ne", 0.0, "建议:检测到只读挂载,可能存在硬件故障,请立即检修。", "el-icon-lock", "danger-text", "danger", "#f56c6c", "", false, "挂载正常"));
|
|
|
+
|
|
|
|
|
|
- // --- 类别 D: 网络协议栈 (Network) ---
|
|
|
- tasks.add(new InspectionTask("conntrackRisk", "网络协议", "连接跟踪表(Conntrack)",
|
|
|
- "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) * 100",
|
|
|
- 80.0, "建议:连接表快满了,请优化内核参数或清理连接。"));
|
|
|
+ // --- 类别 D: 网络协议 (Network) ---
|
|
|
+ // 逻辑:24H 峰值
|
|
|
+ tasks.add(new InspectionTask("netEstMax", "网络协议", "TCP EST 数量", "24H Max EST",
|
|
|
+ "max_over_time(node_netstat_Tcp_CurrEstab[24h])",
|
|
|
+ "MAX", "gt", 5000.0, "建议:过去24小时连接数曾达峰值,请评估业务并发及连接泄露。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "未超限"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("netEstMax", "网络协议", "TCP连接峰值",
|
|
|
- "max_over_time(label_replace(node_netstat_Tcp_CurrEstab, \"state\", \"ESTABLISHED\", \"\", \"\")[24h:])",
|
|
|
- 5000.0, "建议:评估业务并发,检查长连接泄露。"));
|
|
|
+ tasks.add(new InspectionTask("netTwMax", "网络协议", "TCP TIME_WAIT", "24H Max TW",
|
|
|
+ "max_over_time(node_sockstat_TCP_tw[24h])",
|
|
|
+ "MAX", "gt", 5000.0, "建议:全天TW连接数较高,请开启tw_reuse或检查客户端连接池。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "未超限"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("netTwMax", "网络协议", "TIME_WAIT堆积",
|
|
|
- "max_over_time(label_replace(node_sockstat_TCP_tw, \"state\", \"TIME_WAIT\", \"\", \"\")[24h:])",
|
|
|
- 5000.0, "建议:开启tw_reuse或检查压测工具。"));
|
|
|
+ // 逻辑:24H 累计增量
|
|
|
+ tasks.add(new InspectionTask("netOverflows", "网络协议", "TCP 队列溢出", "24H Listen Overflow",
|
|
|
+ "increase(node_netstat_TcpExt_ListenOverflows[24h])",
|
|
|
+ "INCREASE", "gt", 1.0, "建议:全天曾出现溢出,应用处理过慢,请调大backlog或优化代码。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "无溢出"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("netBandwidthRisk", "网络协议", "网卡带宽利用率",
|
|
|
- "((rate(node_network_receive_bytes_total[5m]) + rate(node_network_transmit_bytes_total[5m])) * 8) / node_network_speed_bytes > 0",
|
|
|
- 0.8, "建议:带宽接近极限,检查是否有大文件同步。"));
|
|
|
+ tasks.add(new InspectionTask("netDrops", "网络协议", "TCP 数据包丢弃", "24H Packet Drops",
|
|
|
+ "increase(node_netstat_TcpExt_ListenDrops[24h])",
|
|
|
+ "INCREASE", "gt", 1.0, "建议:全天曾有数据包丢弃,请检查网络负载或内核缓冲区。", "el-icon-circle-close", "danger-text", "danger", "#f56c6c", "次", false, "无丢弃"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("tcpRetransRisk", "网络协议", "TCP重传率",
|
|
|
- "rate(node_netstat_Tcp_RetransSegs[5m]) / rate(node_netstat_Tcp_OutSegs[5m]) * 100",
|
|
|
- 1.0, "建议:重传率偏高,检查链路质量或对端压力。"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("netOverflows", "网络协议", "全连接队列溢出",
|
|
|
- "topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))",
|
|
|
- 1.0, "建议:应用处理过慢,请调大backlog或优化逻辑。"));
|
|
|
+ // --- 类别 G: 基础环境 & 监控存活 ---
|
|
|
+ tasks.add(new InspectionTask("nodeExporterAlive", "基础环境", "NodeExporter 状态", "24H Availability",
|
|
|
+ "min_over_time(up{job=\"node-exporter\"}[24h])",
|
|
|
+ "MIN", "eq", 0.0, "建议:检测到过去24小时内采集器曾离线,请检查宿主机服务稳定性。", "el-icon-monitor", "danger-text", "danger", "#f56c6c", "离线", false, "全天运行正常"));
|
|
|
|
|
|
- tasks.add(new InspectionTask("netDrops", "网络协议", "TCP丢弃",
|
|
|
- "topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))",
|
|
|
- 1.0, "建议:检查网络负载或内核缓冲区大小。"));
|
|
|
+ tasks.add(new InspectionTask("cadvisorAlive", "基础环境", "cAdvisor 状态", "24H Availability",
|
|
|
+ "min_over_time(up{job=\"cadvisor\"}[24h])",
|
|
|
+ "MIN", "eq", 0.0, "建议:检测到过去24小时内容器采集器曾离线,请检查 Docker 服务状态。", "el-icon-ship", "danger-text", "danger", "#f56c6c", "离线", false, "全天运行正常"));
|
|
|
|
|
|
- // --- 类别 E: 基础环境 (Infrastructure) ---
|
|
|
- tasks.add(new InspectionTask("clockSkewRisk", "基础环境", "时钟偏移风险",
|
|
|
- "abs(node_timex_offset_seconds)",
|
|
|
- 0.5, "建议:检查NTP/Chrony同步状态。"));
|
|
|
+ // 逻辑:24H 绝对值最大偏差
|
|
|
+ tasks.add(new InspectionTask("clockSkewRisk", "基础环境", "时钟偏移 (NTP)", "24H Max Offset",
|
|
|
+ "max_over_time(abs(node_timex_offset_seconds)[24h])",
|
|
|
+ "MAX", "gt", 0.5, "建议:全天最大时钟偏移超标,请检查NTP/Chrony同步状态。", "el-icon-alarm-clock", "warning-text", "warning", "#e6a23c", "s", false, "同步精确"));
|
|
|
|
|
|
- return tasks;
|
|
|
+ inspectionTaskRepository.saveAll(tasks);
|
|
|
}
|
|
|
}
|