|
|
@@ -1,6 +1,9 @@
|
|
|
-package cn.reghao.devops.mgr.ops.srv.mon;
|
|
|
+package cn.reghao.devops.mgr.ops.mon;
|
|
|
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.model.*;
|
|
|
+import cn.reghao.devops.mgr.ops.mon.model.*;
|
|
|
+import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
|
|
|
+import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
|
|
|
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
|
|
|
import cn.reghao.jutil.jdk.converter.DateTimeConverter;
|
|
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
|
import com.fasterxml.jackson.databind.JsonNode;
|
|
|
@@ -918,98 +921,31 @@ public class PrometheusService {
|
|
|
tasks.put("net_tcp_drops", """
|
|
|
topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))
|
|
|
""");
|
|
|
-
|
|
|
- // 新增的三个任务
|
|
|
tasks.put("node_oom", "increase(node_vmstat_oom_kill[24h])");
|
|
|
tasks.put("node_clock", "abs(node_timex_offset_seconds)");
|
|
|
tasks.put("node_ro_fs", "node_filesystem_readonly{mountpoint='/'}");
|
|
|
+ tasks.put("node_zombie", "node_processes_state{state='Z'}");
|
|
|
// 逻辑:计算 node 级别每秒上下文切换次数
|
|
|
tasks.put("node_context_switch", """
|
|
|
topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))
|
|
|
""");
|
|
|
|
|
|
- /*Map<String, String> tasks = Map.of(
|
|
|
- "container_cpu", """
|
|
|
- topk(10,\s
|
|
|
- sum(
|
|
|
- label_replace(
|
|
|
- increase(container_cpu_cfs_throttled_seconds_total[24h]),
|
|
|
- "instance", "$1", "instance", "([^:]+):.*"
|
|
|
- )
|
|
|
- ) by (name, instance)
|
|
|
- )
|
|
|
- """,
|
|
|
- "container_mem", """
|
|
|
- topk(10,\s
|
|
|
- avg by (name, instance) (
|
|
|
- label_replace(
|
|
|
- (container_memory_working_set_bytes{name!=""} / container_spec_memory_limit_bytes > 0) * 100,
|
|
|
- "instance", "$1", "instance", "([^:]+):.*"
|
|
|
- )
|
|
|
- )
|
|
|
- )
|
|
|
- """,
|
|
|
- "node_disk", """
|
|
|
- avg_over_time(
|
|
|
- label_replace(
|
|
|
- irate(node_disk_io_time_seconds_total[10m]),
|
|
|
- "instance", "$1", "instance", "([^:]+):.*"
|
|
|
- )[24h:1m]
|
|
|
- )
|
|
|
- """,
|
|
|
- "node_inode", """
|
|
|
- topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)
|
|
|
- """,
|
|
|
- "node_fd", """
|
|
|
- topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)
|
|
|
- """,
|
|
|
- "node_disk_usage", """
|
|
|
- topk(10, (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100)
|
|
|
- """,
|
|
|
- // 24h 内 TCP 正常连接的最大并发数
|
|
|
- "net_tcp_est_max", """
|
|
|
- max_over_time(label_replace(node_netstat_Tcp_CurrEstab, "state", "ESTABLISHED", "", "")[24h:])
|
|
|
- """,
|
|
|
- // 24h 内 TCP 等待关闭连接的最大堆积数
|
|
|
- "net_tcp_tw_max", """
|
|
|
- max_over_time(label_replace(node_sockstat_TCP_tw, "state", "TIME_WAIT", "", "")[24h:])
|
|
|
- """,
|
|
|
- // TCP 全连接队列溢出 (ListenOverflows) - 24h 增量
|
|
|
- // 如果这个值 > 0,说明应用 backlog 满了,正在丢弃新连接
|
|
|
- "net_tcp_overflow", """
|
|
|
- topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))
|
|
|
- """,
|
|
|
- // TCP 丢弃计数 (TcpExt_ListenDrops) - 24h 增量
|
|
|
- "net_tcp_drops", """
|
|
|
- topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))
|
|
|
- """
|
|
|
- );*/
|
|
|
+ List<InspectionTask> allTasks = getAllTasks();
|
|
|
+ Map<String, InspectionTask> taskMap = allTasks.stream()
|
|
|
+ .collect(Collectors.toMap(
|
|
|
+ InspectionTask::getTaskKey,
|
|
|
+ task -> task
|
|
|
+ ));
|
|
|
+
|
|
|
|
|
|
return prometheusClientManager.getClient()
|
|
|
- .fetchAllMetrics(tasks)
|
|
|
- .thenApply(this::processResults1) // 这里 processResults1 需要改为返回 DailyReportDTO
|
|
|
+ .fetchAllMetrics1(taskMap)
|
|
|
+ .thenApply(result -> {
|
|
|
+ return processResults1(taskMap, result);
|
|
|
+                }) // processResults1 builds the DailyReportDTO from task metadata (thresholds/advice) plus raw query results
|
|
|
.join();
|
|
|
}
|
|
|
|
|
|
- public DailyReportDTO processResults00(Map<String, String> results) {
|
|
|
- DailyReportDTO report = new DailyReportDTO();
|
|
|
-
|
|
|
- try {
|
|
|
- report.setCpuThrottled(parsePrometheusJson(results.get("container_cpu"), "name"));
|
|
|
- report.setMemRisk(parsePrometheusJson(results.get("container_mem"), "name"));
|
|
|
- report.setDiskIo(parsePrometheusJson(results.get("node_disk"), "instance"));
|
|
|
-
|
|
|
- // 简单的状态判定逻辑
|
|
|
- if (report.getMemRisk().stream().anyMatch(i -> i.getValue() > 90)) {
|
|
|
- report.setStatusSummary("存在内存风险点");
|
|
|
- }
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("解析监控数据失败", e);
|
|
|
- }
|
|
|
-
|
|
|
- return report;
|
|
|
- }
|
|
|
-
|
|
|
// 定义全局风险阈值
|
|
|
private static final double MEM_RISK_THRESHOLD = 30.0; // 内存超过85%需注意
|
|
|
private static final double CPU_THROTTLE_THRESHOLD = 1000.0; // 24h节流超过10秒需注意
|
|
|
@@ -1025,65 +961,102 @@ public class PrometheusService {
|
|
|
private static final double READONLY_FS_THRESHOLD = 1.0; // 存在只读文件系统
|
|
|
private static final double ZOMBIE_PROCS_THRESHOLD = 5.0; // 僵尸进程过多
|
|
|
// 假设是 8 核机器,总切换数超过 50,000 需注意
|
|
|
- private static final double CONTEXT_SWITCH_THRESHOLD = 50000.0;
|
|
|
+    private static final double CONTEXT_SWITCH_THRESHOLD = 1000.0; // NOTE(review): header comment above still says 50,000 — confirm the lowered threshold is intended and update the comment
|
|
|
|
|
|
- public DailyReportDTO processResults1(Map<String, String> results) {
|
|
|
+ public static List<InspectionTask> getAllTasks() {
|
|
|
+        List<InspectionTask> tasks = new ArrayList<>(); // TODO: populate task definitions — currently empty, so taskMap lookups in processResults1 NPE on getThreshold()/getAdvice()
|
|
|
+ return tasks;
|
|
|
+ }
|
|
|
+
|
|
|
+ public DailyReportDTO processResults1(Map<String, InspectionTask> taskMap, Map<String, String> results) {
|
|
|
DailyReportDTO report = new DailyReportDTO();
|
|
|
try {
|
|
|
// 1. 解析并筛选 CPU 节流 (只保留显著受限的容器)
|
|
|
report.setCpuThrottled(
|
|
|
- parseAndFilter(results.get("container_cpu"), "name", CPU_THROTTLE_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("container_cpu"), "name", taskMap.get("container_cpu").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("container_cpu", taskMap.get("container_cpu").getAdvice());
|
|
|
|
|
|
// 2. 解析并筛选 内存风险 (只保留接近 Limit 的容器)
|
|
|
report.setMemRisk(
|
|
|
- parseAndFilter(results.get("container_mem"), "name", MEM_RISK_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("container_mem"), "name", taskMap.get("container_mem").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("container_mem", taskMap.get("container_mem").getAdvice());
|
|
|
|
|
|
// 3. 解析并筛选 磁盘 IO (只保留高负载节点)
|
|
|
report.setDiskIo(
|
|
|
- parseAndFilter(results.get("node_disk"), "instance", DISK_IO_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("node_disk"), "instance", taskMap.get("node_disk").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("node_disk", taskMap.get("node_disk").getAdvice());
|
|
|
|
|
|
report.setInodeRisk(
|
|
|
- parseAndFilter(results.get("node_inode"), "instance", INODE_RISK_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("node_inode"), "instance", taskMap.get("node_inode").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("node_inode", taskMap.get("node_inode").getAdvice());
|
|
|
|
|
|
// 2. 新增:解析并筛选文件句柄风险
|
|
|
report.setFdRisk(
|
|
|
- parseAndFilter(results.get("node_fd"), "instance", FD_RISK_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("node_fd"), "instance", taskMap.get("node_fd").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("node_fd", taskMap.get("node_fd").getAdvice());
|
|
|
|
|
|
report.setDiskUsageRisk(
|
|
|
- parseAndFilter(results.get("node_disk_usage"), "instance", DISK_USAGE_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("node_disk_usage"), "instance", taskMap.get("node_disk_usage").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("node_disk_usage", taskMap.get("node_disk_usage").getAdvice());
|
|
|
|
|
|
// 解析 24h TCP EST 峰值
|
|
|
report.setNetEstMax(
|
|
|
- parseAndFilter(results.get("net_tcp_est_max"), "state", TCP_EST_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("net_tcp_est_max"), "state", taskMap.get("net_tcp_est_max").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("net_tcp_est_max", taskMap.get("net_tcp_est_max").getAdvice());
|
|
|
|
|
|
// 解析并筛选 24h TCP TIME_WAIT 风险
|
|
|
report.setNetTwMax(
|
|
|
- parseAndFilter(results.get("net_tcp_tw_max"), "state", TCP_TW_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("net_tcp_tw_max"), "state", taskMap.get("net_tcp_tw_max").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("net_tcp_tw_max", taskMap.get("net_tcp_tw_max").getAdvice());
|
|
|
|
|
|
// 筛选全连接队列溢出和丢弃
|
|
|
report.setNetOverflows(
|
|
|
- parseAndFilter(results.get("net_tcp_overflow"), "instance", NET_DROP_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("net_tcp_overflow"), "instance", taskMap.get("net_tcp_overflow").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("net_tcp_overflow", taskMap.get("net_tcp_overflow").getAdvice());
|
|
|
+
|
|
|
report.setNetDrops(
|
|
|
- parseAndFilter(results.get("net_tcp_drops"), "instance", NET_DROP_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("net_tcp_drops"), "instance", taskMap.get("net_tcp_drops").getThreshold())
|
|
|
);
|
|
|
-
|
|
|
- if (!report.getMemRisk().isEmpty()) {
|
|
|
- report.getAdvices().put("memRisk", "内存风险排查流程");
|
|
|
- }
|
|
|
+ report.getAdvices().put("net_tcp_drops", taskMap.get("net_tcp_drops").getAdvice());
|
|
|
|
|
|
// 2. 解析并筛选上下文切换风险
|
|
|
report.setContextSwitchRisk(
|
|
|
- parseAndFilter(results.get("node_context_switch"), "instance", CONTEXT_SWITCH_THRESHOLD)
|
|
|
+ parseAndFilter(results.get("node_context_switch"), "instance", taskMap.get("node_context_switch").getThreshold())
|
|
|
);
|
|
|
+ report.getAdvices().put("node_context_switch", taskMap.get("node_context_switch").getAdvice());
|
|
|
+
|
|
|
+ // 2. OOM Kill 事件 (24h 增量)
|
|
|
+ report.setOomEvents(
|
|
|
+ parseAndFilter(results.get("node_oom"), "instance", taskMap.get("node_oom").getThreshold())
|
|
|
+ );
|
|
|
+ report.getAdvices().put("node_oom", taskMap.get("node_oom").getAdvice());
|
|
|
+
|
|
|
+ // 3. 时钟偏移 (绝对值)
|
|
|
+ report.setClockSkewRisk(
|
|
|
+ parseAndFilter(results.get("node_clock"), "instance", taskMap.get("node_clock").getThreshold())
|
|
|
+ );
|
|
|
+ report.getAdvices().put("node_clock", taskMap.get("node_clock").getAdvice());
|
|
|
+
|
|
|
+ // 4. 只读文件系统 (状态值)
|
|
|
+ report.setReadOnlyFsRisk(
|
|
|
+ parseAndFilter(results.get("node_ro_fs"), "instance", taskMap.get("node_ro_fs").getThreshold())
|
|
|
+ );
|
|
|
+ report.getAdvices().put("node_ro_fs", taskMap.get("node_ro_fs").getAdvice());
|
|
|
+
|
|
|
+ // 5. 僵尸进程/阻塞进程
|
|
|
+ report.setZombieRisk(
|
|
|
+ parseAndFilter(results.get("node_zombie"), "instance", taskMap.get("node_zombie").getThreshold())
|
|
|
+ );
|
|
|
+ report.getAdvices().put("node_zombie", taskMap.get("node_zombie").getAdvice());
|
|
|
|
|
|
// 计算汇总状态
|
|
|
long totalIssues = report.getCpuThrottled().size() +
|
|
|
@@ -1095,7 +1068,12 @@ public class PrometheusService {
|
|
|
report.getNetEstMax().size() +
|
|
|
report.getNetTwMax().size() +
|
|
|
report.getNetOverflows().size() +
|
|
|
- report.getNetDrops().size();
|
|
|
+ report.getNetDrops().size() +
|
|
|
+ report.getContextSwitchRisk().size() +
|
|
|
+ report.getOomEvents().size() +
|
|
|
+ report.getClockSkewRisk().size() +
|
|
|
+ report.getReadOnlyFsRisk().size() +
|
|
|
+ report.getZombieRisk().size();
|
|
|
|
|
|
report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
|
|
|
} catch (Exception e) {
|
|
|
@@ -1105,8 +1083,8 @@ public class PrometheusService {
|
|
|
return report;
|
|
|
}
|
|
|
|
|
|
- private List<DailyReportDTO.MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
|
|
|
- List<DailyReportDTO.MetricItem> filteredItems = new ArrayList<>();
|
|
|
+ private List<MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
|
|
|
+ List<MetricItem> filteredItems = new ArrayList<>();
|
|
|
JsonNode resultNode = objectMapper.readTree(json).path("data").path("result");
|
|
|
|
|
|
if (resultNode.isArray()) {
|
|
|
@@ -1123,8 +1101,7 @@ public class PrometheusService {
|
|
|
name1 = instance.split(":")[0];
|
|
|
instance = name1;
|
|
|
}
|
|
|
-
|
|
|
- filteredItems.add(new DailyReportDTO.MetricItem(name1, instance, Math.round(val * 100.0) / 100.0, 0.0));
|
|
|
+ filteredItems.add(new MetricItem(name1, instance, Math.round(val * 100.0) / 100.0));
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -1133,8 +1110,8 @@ public class PrometheusService {
|
|
|
return filteredItems;
|
|
|
}
|
|
|
|
|
|
- private List<DailyReportDTO.MetricItem> parsePrometheusJson(String json, String nameLabel) throws Exception {
|
|
|
- List<DailyReportDTO.MetricItem> items = new ArrayList<>();
|
|
|
+ private List<MetricItem> parsePrometheusJson(String json, String nameLabel) throws Exception {
|
|
|
+ List<MetricItem> items = new ArrayList<>();
|
|
|
JsonNode root = objectMapper.readTree(json);
|
|
|
JsonNode resultNode = root.path("data").path("result");
|
|
|
|
|
|
@@ -1151,8 +1128,7 @@ public class PrometheusService {
|
|
|
String name = metric.path(nameLabel).asText("unknown");
|
|
|
String instance = metric.path("instance").asText("unknown");
|
|
|
Double val = valueArray.get(1).asDouble();
|
|
|
-
|
|
|
- items.add(new DailyReportDTO.MetricItem(name, instance, Math.round(val * 100.0) / 100.0, 0.0));
|
|
|
+ items.add(new MetricItem(name, instance, Math.round(val * 100.0) / 100.0));
|
|
|
}
|
|
|
}
|
|
|
return items;
|