Browse Source

update mon module

reghao 1 month ago
parent
commit
8c882a9fca
26 changed files with 439 additions and 148 deletions
  1. 8 6
      mgr/src/main/java/cn/reghao/devops/mgr/admin/controller/HomeController.java
  2. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/admin/service/SiteOptionService.java
  3. 2 2
      mgr/src/main/java/cn/reghao/devops/mgr/config/PrometheusAvailabilityAspect.java
  4. 2 2
      mgr/src/main/java/cn/reghao/devops/mgr/ops/builder/controller/AppStatController.java
  5. 2 2
      mgr/src/main/java/cn/reghao/devops/mgr/ops/builder/model/po/PipelineStep.java
  6. 2 2
      mgr/src/main/java/cn/reghao/devops/mgr/ops/machine/controller/MachineController.java
  7. 112 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/InspectionService.java
  8. 14 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusAsyncClient.java
  9. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusClientManager.java
  10. 83 107
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusService.java
  11. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/AnalysisResult.java
  12. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/ContainerInfo.java
  13. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/ContainerReportVO.java
  14. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/CpuThresholdConfig.java
  15. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostData.java
  16. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostGroup.java
  17. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostInfo.java
  18. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/MetricGroup.java
  19. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/MetricRecord.java
  20. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/PillarReportDTO.java
  21. 14 12
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/DailyReportDTO.java
  22. 16 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/MetricItem.java
  23. 50 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/po/InspectionTask.java
  24. 12 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/repository/InspectionTaskRepository.java
  25. 1 2
      mgr/src/main/resources/application-test.yml
  26. 109 0
      mgr/src/test/java/devops/AppConfigTest.java

+ 8 - 6
mgr/src/main/java/cn/reghao/devops/mgr/admin/controller/HomeController.java

@@ -1,10 +1,9 @@
 package cn.reghao.devops.mgr.admin.controller;
 
 import cn.reghao.devops.mgr.admin.service.HomeViewService;
-import cn.reghao.devops.mgr.ops.srv.mon.model.ContainerReportVO;
-import cn.reghao.devops.mgr.ops.srv.mon.PrometheusService;
-import cn.reghao.devops.mgr.ops.srv.mon.model.DailyReportDTO;
-import cn.reghao.devops.mgr.ops.srv.mon.model.PillarReportDTO;
+import cn.reghao.devops.mgr.ops.mon.InspectionService;
+import cn.reghao.devops.mgr.ops.mon.PrometheusService;
+import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
 import cn.reghao.jutil.web.WebResult;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
@@ -23,10 +22,13 @@ import org.springframework.web.bind.annotation.ResponseBody;
 public class HomeController {
     private final HomeViewService homeViewService;
     private final PrometheusService prometheusService;
+    private InspectionService inspectionService;
 
-    public HomeController(HomeViewService homeViewService, PrometheusService prometheusService) {
+    public HomeController(HomeViewService homeViewService, PrometheusService prometheusService,
+                          InspectionService inspectionService) {
         this.homeViewService = homeViewService;
         this.prometheusService = prometheusService;
+        this.inspectionService = inspectionService;
     }
 
     //@Operation(summary = "后台首页", description = "N")
@@ -40,7 +42,7 @@ public class HomeController {
     @ResponseBody
     public String dashboardData() {
         //DashboardData dashboardData = homeViewService.getDashboardData();
-        DailyReportDTO pillarReportData = prometheusService.getDailyReportData();
+        DailyReportDTO pillarReportData = inspectionService.getDailyReportData();
         //PillarReportDTO pillarReportData = prometheusService.getPillarReportData();
         if (pillarReportData != null) {
             return WebResult.success(pillarReportData);

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/admin/service/SiteOptionService.java

@@ -2,7 +2,7 @@ package cn.reghao.devops.mgr.admin.service;
 
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepo;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;
-import cn.reghao.devops.mgr.ops.srv.mon.PrometheusClientManager;
+import cn.reghao.devops.mgr.ops.mon.PrometheusClientManager;
 import cn.reghao.devops.mgr.util.StringUtil;
 import org.springframework.cache.annotation.CacheEvict;
 import org.springframework.data.domain.Page;

+ 2 - 2
mgr/src/main/java/cn/reghao/devops/mgr/config/PrometheusAvailabilityAspect.java

@@ -1,6 +1,6 @@
 package cn.reghao.devops.mgr.config;
 
-import cn.reghao.devops.mgr.ops.srv.mon.PrometheusClientManager;
+import cn.reghao.devops.mgr.ops.mon.PrometheusClientManager;
 import org.aspectj.lang.ProceedingJoinPoint;
 import org.aspectj.lang.annotation.Around;
 import org.aspectj.lang.annotation.Aspect;
@@ -20,7 +20,7 @@ public class PrometheusAvailabilityAspect {
     }
 
     // 拦截 PrometheusService 下的所有公共方法
-    @Around("execution(* cn.reghao.devops.mgr.ops.srv.mon.PrometheusService.*(..))")
+    @Around("execution(* cn.reghao.devops.mgr.ops.mon.PrometheusService.*(..))")
     public Object checkAvailability(ProceedingJoinPoint joinPoint) throws Throwable {
         if (clientManager.getClient() == null) {
             // 这里可以根据业务返回空对象、默认值或抛异常

+ 2 - 2
mgr/src/main/java/cn/reghao/devops/mgr/ops/builder/controller/AppStatController.java

@@ -6,8 +6,8 @@ import cn.reghao.devops.mgr.ops.app.db.query.AppDeployQuery;
 import cn.reghao.devops.mgr.ops.builder.model.vo.AppRunning;
 import cn.reghao.devops.mgr.ops.builder.model.vo.AppRunningNode;
 import cn.reghao.devops.mgr.ops.app.service.AppRunService;
-import cn.reghao.devops.mgr.ops.srv.mon.PrometheusService;
-import cn.reghao.devops.mgr.ops.srv.mon.model.ContainerReportVO;
+import cn.reghao.devops.mgr.ops.mon.PrometheusService;
+import cn.reghao.devops.mgr.ops.mon.model.ContainerReportVO;
 import cn.reghao.jutil.jdk.web.db.PageList;
 import cn.reghao.jutil.jdk.web.result.Result;
 import cn.reghao.jutil.jdk.web.result.ResultStatus;

+ 2 - 2
mgr/src/main/java/cn/reghao/devops/mgr/ops/builder/model/po/PipelineStep.java

@@ -22,11 +22,11 @@ public class PipelineStep {
     private static final long serialVersionUID = 1L;
     @Id
     @GeneratedValue(strategy = GenerationType.IDENTITY)
-    protected Integer id;
+    private Integer id;
     @UpdateTimestamp
     @Column(nullable = false)
     @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
-    protected LocalDateTime updateTime;
+    private LocalDateTime updateTime;
 
     /*@ManyToOne(cascade = CascadeType.REFRESH)
     @JoinColumn(name = "app_building_id")

+ 2 - 2
mgr/src/main/java/cn/reghao/devops/mgr/ops/machine/controller/MachineController.java

@@ -4,8 +4,8 @@ import cn.reghao.devops.mgr.ops.machine.model.po.MachineInfo;
 import cn.reghao.devops.mgr.ops.machine.model.vo.MachineDetail;
 import cn.reghao.devops.mgr.ops.machine.service.MachineQuery;
 import cn.reghao.devops.mgr.ops.machine.service.MachineService;
-import cn.reghao.devops.mgr.ops.srv.mon.PrometheusService;
-import cn.reghao.devops.mgr.ops.srv.mon.model.ContainerReportVO;
+import cn.reghao.devops.mgr.ops.mon.PrometheusService;
+import cn.reghao.devops.mgr.ops.mon.model.ContainerReportVO;
 import cn.reghao.devops.mgr.util.SelectOption;
 import cn.reghao.jutil.jdk.web.db.PageList;
 import cn.reghao.jutil.jdk.web.result.Result;

+ 112 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/InspectionService.java

@@ -0,0 +1,112 @@
+package cn.reghao.devops.mgr.ops.mon;
+
+import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
+import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
+import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.springframework.stereotype.Service;
+
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Builds the daily inspection report from the inspection tasks stored in the database.
+ * <p>
+ * Every {@link InspectionTask} contributes a query, a threshold and an advice text. Its task key
+ * doubles as the name of the {@link DailyReportDTO} field that receives the matching samples.
+ *
+ * @author reghao
+ * @date 2026-04-03 18:51:27
+ */
+@Service
+public class InspectionService {
+    /** JDK platform logger; used instead of System.err / printStackTrace. */
+    private static final System.Logger LOG = System.getLogger(InspectionService.class.getName());
+    /** Metric label from which the host part of every sample is read. */
+    private static final String DEFAULT_NAME_LABEL = "instance";
+
+    private final ObjectMapper objectMapper = new ObjectMapper();
+    private final PrometheusClientManager prometheusClientManager;
+    private final InspectionTaskRepository inspectionTaskRepository;
+
+    public InspectionService(PrometheusClientManager prometheusClientManager,
+                             InspectionTaskRepository inspectionTaskRepository) {
+        this.prometheusClientManager = prometheusClientManager;
+        this.inspectionTaskRepository = inspectionTaskRepository;
+    }
+
+    /**
+     * Runs every stored inspection task against Prometheus and assembles the daily report.
+     * <p>
+     * Blocks the calling thread until all queries have completed.
+     *
+     * @return the assembled report, or {@code null} when no Prometheus client is available
+     */
+    public DailyReportDTO getDailyReportData() {
+        var client = prometheusClientManager.getClient();
+        if (client == null) {
+            // Without a client there is nothing to query; report "no data" instead of failing with an NPE.
+            LOG.log(System.Logger.Level.WARNING, "Prometheus client is not available, daily report skipped");
+            return null;
+        }
+
+        // Keep the first task when two rows share a key; toMap without a merge function would throw.
+        Map<String, InspectionTask> taskMap = inspectionTaskRepository.findAll().stream()
+                .collect(Collectors.toMap(
+                        InspectionTask::getTaskKey,
+                        task -> task,
+                        (first, duplicate) -> first
+                ));
+
+        return client.fetchAllMetrics1(taskMap)
+                .thenApply(resultMap -> processResults1(taskMap, resultMap))
+                .join();
+    }
+
+    /**
+     * Converts the raw query responses into a report.
+     * <p>
+     * A task is skipped when it has no response, when none of its samples reaches the threshold, or
+     * when its key does not name a compatible field of {@link DailyReportDTO}; one bad task never
+     * prevents the remaining tasks from being processed.
+     *
+     * @param taskMap   inspection tasks keyed by task key (which is also the DTO field name)
+     * @param resultMap raw JSON responses keyed by the same task key
+     * @return the populated report including the status summary
+     */
+    public DailyReportDTO processResults1(Map<String, InspectionTask> taskMap, Map<String, String> resultMap) {
+        DailyReportDTO report = new DailyReportDTO();
+        int totalIssues = 0;
+
+        for (Map.Entry<String, InspectionTask> entry : taskMap.entrySet()) {
+            String fieldName = entry.getKey(); // the task key is also the DTO field name
+            InspectionTask task = entry.getValue();
+            String jsonResult = resultMap.get(fieldName);
+            if (jsonResult == null || jsonResult.isEmpty()) {
+                continue;
+            }
+
+            report.getCategoryMap().put(fieldName, task.getCategory());
+            try {
+                List<MetricItem> items = parseAndFilter(jsonResult, DEFAULT_NAME_LABEL, task.getThreshold());
+                if (items.isEmpty()) {
+                    continue;
+                }
+
+                // The target field is resolved reflectively from the task key.
+                Field field = DailyReportDTO.class.getDeclaredField(fieldName);
+                if (!field.getType().isInstance(items)) {
+                    // e.g. a key that names a Map or String field of the DTO
+                    LOG.log(System.Logger.Level.ERROR,
+                            "Inspection task key does not name a list field of DailyReportDTO: " + fieldName);
+                    continue;
+                }
+                field.setAccessible(true);
+                field.set(report, items);
+
+                // Advice is only attached to tasks that actually produced findings.
+                report.getAdvices().put(fieldName, task.getAdvice());
+                totalIssues += items.size();
+            } catch (NoSuchFieldException e) {
+                // A misconfigured task key that matches no DTO field ends up here.
+                LOG.log(System.Logger.Level.ERROR, "配置错误: DailyReportDTO 中不存在字段 " + fieldName);
+            } catch (Exception e) {
+                LOG.log(System.Logger.Level.ERROR, "Failed to process inspection task " + fieldName, e);
+            }
+        }
+
+        report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
+        return report;
+    }
+
+    /**
+     * Extracts the samples whose value is at least {@code threshold} from a query response.
+     *
+     * @param json      raw JSON response; samples are read from {@code data.result}
+     * @param nameLabel metric label holding the instance, with or without a {@code :port} suffix
+     * @param threshold minimum value (inclusive) for a sample to be reported
+     * @return matching items sorted by value in descending order; empty when there is no result array
+     * @throws Exception if the response is not valid JSON
+     */
+    private List<MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
+        List<MetricItem> filteredItems = new ArrayList<>();
+        JsonNode resultNode = objectMapper.readTree(json).path("data").path("result");
+        if (!resultNode.isArray()) {
+            return filteredItems;
+        }
+
+        for (JsonNode node : resultNode) {
+            // path() never returns null, so a sample without a usable value is skipped
+            // instead of aborting the whole task with an NPE.
+            JsonNode valueNode = node.path("value").path(1);
+            if (valueNode.isMissingNode() || valueNode.isNull()) {
+                continue;
+            }
+
+            double val = valueNode.asDouble();
+            if (val >= threshold) {
+                JsonNode metric = node.path("metric");
+                String instance = stripPort(metric.path(nameLabel).asText("unknown"));
+                String job = metric.path("job").asText("unknown");
+
+                String displayName;
+                if ("node-exporter".equals(job)) {
+                    // Samples of the node-exporter job are shown by instance only.
+                    displayName = instance;
+                } else {
+                    displayName = String.format("%s_%s", instance, metric.path("name").asText("unknown"));
+                }
+                filteredItems.add(new MetricItem(displayName, instance, Math.round(val * 100.0) / 100.0));
+            }
+        }
+        filteredItems.sort((a, b) -> b.getValue().compareTo(a.getValue()));
+        return filteredItems;
+    }
+
+    /**
+     * Returns the part of {@code address} before the first colon, or the whole text when it
+     * contains none. Unlike {@code split(":")[0]} this cannot fail on inputs such as ":".
+     */
+    private static String stripPort(String address) {
+        int colon = address.indexOf(':');
+        return colon < 0 ? address : address.substring(0, colon);
+    }
+}

+ 14 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/PrometheusAsyncClient.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusAsyncClient.java

@@ -1,5 +1,6 @@
-package cn.reghao.devops.mgr.ops.srv.mon;
+package cn.reghao.devops.mgr.ops.mon;
 
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import lombok.extern.slf4j.Slf4j;
 
 import java.net.URI;
@@ -48,6 +49,18 @@ public class PrometheusAsyncClient {
                         .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)));
     }
 
+    /**
+     * Starts one query per inspection task and combines the outcomes into a single map.
+     *
+     * @param queries inspection tasks keyed by alias; each task's PromQL is sent via
+     *                {@code fetchSingleMetric}
+     * @return a future that completes once every query has completed, carrying the entries
+     *         produced by {@code fetchSingleMetric} collected into a map keyed by alias
+     */
+    public CompletableFuture<Map<String, String>> fetchAllMetrics1(Map<String, InspectionTask> queries) {
+        List<CompletableFuture<Map.Entry<String, String>>> pending = queries.entrySet().stream()
+                .map(taskEntry -> {
+                    String alias = taskEntry.getKey();
+                    String promql = taskEntry.getValue().getPromql();
+                    return fetchSingleMetric(alias, promql);
+                })
+                .toList();
+
+        // allOf only signals completion; the individual results are read back via join() afterwards.
+        CompletableFuture<?>[] pendingArray = pending.toArray(CompletableFuture[]::new);
+        CompletableFuture<Void> allDone = CompletableFuture.allOf(pendingArray);
+        return allDone.thenApply(ignored -> pending.stream()
+                .map(CompletableFuture::join)
+                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)));
+    }
+
     public CompletableFuture<Map.Entry<String, String>> fetchSingleMetric(String alias, String query) {
         String encodedUrl = prometheusBaseUrl + "/api/v1/query?query=" + URLEncoder.encode(query, StandardCharsets.UTF_8);
         HttpRequest request = HttpRequest.newBuilder()

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/PrometheusClientManager.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusClientManager.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon;
+package cn.reghao.devops.mgr.ops.mon;
 
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepository;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;

+ 83 - 107
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/PrometheusService.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusService.java

@@ -1,6 +1,9 @@
-package cn.reghao.devops.mgr.ops.srv.mon;
+package cn.reghao.devops.mgr.ops.mon;
 
-import cn.reghao.devops.mgr.ops.srv.mon.model.*;
+import cn.reghao.devops.mgr.ops.mon.model.*;
+import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
+import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.jutil.jdk.converter.DateTimeConverter;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
@@ -918,98 +921,31 @@ public class PrometheusService {
         tasks.put("net_tcp_drops", """
         topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))
         """);
-
-        // 新增的三个任务
         tasks.put("node_oom", "increase(node_vmstat_oom_kill[24h])");
         tasks.put("node_clock", "abs(node_timex_offset_seconds)");
         tasks.put("node_ro_fs", "node_filesystem_readonly{mountpoint='/'}");
+        tasks.put("node_zombie", "node_processes_state{state='Z'}");
         // 逻辑:计算 node 级别每秒上下文切换次数
         tasks.put("node_context_switch", """
                 topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))
                 """);
 
-        /*Map<String, String> tasks = Map.of(
-                "container_cpu", """
-                        topk(10,\s
-                          sum(
-                            label_replace(
-                              increase(container_cpu_cfs_throttled_seconds_total[24h]),
-                              "instance", "$1", "instance", "([^:]+):.*"
-                            )
-                          ) by (name, instance)
-                        )
-                        """,
-                "container_mem", """
-                        topk(10,\s
-                          avg by (name, instance) (
-                            label_replace(
-                              (container_memory_working_set_bytes{name!=""} / container_spec_memory_limit_bytes > 0) * 100,
-                              "instance", "$1", "instance", "([^:]+):.*"
-                            )
-                          )
-                        )
-                        """,
-                "node_disk", """
-                        avg_over_time(
-                            label_replace(
-                              irate(node_disk_io_time_seconds_total[10m]),
-                              "instance", "$1", "instance", "([^:]+):.*"
-                            )[24h:1m]
-                          )
-                        """,
-                "node_inode", """
-                        topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)
-                        """,
-                "node_fd", """
-                        topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)
-                        """,
-                "node_disk_usage", """
-                        topk(10, (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100)
-                        """,
-                // 24h 内 TCP 正常连接的最大并发数
-                "net_tcp_est_max", """
-                        max_over_time(label_replace(node_netstat_Tcp_CurrEstab, "state", "ESTABLISHED", "", "")[24h:])
-                        """,
-                // 24h 内 TCP 等待关闭连接的最大堆积数
-                "net_tcp_tw_max", """
-                        max_over_time(label_replace(node_sockstat_TCP_tw, "state", "TIME_WAIT", "", "")[24h:])
-                        """,
-                // TCP 全连接队列溢出 (ListenOverflows) - 24h 增量
-                // 如果这个值 > 0,说明应用 backlog 满了,正在丢弃新连接
-                "net_tcp_overflow", """
-                        topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))
-                        """,
-                // TCP 丢弃计数 (TcpExt_ListenDrops) - 24h 增量
-                "net_tcp_drops", """
-                        topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))
-                        """
-        );*/
+        List<InspectionTask> allTasks = getAllTasks();
+        Map<String, InspectionTask> taskMap = allTasks.stream()
+                .collect(Collectors.toMap(
+                        InspectionTask::getTaskKey,
+                        task -> task
+                ));
+
 
         return prometheusClientManager.getClient()
-                .fetchAllMetrics(tasks)
-                .thenApply(this::processResults1) // 这里 processResults1 需要改为返回 DailyReportDTO
+                .fetchAllMetrics1(taskMap)
+                .thenApply(result -> {
+                    return processResults1(taskMap, result);
+                }) // 这里 processResults1 需要改为返回 DailyReportDTO
                 .join();
     }
 
-    public DailyReportDTO processResults00(Map<String, String> results) {
-        DailyReportDTO report = new DailyReportDTO();
-
-        try {
-            report.setCpuThrottled(parsePrometheusJson(results.get("container_cpu"), "name"));
-            report.setMemRisk(parsePrometheusJson(results.get("container_mem"), "name"));
-            report.setDiskIo(parsePrometheusJson(results.get("node_disk"), "instance"));
-
-            // 简单的状态判定逻辑
-            if (report.getMemRisk().stream().anyMatch(i -> i.getValue() > 90)) {
-                report.setStatusSummary("存在内存风险点");
-            }
-        } catch (Exception e) {
-            log.error("解析监控数据失败", e);
-        }
-
-        return report;
-    }
-
     // 定义全局风险阈值
     private static final double MEM_RISK_THRESHOLD = 30.0;      // 内存超过85%需注意
     private static final double CPU_THROTTLE_THRESHOLD = 1000.0; // 24h节流超过10秒需注意
@@ -1025,65 +961,102 @@ public class PrometheusService {
     private static final double READONLY_FS_THRESHOLD = 1.0;   // 存在只读文件系统
     private static final double ZOMBIE_PROCS_THRESHOLD = 5.0;  // 僵尸进程过多
     // 假设是 8 核机器,总切换数超过 50,000 需注意
-    private static final double CONTEXT_SWITCH_THRESHOLD = 50000.0;
+    private static final double CONTEXT_SWITCH_THRESHOLD = 1000.0;
 
-    public DailyReportDTO processResults1(Map<String, String> results) {
+    /**
+     * Supplies the inspection tasks used by this service.
+     * <p>
+     * Currently a placeholder: it always yields a new, empty, mutable list.
+     *
+     * @return an empty list of inspection tasks
+     */
+    public static List<InspectionTask> getAllTasks() {
+        return new ArrayList<>();
+    }
+
+    public DailyReportDTO processResults1(Map<String, InspectionTask> taskMap, Map<String, String> results) {
         DailyReportDTO report = new DailyReportDTO();
         try {
             // 1. 解析并筛选 CPU 节流 (只保留显著受限的容器)
             report.setCpuThrottled(
-                    parseAndFilter(results.get("container_cpu"), "name", CPU_THROTTLE_THRESHOLD)
+                    parseAndFilter(results.get("container_cpu"), "name", taskMap.get("container_cpu").getThreshold())
             );
+            report.getAdvices().put("container_cpu", taskMap.get("container_cpu").getAdvice());
 
             // 2. 解析并筛选 内存风险 (只保留接近 Limit 的容器)
             report.setMemRisk(
-                    parseAndFilter(results.get("container_mem"), "name", MEM_RISK_THRESHOLD)
+                    parseAndFilter(results.get("container_mem"), "name", taskMap.get("container_mem").getThreshold())
             );
+            report.getAdvices().put("container_mem", taskMap.get("container_mem").getAdvice());
 
             // 3. 解析并筛选 磁盘 IO (只保留高负载节点)
             report.setDiskIo(
-                    parseAndFilter(results.get("node_disk"), "instance", DISK_IO_THRESHOLD)
+                    parseAndFilter(results.get("node_disk"), "instance", taskMap.get("node_disk").getThreshold())
             );
+            report.getAdvices().put("node_disk", taskMap.get("node_disk").getAdvice());
 
             report.setInodeRisk(
-                    parseAndFilter(results.get("node_inode"), "instance", INODE_RISK_THRESHOLD)
+                    parseAndFilter(results.get("node_inode"), "instance", taskMap.get("node_inode").getThreshold())
             );
+            report.getAdvices().put("node_inode", taskMap.get("node_inode").getAdvice());
 
             // 2. 新增:解析并筛选文件句柄风险
             report.setFdRisk(
-                    parseAndFilter(results.get("node_fd"), "instance", FD_RISK_THRESHOLD)
+                    parseAndFilter(results.get("node_fd"), "instance", taskMap.get("node_fd").getThreshold())
             );
+            report.getAdvices().put("node_fd", taskMap.get("node_fd").getAdvice());
 
             report.setDiskUsageRisk(
-                    parseAndFilter(results.get("node_disk_usage"), "instance", DISK_USAGE_THRESHOLD)
+                    parseAndFilter(results.get("node_disk_usage"), "instance", taskMap.get("node_disk_usage").getThreshold())
             );
+            report.getAdvices().put("node_disk_usage", taskMap.get("node_disk_usage").getAdvice());
 
             // 解析 24h TCP EST 峰值
             report.setNetEstMax(
-                    parseAndFilter(results.get("net_tcp_est_max"), "state", TCP_EST_THRESHOLD)
+                    parseAndFilter(results.get("net_tcp_est_max"), "state", taskMap.get("net_tcp_est_max").getThreshold())
             );
+            report.getAdvices().put("net_tcp_est_max", taskMap.get("net_tcp_est_max").getAdvice());
 
             // 解析并筛选 24h TCP TIME_WAIT 风险
             report.setNetTwMax(
-                    parseAndFilter(results.get("net_tcp_tw_max"), "state", TCP_TW_THRESHOLD)
+                    parseAndFilter(results.get("net_tcp_tw_max"), "state", taskMap.get("net_tcp_tw_max").getThreshold())
             );
+            report.getAdvices().put("net_tcp_tw_max", taskMap.get("net_tcp_tw_max").getAdvice());
 
             // 筛选全连接队列溢出和丢弃
             report.setNetOverflows(
-                    parseAndFilter(results.get("net_tcp_overflow"), "instance", NET_DROP_THRESHOLD)
+                    parseAndFilter(results.get("net_tcp_overflow"), "instance", taskMap.get("net_tcp_overflow").getThreshold())
             );
+            report.getAdvices().put("net_tcp_overflow", taskMap.get("net_tcp_overflow").getAdvice());
+
             report.setNetDrops(
-                    parseAndFilter(results.get("net_tcp_drops"), "instance", NET_DROP_THRESHOLD)
+                    parseAndFilter(results.get("net_tcp_drops"), "instance", taskMap.get("net_tcp_drops").getThreshold())
             );
-
-            if (!report.getMemRisk().isEmpty()) {
-                report.getAdvices().put("memRisk", "内存风险排查流程");
-            }
+            report.getAdvices().put("net_tcp_drops", taskMap.get("net_tcp_drops").getAdvice());
 
             // 2. 解析并筛选上下文切换风险
             report.setContextSwitchRisk(
-                    parseAndFilter(results.get("node_context_switch"), "instance", CONTEXT_SWITCH_THRESHOLD)
+                    parseAndFilter(results.get("node_context_switch"), "instance", taskMap.get("node_context_switch").getThreshold())
             );
+            report.getAdvices().put("node_context_switch", taskMap.get("node_context_switch").getAdvice());
+
+            // 2. OOM Kill 事件 (24h 增量)
+            report.setOomEvents(
+                    parseAndFilter(results.get("node_oom"), "instance", taskMap.get("node_oom").getThreshold())
+            );
+            report.getAdvices().put("node_oom", taskMap.get("node_oom").getAdvice());
+
+            // 3. 时钟偏移 (绝对值)
+            report.setClockSkewRisk(
+                    parseAndFilter(results.get("node_clock"), "instance", taskMap.get("node_clock").getThreshold())
+            );
+            report.getAdvices().put("node_clock", taskMap.get("node_clock").getAdvice());
+
+            // 4. 只读文件系统 (状态值)
+            report.setReadOnlyFsRisk(
+                    parseAndFilter(results.get("node_ro_fs"), "instance", taskMap.get("node_ro_fs").getThreshold())
+            );
+            report.getAdvices().put("node_ro_fs", taskMap.get("node_ro_fs").getAdvice());
+
+            // 5. 僵尸进程/阻塞进程
+            report.setZombieRisk(
+                    parseAndFilter(results.get("node_zombie"), "instance", taskMap.get("node_zombie").getThreshold())
+            );
+            report.getAdvices().put("node_zombie", taskMap.get("node_zombie").getAdvice());
 
             // 计算汇总状态
             long totalIssues = report.getCpuThrottled().size() +
@@ -1095,7 +1068,12 @@ public class PrometheusService {
                     report.getNetEstMax().size() +
                     report.getNetTwMax().size() +
                     report.getNetOverflows().size() +
-                    report.getNetDrops().size();
+                    report.getNetDrops().size() +
+                    report.getContextSwitchRisk().size() +
+                    report.getOomEvents().size() +
+                    report.getClockSkewRisk().size() +
+                    report.getReadOnlyFsRisk().size() +
+                    report.getZombieRisk().size();
 
             report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
         } catch (Exception e) {
@@ -1105,8 +1083,8 @@ public class PrometheusService {
         return report;
     }
 
-    private List<DailyReportDTO.MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
-        List<DailyReportDTO.MetricItem> filteredItems = new ArrayList<>();
+    private List<MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
+        List<MetricItem> filteredItems = new ArrayList<>();
         JsonNode resultNode = objectMapper.readTree(json).path("data").path("result");
 
         if (resultNode.isArray()) {
@@ -1123,8 +1101,7 @@ public class PrometheusService {
                         name1 = instance.split(":")[0];
                         instance = name1;
                     }
-
-                    filteredItems.add(new DailyReportDTO.MetricItem(name1, instance, Math.round(val * 100.0) / 100.0, 0.0));
+                    filteredItems.add(new MetricItem(name1, instance, Math.round(val * 100.0) / 100.0));
                 }
             }
         }
@@ -1133,8 +1110,8 @@ public class PrometheusService {
         return filteredItems;
     }
 
-    private List<DailyReportDTO.MetricItem> parsePrometheusJson(String json, String nameLabel) throws Exception {
-        List<DailyReportDTO.MetricItem> items = new ArrayList<>();
+    private List<MetricItem> parsePrometheusJson(String json, String nameLabel) throws Exception {
+        List<MetricItem> items = new ArrayList<>();
         JsonNode root = objectMapper.readTree(json);
         JsonNode resultNode = root.path("data").path("result");
 
@@ -1151,8 +1128,7 @@ public class PrometheusService {
                 String name = metric.path(nameLabel).asText("unknown");
                 String instance = metric.path("instance").asText("unknown");
                 Double val = valueArray.get(1).asDouble();
-
-                items.add(new DailyReportDTO.MetricItem(name, instance, Math.round(val * 100.0) / 100.0, 0.0));
+                items.add(new MetricItem(name, instance, Math.round(val * 100.0) / 100.0));
             }
         }
         return items;

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/AnalysisResult.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/AnalysisResult.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/ContainerInfo.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/ContainerInfo.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/ContainerReportVO.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/ContainerReportVO.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/CpuThresholdConfig.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/CpuThresholdConfig.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 /**
  * @author reghao

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/HostData.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostData.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/HostGroup.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostGroup.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/HostInfo.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/HostInfo.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/MetricGroup.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/MetricGroup.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/MetricRecord.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/MetricRecord.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/PillarReportDTO.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/PillarReportDTO.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model;
 
 import lombok.Data;
 

+ 14 - 12
mgr/src/main/java/cn/reghao/devops/mgr/ops/srv/mon/model/DailyReportDTO.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/DailyReportDTO.java

@@ -1,6 +1,5 @@
-package cn.reghao.devops.mgr.ops.srv.mon.model;
+package cn.reghao.devops.mgr.ops.mon.model.dto;
 
-import lombok.AllArgsConstructor;
 import lombok.Data;
 
 import java.time.LocalDate;
@@ -20,6 +19,8 @@ public class DailyReportDTO {
     private int totalNodes = 10;
     private String checkDuration = "1.2s";
     private String riskLevel = "Medium";
+    private Map<String, String> advices = new HashMap<>();
+    private Map<String, String> categoryMap = new HashMap<>();
 
     private List<MetricItem> cpuThrottled = new ArrayList<>();
     private List<MetricItem> memRisk = new ArrayList<>();
@@ -32,15 +33,16 @@ public class DailyReportDTO {
     private List<MetricItem> netOverflows = new ArrayList<>();
     private List<MetricItem> netDrops = new ArrayList<>();
     private List<MetricItem> contextSwitchRisk = new ArrayList<>();
+    private List<MetricItem> oomEvents = new ArrayList<>();
+    private List<MetricItem> clockSkewRisk = new ArrayList<>();
+    private List<MetricItem> readOnlyFsRisk = new ArrayList<>();
+    private List<MetricItem> zombieRisk = new ArrayList<>();
 
-    private Map<String, String> advices = new HashMap<>();
-
-    @Data
-    @AllArgsConstructor
-    public static class MetricItem {
-        private String name;      // 容器名或实例名
-        private String instance;  // 宿主机
-        private Double value;     // 当前数值
-        private Double compare;   // 环比增长 (可选)
-    }
+    private List<MetricItem> nodeLoadRisk = new ArrayList<>();
+    private List<MetricItem> memPsiRisk = new ArrayList<>();
+    private List<MetricItem> diskPredictRisk = new ArrayList<>();
+    private List<MetricItem> pidLimitRisk = new ArrayList<>();
+    private List<MetricItem> netBandwidthRisk = new ArrayList<>();
+    private List<MetricItem> tcpRetransRisk = new ArrayList<>();
+    private List<MetricItem> conntrackRisk = new ArrayList<>();
 }

+ 16 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/MetricItem.java

@@ -0,0 +1,16 @@
+package cn.reghao.devops.mgr.ops.mon.model.dto;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+
+/**
+ * A single metric data point: a display name, the instance it was observed on,
+ * and the numeric value. Accessors and the three-argument constructor
+ * (name, instance, value) are generated by Lombok.
+ *
+ * @author reghao
+ * @date 2026-04-03 18:43:58
+ */
+@Data
+@AllArgsConstructor
+public class MetricItem {
+    private String name;      // container name or instance name
+    private String instance;  // host machine
+    private Double value;     // current value
+}

+ 50 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/po/InspectionTask.java

@@ -0,0 +1,50 @@
+package cn.reghao.devops.mgr.ops.mon.model.po;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import jakarta.persistence.*;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import org.hibernate.annotations.UpdateTimestamp;
+
+import java.io.Serializable;
+import java.time.LocalDateTime;
+
+/**
+ * JPA entity mapped to the {@code devops_inspection_task} table. Each row
+ * describes one inspection check: the PromQL query to run, the threshold to
+ * compare against, and the advice to show when the threshold is exceeded.
+ * <p>
+ * The class implements {@link Serializable} so that the declared
+ * {@code serialVersionUID} is actually meaningful.
+ *
+ * @author reghao
+ * @date 2026-04-03 18:25:05
+ */
+@Data
+@NoArgsConstructor
+@Entity
+@Table(name = "devops_inspection_task")
+public class InspectionTask implements Serializable {
+    private static final long serialVersionUID = 1L;
+    @Id
+    @GeneratedValue(strategy = GenerationType.IDENTITY)
+    private Integer id;
+    @UpdateTimestamp
+    @Column(nullable = false)
+    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss")
+    private LocalDateTime updateTime;
+
+    @Column(nullable = false, unique = true)
+    private String taskKey;      // task identifier, e.g. "container_cpu"
+    @Column(nullable = false)
+    private String category;
+    @Column(nullable = false)
+    private String taskDesc;         // description of what the task checks
+    @Column(nullable = false)
+    private String promql;       // query statement
+    @Column(nullable = false)
+    private double threshold;    // threshold
+    @Column(nullable = false)
+    private String advice;       // remediation advice once the threshold is exceeded
+
+    /**
+     * Creates a task with every business field set; {@code id} and
+     * {@code updateTime} are left unset by this constructor.
+     *
+     * @param taskKey   unique task identifier
+     * @param category  category the task belongs to
+     * @param taskDesc  description of what the task checks
+     * @param promql    PromQL query statement
+     * @param threshold threshold value for the query result
+     * @param advice    remediation advice once the threshold is exceeded
+     */
+    public InspectionTask(String taskKey, String category, String taskDesc, String promql, double threshold, String advice) {
+        this.taskKey = taskKey;
+        this.category = category;
+        this.taskDesc = taskDesc;
+        this.promql = promql;
+        this.threshold = threshold;
+        this.advice = advice;
+    }
+}

+ 12 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/repository/InspectionTaskRepository.java

@@ -0,0 +1,12 @@
+package cn.reghao.devops.mgr.ops.mon.repository;
+
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
+import org.springframework.data.jpa.repository.JpaRepository;
+
+/**
+ * Spring Data JPA repository for {@link InspectionTask} entities, keyed by
+ * their {@code Integer} id.
+ * <p>
+ * {@code findByTaskKey} is a derived query method that looks a task up by its
+ * {@code taskKey} property. NOTE(review): presumably yields {@code null} when
+ * no row matches - confirm against callers.
+ *
+ * @author reghao
+ * @date 2026-04-03 19:38:58
+ */
+public interface InspectionTaskRepository extends JpaRepository<InspectionTask, Integer> {
+    InspectionTask findByTaskKey(String taskKey);
+}

+ 1 - 2
mgr/src/main/resources/application-test.yml

@@ -4,5 +4,4 @@ spring:
     username: azytest
     password: Azy@123456
 app:
-  ops-root: /opt/data/devops_data
-  prometheus-base-url: http://prometheus.iquizoo.cn
+  ops-root: /opt/data/devops_data

+ 109 - 0
mgr/src/test/java/devops/AppConfigTest.java

@@ -8,6 +8,8 @@ import cn.reghao.devops.mgr.ops.app.db.repository.AppConfigRepository;
 import cn.reghao.devops.mgr.ops.app.db.repository.AppDeployConfigRepository;
 import cn.reghao.devops.mgr.ops.builder.model.po.AppBuilding;
 import cn.reghao.devops.mgr.ops.builder.model.po.AppDeploying;
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
+import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
 import lombok.extern.slf4j.Slf4j;
 import org.junit.jupiter.api.Test;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -61,4 +63,111 @@ public class AppConfigTest {
                 .collect(Collectors.toList());
         appDeployingRepository.saveAll(list);
     }
+
+    @Autowired
+    InspectionTaskRepository inspectionTaskRepository;
+    /**
+     * Persists every task built by {@link #getAllTasks()} through the
+     * inspection task repository in a single {@code saveAll} call.
+     */
+    @Test
+    public void initInspectionTask() {
+        inspectionTaskRepository.saveAll(getAllTasks());
+    }
+
+    /**
+     * Builds the default inspection task definitions, grouped by category.
+     * Each entry is constructed as
+     * (taskKey, category, taskDesc, promql, threshold, advice).
+     *
+     * @return a new mutable list holding one {@link InspectionTask} per check
+     */
+    public List<InspectionTask> getAllTasks() {
+        List<InspectionTask> tasks = new ArrayList<>();
+
+        // --- Category A: compute resources and scheduling (Compute) ---
+        tasks.add(new InspectionTask("cpuThrottled", "计算资源", "容器CPU节流",
+                "topk(10, sum(label_replace(increase(container_cpu_cfs_throttled_seconds_total[24h]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")) by (name, instance))",
+                1000.0, "建议:调大容器CPU Limit或优化线程池。"));
+
+        tasks.add(new InspectionTask("nodeLoadRisk", "计算资源", "节点CPU负载饱和度",
+                "node_load5 / count by (instance) (node_cpu_seconds_total{mode=\"idle\"})",
+                1.2, "建议:Load高于核心数,说明进程正在排队,请检查系统瓶颈。"));
+
+        tasks.add(new InspectionTask("contextSwitchRisk", "计算资源", "上下文切换风险",
+                "topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))",
+                50000.0, "建议:减少线程竞争或优化锁逻辑。"));
+
+        tasks.add(new InspectionTask("zombieRisk", "计算资源", "僵尸进程风险",
+                "node_processes_state{state='Z'}",
+                5.0, "建议:修复父进程回收逻辑,防止PID泄露。"));
+
+        // --- Category B: memory and runtime (Memory) ---
+        // The "> 0" filter is applied to the limit itself: comparison binds looser
+        // than division in PromQL, so "a / b > 0" would keep the +Inf produced by
+        // containers whose limit is 0 and let them dominate topk.
+        tasks.add(new InspectionTask("memRisk", "内存指标", "容器内存风险",
+                "topk(10, avg by (name, instance) (label_replace((container_memory_working_set_bytes{name!=\"\"} / (container_spec_memory_limit_bytes > 0)) * 100, \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")))",
+                85.0, "建议:检查内存泄露或增加内存配额。"));
+
+        tasks.add(new InspectionTask("memPsiRisk", "内存指标", "内存紧缩压力(PSI)",
+                "rate(node_pressure_memory_some_seconds_total[5m])",
+                0.1, "建议:系统正在频繁页面置换,请检查高内存占用进程。"));
+
+        tasks.add(new InspectionTask("oomEvents", "内存指标", "OOM事件",
+                "increase(node_vmstat_oom_kill[24h])",
+                1.0, "建议:分析dmesg定位被杀进程,优化内存分配。"));
+
+        // PID usage is the current number of PIDs over the PID limit. The fork
+        // counter only ever grows since boot and says nothing about how many
+        // PIDs are in use right now.
+        tasks.add(new InspectionTask("pidLimitRisk", "内存指标", "PID进程数限制",
+                "(node_processes_pids / node_processes_max_processes) * 100",
+                80.0, "建议:PID快用完了,请检查是否存在大量短时进程或进程泄露。"));
+
+        // --- Category C: storage and file systems (Storage) ---
+        tasks.add(new InspectionTask("diskUsageRisk", "存储文件", "磁盘空间风险",
+                "topk(10, (1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100)",
+                80.0, "建议:清理日志或扩容磁盘。"));
+
+        tasks.add(new InspectionTask("diskPredictRisk", "存储文件", "磁盘存满预测(24h)",
+                "predict_linear(node_filesystem_avail_bytes{mountpoint=\"/\"}[6h], 24 * 3600) < 0",
+                0.0, "建议:磁盘预计在24小时内写满,请立即处理。"));
+
+        tasks.add(new InspectionTask("inodeRisk", "存储文件", "Inode使用率",
+                "topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)",
+                80.0, "建议:清理大量极小文件(如临时日志)。"));
+
+        tasks.add(new InspectionTask("diskIo", "存储文件", "磁盘IO负载",
+                "avg_over_time(label_replace(irate(node_disk_io_time_seconds_total[10m]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")[24h:1m])",
+                1.0, "建议:排查高I/O进程,检查存储后端健康度。"));
+
+        tasks.add(new InspectionTask("fdRisk", "存储文件", "文件句柄风险",
+                "topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)",
+                80.0, "建议:检查FD泄露,必要时调大ulimit。"));
+
+        tasks.add(new InspectionTask("readOnlyFsRisk", "存储文件", "只读分区风险",
+                "node_filesystem_readonly{mountpoint='/'}",
+                1.0, "建议:硬件故障触发只读挂载,请检修硬件。"));
+
+        // --- Category D: network protocol stack (Network) ---
+        tasks.add(new InspectionTask("conntrackRisk", "网络协议", "连接跟踪表(Conntrack)",
+                "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) * 100",
+                80.0, "建议:连接表快满了,请优化内核参数或清理连接。"));
+
+        tasks.add(new InspectionTask("netEstMax", "网络协议", "TCP连接峰值",
+                "max_over_time(label_replace(node_netstat_Tcp_CurrEstab, \"state\", \"ESTABLISHED\", \"\", \"\")[24h:])",
+                5000.0, "建议:评估业务并发,检查长连接泄露。"));
+
+        tasks.add(new InspectionTask("netTwMax", "网络协议", "TIME_WAIT堆积",
+                "max_over_time(label_replace(node_sockstat_TCP_tw, \"state\", \"TIME_WAIT\", \"\", \"\")[24h:])",
+                5000.0, "建议:开启tw_reuse或检查压测工具。"));
+
+        // node_network_speed_bytes is already expressed in bytes per second, so
+        // the byte rates are divided by it directly; converting the numerator to
+        // bits would overstate utilisation eightfold against the 0.8 threshold.
+        tasks.add(new InspectionTask("netBandwidthRisk", "网络协议", "网卡带宽利用率",
+                "(rate(node_network_receive_bytes_total[5m]) + rate(node_network_transmit_bytes_total[5m])) / node_network_speed_bytes > 0",
+                0.8, "建议:带宽接近极限,检查是否有大文件同步。"));
+
+        tasks.add(new InspectionTask("tcpRetransRisk", "网络协议", "TCP重传率",
+                "rate(node_netstat_Tcp_RetransSegs[5m]) / rate(node_netstat_Tcp_OutSegs[5m]) * 100",
+                1.0, "建议:重传率偏高,检查链路质量或对端压力。"));
+
+        tasks.add(new InspectionTask("netOverflows", "网络协议", "全连接队列溢出",
+                "topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))",
+                1.0, "建议:应用处理过慢,请调大backlog或优化逻辑。"));
+
+        tasks.add(new InspectionTask("netDrops", "网络协议", "TCP丢弃",
+                "topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))",
+                1.0, "建议:检查网络负载或内核缓冲区大小。"));
+
+        // --- Category E: base environment (Infrastructure) ---
+        tasks.add(new InspectionTask("clockSkewRisk", "基础环境", "时钟偏移风险",
+                "abs(node_timex_offset_seconds)",
+                0.5, "建议:检查NTP/Chrony同步状态。"));
+
+        return tasks;
+    }
 }