Browse Source

更新 ops/mon 模块

reghao 4 weeks ago
parent
commit
45be7c86b8
17 changed files with 381 additions and 594 deletions
  1. 6 6
      mgr/src/main/java/cn/reghao/devops/mgr/admin/controller/HomeController.java
  2. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/admin/service/SiteOptionService.java
  3. 2 2
      mgr/src/main/java/cn/reghao/devops/mgr/config/PrometheusAvailabilityAspect.java
  4. 0 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/app/controller/AppDeployConfigPageController.java
  5. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/machine/controller/MachineController.java
  6. 0 114
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/InspectionService.java
  7. 0 48
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/DailyReportDTO.java
  8. 21 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/InspectionReport.java
  9. 31 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/InspectionTaskResult.java
  10. 33 4
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/po/InspectionTask.java
  11. 10 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/repository/InspectionTaskRepository.java
  12. 181 0
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/InspectionService.java
  13. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusAsyncClient.java
  14. 1 1
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusClientManager.java
  15. 2 292
      mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusService.java
  16. 90 73
      mgr/src/test/java/devops/AppConfigTest.java
  17. 1 50
      mgr/src/test/java/devops/DockerTest.java

+ 6 - 6
mgr/src/main/java/cn/reghao/devops/mgr/admin/controller/HomeController.java

@@ -1,9 +1,9 @@
 package cn.reghao.devops.mgr.admin.controller;
 package cn.reghao.devops.mgr.admin.controller;
 
 
 import cn.reghao.devops.mgr.admin.service.HomeViewService;
 import cn.reghao.devops.mgr.admin.service.HomeViewService;
-import cn.reghao.devops.mgr.ops.mon.InspectionService;
-import cn.reghao.devops.mgr.ops.mon.PrometheusService;
-import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
+import cn.reghao.devops.mgr.ops.mon.service.InspectionService;
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusService;
+import cn.reghao.devops.mgr.ops.mon.model.dto.InspectionReport;
 import cn.reghao.jutil.web.WebResult;
 import cn.reghao.jutil.web.WebResult;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import io.swagger.v3.oas.annotations.tags.Tag;
@@ -42,10 +42,10 @@ public class HomeController {
     @ResponseBody
     @ResponseBody
     public String dashboardData() {
     public String dashboardData() {
         //DashboardData dashboardData = homeViewService.getDashboardData();
         //DashboardData dashboardData = homeViewService.getDashboardData();
-        DailyReportDTO pillarReportData = inspectionService.getDailyReportData();
+        InspectionReport inspectionReport = inspectionService.getInspectionReport();
         //PillarReportDTO pillarReportData = prometheusService.getPillarReportData();
         //PillarReportDTO pillarReportData = prometheusService.getPillarReportData();
-        if (pillarReportData != null) {
-            return WebResult.success(pillarReportData);
+        if (inspectionReport != null) {
+            return WebResult.success(inspectionReport);
         }
         }
 
 
         return WebResult.failWithMsg("get dashboard data failed");
         return WebResult.failWithMsg("get dashboard data failed");

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/admin/service/SiteOptionService.java

@@ -2,7 +2,7 @@ package cn.reghao.devops.mgr.admin.service;
 
 
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepo;
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepo;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;
-import cn.reghao.devops.mgr.ops.mon.PrometheusClientManager;
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusClientManager;
 import cn.reghao.devops.mgr.util.StringUtil;
 import cn.reghao.devops.mgr.util.StringUtil;
 import org.springframework.cache.annotation.CacheEvict;
 import org.springframework.cache.annotation.CacheEvict;
 import org.springframework.data.domain.Page;
 import org.springframework.data.domain.Page;

+ 2 - 2
mgr/src/main/java/cn/reghao/devops/mgr/config/PrometheusAvailabilityAspect.java

@@ -1,6 +1,6 @@
 package cn.reghao.devops.mgr.config;
 package cn.reghao.devops.mgr.config;
 
 
-import cn.reghao.devops.mgr.ops.mon.PrometheusClientManager;
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusClientManager;
 import org.aspectj.lang.ProceedingJoinPoint;
 import org.aspectj.lang.ProceedingJoinPoint;
 import org.aspectj.lang.annotation.Around;
 import org.aspectj.lang.annotation.Around;
 import org.aspectj.lang.annotation.Aspect;
 import org.aspectj.lang.annotation.Aspect;
@@ -20,7 +20,7 @@ public class PrometheusAvailabilityAspect {
     }
     }
 
 
     // 拦截 PrometheusService 下的所有公共方法
     // 拦截 PrometheusService 下的所有公共方法
-    @Around("execution(* cn.reghao.devops.mgr.ops.mon.PrometheusService.*(..))")
+    @Around("execution(* cn.reghao.devops.mgr.ops.mon.service.PrometheusService.*(..))")
     public Object checkAvailability(ProceedingJoinPoint joinPoint) throws Throwable {
     public Object checkAvailability(ProceedingJoinPoint joinPoint) throws Throwable {
         if (clientManager.getClient() == null) {
         if (clientManager.getClient() == null) {
             // 这里可以根据业务返回空对象、默认值或抛异常
             // 这里可以根据业务返回空对象、默认值或抛异常

+ 0 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/app/controller/AppDeployConfigPageController.java

@@ -12,7 +12,6 @@ import io.swagger.v3.oas.annotations.tags.Tag;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Operation;
 import lombok.extern.slf4j.Slf4j;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.MediaType;
 import org.springframework.http.MediaType;
-import org.springframework.ui.Model;
 import org.springframework.validation.annotation.Validated;
 import org.springframework.validation.annotation.Validated;
 import org.springframework.web.bind.annotation.*;
 import org.springframework.web.bind.annotation.*;
 
 

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/machine/controller/MachineController.java

@@ -4,7 +4,7 @@ import cn.reghao.devops.mgr.ops.machine.model.po.MachineInfo;
 import cn.reghao.devops.mgr.ops.machine.model.vo.MachineDetail;
 import cn.reghao.devops.mgr.ops.machine.model.vo.MachineDetail;
 import cn.reghao.devops.mgr.ops.machine.service.MachineQuery;
 import cn.reghao.devops.mgr.ops.machine.service.MachineQuery;
 import cn.reghao.devops.mgr.ops.machine.service.MachineService;
 import cn.reghao.devops.mgr.ops.machine.service.MachineService;
-import cn.reghao.devops.mgr.ops.mon.PrometheusService;
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusService;
 import cn.reghao.devops.mgr.ops.mon.model.ContainerReportVO;
 import cn.reghao.devops.mgr.ops.mon.model.ContainerReportVO;
 import cn.reghao.devops.mgr.util.SelectOption;
 import cn.reghao.devops.mgr.util.SelectOption;
 import cn.reghao.jutil.jdk.web.db.PageList;
 import cn.reghao.jutil.jdk.web.db.PageList;

+ 0 - 114
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/InspectionService.java

@@ -1,114 +0,0 @@
-package cn.reghao.devops.mgr.ops.mon;
-
-import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
-import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
-import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
-import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.springframework.stereotype.Service;
-
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-/**
- * @author reghao
- * @date 2026-04-03 18:51:27
- */
-@Service
-public class InspectionService {
-    private final ObjectMapper objectMapper;
-    private final PrometheusClientManager prometheusClientManager;
-    private final InspectionTaskRepository inspectionTaskRepository;
-
-    public InspectionService(ObjectMapper objectMapper,
-                             PrometheusClientManager prometheusClientManager,
-                             InspectionTaskRepository inspectionTaskRepository) {
-        this.objectMapper = objectMapper;
-        this.prometheusClientManager = prometheusClientManager;
-        this.inspectionTaskRepository = inspectionTaskRepository;
-    }
-
-    public DailyReportDTO getDailyReportData() {
-        List<InspectionTask> allTasks = inspectionTaskRepository.findAll();
-        Map<String, InspectionTask> taskMap = allTasks.stream()
-                .collect(Collectors.toMap(
-                        InspectionTask::getTaskKey,
-                        task -> task
-                ));
-
-
-        return prometheusClientManager.getClient()
-                .fetchAllMetrics1(taskMap)
-                .thenApply(resultMap -> processResults1(taskMap, resultMap)) // 这里 processResults1 需要改为返回 DailyReportDTO
-                .join();
-    }
-
-    public DailyReportDTO processResults1(Map<String, InspectionTask> taskMap, Map<String, String> resultMap) {
-        DailyReportDTO report = new DailyReportDTO();
-        int totalIssues = 0;
-
-        for (Map.Entry<String, InspectionTask> entry : taskMap.entrySet()) {
-            String fieldName = entry.getKey(); // 此时 taskKey 即为 DTO 字段名
-            InspectionTask task = entry.getValue();
-            String jsonResult = resultMap.get(fieldName);
-
-            if (jsonResult == null || jsonResult.isEmpty()) continue;
-
-            report.getCategoryMap().put(fieldName, task.getCategory());
-            try {
-                // 1. 自动判断 Label 映射
-                String labelName = "instance";
-                // 2. 解析并过滤数据
-                List<MetricItem> items = parseAndFilter(jsonResult, labelName, task.getThreshold());
-
-                if (!items.isEmpty()) {
-                    // 3. 反射写入 DTO 字段
-                    Field field = DailyReportDTO.class.getDeclaredField(fieldName);
-                    field.setAccessible(true);
-                    field.set(report, items);
-
-                    // 4. 写入建议
-                    report.getAdvices().put(fieldName, task.getAdvice());
-                    totalIssues += items.size();
-                }
-            } catch (NoSuchFieldException e) {
-                // 如果 taskKey 配置错了,找不到 DTO 字段,在这里捕获
-                System.err.println("配置错误: DailyReportDTO 中不存在字段 " + fieldName);
-            } catch (Exception e) {
-                e.printStackTrace();
-            }
-        }
-
-        report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
-        return report;
-    }
-
-    private List<MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
-        List<MetricItem> filteredItems = new ArrayList<>();
-        JsonNode resultNode = objectMapper.readTree(json).path("data").path("result");
-
-        if (!resultNode.isArray()) return filteredItems;
-
-        for (JsonNode node : resultNode) {
-            double val = node.path("value").get(1).asDouble();
-            if (val >= threshold) {
-                JsonNode metric = node.path("metric");
-                String instance = metric.path(nameLabel).asText("unknown").split(":")[0];
-                String name = metric.path("name").asText("unknown");
-                String displayName = String.format("%s_%s", instance, name);
-                String job = node.path("metric").path("job").asText("unknown");
-                if ("node-exporter".equals(job)) {
-                    displayName = instance.split(":")[0];
-                    instance = displayName;
-                }
-                filteredItems.add(new MetricItem(displayName, instance, Math.round(val * 100.0) / 100.0));
-            }
-        }
-        filteredItems.sort((a, b) -> b.getValue().compareTo(a.getValue()));
-        return filteredItems;
-    }
-}

+ 0 - 48
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/DailyReportDTO.java

@@ -1,48 +0,0 @@
-package cn.reghao.devops.mgr.ops.mon.model.dto;
-
-import lombok.Data;
-
-import java.time.LocalDate;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-/**
- * @author reghao
- * @date 2026-04-02 14:53:03
- */
-@Data
-public class DailyReportDTO {
-    private String lastUpdateTime = LocalDate.now().toString();
-    private String statusSummary = "正常";
-    private int totalNodes = 10;
-    private String checkDuration = "1.2s";
-    private String riskLevel = "Medium";
-    private Map<String, String> advices = new HashMap<>();
-    private Map<String, String> categoryMap = new HashMap<>();
-
-    private List<MetricItem> cpuThrottled = new ArrayList<>();
-    private List<MetricItem> memRisk = new ArrayList<>();
-    private List<MetricItem> diskIo = new ArrayList<>();
-    private List<MetricItem> inodeRisk = new ArrayList<>();
-    private List<MetricItem> fdRisk = new ArrayList<>();
-    private List<MetricItem> diskUsageRisk = new ArrayList<>();
-    private List<MetricItem> netEstMax = new ArrayList<>();
-    private List<MetricItem> netTwMax = new ArrayList<>();
-    private List<MetricItem> netOverflows = new ArrayList<>();
-    private List<MetricItem> netDrops = new ArrayList<>();
-    private List<MetricItem> contextSwitchRisk = new ArrayList<>();
-    private List<MetricItem> oomEvents = new ArrayList<>();
-    private List<MetricItem> clockSkewRisk = new ArrayList<>();
-    private List<MetricItem> readOnlyFsRisk = new ArrayList<>();
-    private List<MetricItem> zombieRisk = new ArrayList<>();
-
-    private List<MetricItem> nodeLoadRisk = new ArrayList<>();
-    private List<MetricItem> memPsiRisk = new ArrayList<>();
-    private List<MetricItem> diskPredictRisk = new ArrayList<>();
-    private List<MetricItem> pidLimitRisk = new ArrayList<>();
-    private List<MetricItem> netBandwidthRisk = new ArrayList<>();
-    private List<MetricItem> tcpRetransRisk = new ArrayList<>();
-    private List<MetricItem> conntrackRisk = new ArrayList<>();
-}

+ 21 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/InspectionReport.java

@@ -0,0 +1,21 @@
+package cn.reghao.devops.mgr.ops.mon.model.dto;
+
+import lombok.Data;
+
+import java.time.LocalDate;
+import java.util.*;
+
+/**
+ * Top-level payload of one inspection run: summary fields plus the per-task
+ * results, grouped by category for display.
+ *
+ * @author reghao
+ * @date 2026-04-02 14:53:03
+ */
+@Data
+public class InspectionReport {
+    private String lastUpdateTime = LocalDate.now().toString(); // date only (no time of day), captured when the report object is created
+    private String statusSummary = "正常"; // default text; InspectionService#processResults overwrites it
+    private int totalNodes = 10; // hard-coded default; NOTE(review): nothing visible in this change updates it
+    private String checkDuration = "1.2s"; // hard-coded default; NOTE(review): nothing visible in this change measures it
+    private String riskLevel = "Medium"; // default; InspectionService#processResults sets High / Medium / Low
+    private Map<String, List<String>> categoryMap = new LinkedHashMap<>(); // category name -> task keys in that category (insertion-ordered)
+    private Map<String, InspectionTaskResult> resultMap = new HashMap<>(); // task key -> result of that task
+}

+ 31 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/dto/InspectionTaskResult.java

@@ -0,0 +1,31 @@
+package cn.reghao.devops.mgr.ops.mon.model.dto;
+
+import lombok.Data;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Result of a single inspection task: the task's configuration and display
+ * settings (copied from InspectionTask by InspectionService) together with
+ * the metric samples that matched the task's threshold rule.
+ *
+ * @author reghao
+ * @date 2026-04-07 10:06:34
+ */
+@Data
+public class InspectionTaskResult {
+    private String taskKey;
+    private String title;         // description of what the task checks
+    private String subtitle;
+    private String valueType;
+    private String operator;
+    private double threshold;    // threshold the sample value is compared against
+    private String advice;       // remediation advice shown once the threshold is exceeded
+
+    // --- UI-related fields ---
+    private String icon;
+    private String textColor;
+    private String tagType;
+    private String progColor;
+    private String unit;
+    private boolean ratio;
+    private String emptyText;
+    private List<MetricItem> metricItemList = new ArrayList<>(); // samples that satisfied the operator/threshold comparison
+}

+ 33 - 4
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/model/po/InspectionTask.java

@@ -31,20 +31,49 @@ public class InspectionTask {
     @Column(nullable = false)
     @Column(nullable = false)
     private String category;
     private String category;
     @Column(nullable = false)
     @Column(nullable = false)
-    private String taskDesc;         // 任务作用描述
-    @Column(nullable = false)
+    private String title;         // 任务作用描述
+    private String subtitle;
+    @Column(nullable = false, length = 1000)
     private String promql;       // 查询语句
     private String promql;       // 查询语句
+    // 数值类型:INSTANT(瞬时), AVG(平均), MAX(最高), INCREASE(增量), MIN(最低)
+    @Column(nullable = false)
+    private String valueType;
+    // 比较操作符:gt(>), lt(<), eq(==), ne(!=)
+    @Column(nullable = false)
+    private String operator;
     @Column(nullable = false)
     @Column(nullable = false)
     private double threshold;    // 阈值
     private double threshold;    // 阈值
     @Column(nullable = false)
     @Column(nullable = false)
     private String advice;       // 超过阈值后的修复建议
     private String advice;       // 超过阈值后的修复建议
 
 
-    public InspectionTask(String taskKey, String category, String taskDesc, String promql, double threshold, String advice) {
+    // --- UI 相关字段 ---
+    private String icon;
+    private String textColor;
+    private String tagType;
+    private String progColor;
+    private String unit;
+    private boolean ratio;
+    private String emptyText;
+
+    public InspectionTask(String taskKey, String category, String title, String subtitle,
+                          String promql, String valueType, String operator, double threshold,
+                          String advice, String icon, String textColor, String tagType, String progColor, String unit,
+                          boolean ratio, String emptyText) {
         this.taskKey = taskKey;
         this.taskKey = taskKey;
         this.category = category;
         this.category = category;
-        this.taskDesc = taskDesc;
+        this.title = title;
+        this.subtitle = subtitle;
         this.promql = promql;
         this.promql = promql;
+        this.valueType = valueType;
+        this.operator = operator;
         this.threshold = threshold;
         this.threshold = threshold;
         this.advice = advice;
         this.advice = advice;
+        this.icon = icon;
+        this.textColor = textColor;
+        this.tagType = tagType;
+        this.progColor = progColor;
+        this.unit = unit;
+        this.ratio = ratio;
+        this.emptyText = emptyText;
     }
     }
 }
 }

+ 10 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/repository/InspectionTaskRepository.java

@@ -2,11 +2,21 @@ package cn.reghao.devops.mgr.ops.mon.repository;
 
 
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import org.springframework.data.jpa.repository.JpaRepository;
 import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.Query;
+
+import java.util.List;
 
 
 /**
 /**
  * @author reghao
  * @author reghao
  * @date 2026-04-03 19:38:58
  * @date 2026-04-03 19:38:58
  */
  */
 public interface InspectionTaskRepository extends JpaRepository<InspectionTask, Integer> {
 public interface InspectionTaskRepository extends JpaRepository<InspectionTask, Integer> {
+    /**
+     * 获取所有分类名称列表
+     * 用于前端 categories 数组的动态生成
+     */
+    @Query("SELECT DISTINCT t.category FROM InspectionTask t")
+    List<String> findDistinctCategories();
+
     InspectionTask findByTaskKey(String taskKey);
     InspectionTask findByTaskKey(String taskKey);
 }
 }

+ 181 - 0
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/InspectionService.java

@@ -0,0 +1,181 @@
+package cn.reghao.devops.mgr.ops.mon.service;
+
+import cn.reghao.devops.mgr.ops.mon.model.dto.InspectionReport;
+import cn.reghao.devops.mgr.ops.mon.model.dto.InspectionTaskResult;
+import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
+import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
+import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * @author reghao
+ * @date 2026-04-03 18:51:27
+ */
+@Slf4j
+@Service
+public class InspectionService {
+    private final ObjectMapper objectMapper;
+    private final PrometheusClientManager prometheusClientManager;
+    private final InspectionTaskRepository inspectionTaskRepository;
+
+    public InspectionService(ObjectMapper objectMapper,
+                             PrometheusClientManager prometheusClientManager,
+                             InspectionTaskRepository inspectionTaskRepository) {
+        this.objectMapper = objectMapper;
+        this.prometheusClientManager = prometheusClientManager;
+        this.inspectionTaskRepository = inspectionTaskRepository;
+    }
+
+    public InspectionReport getInspectionReport() {
+        Map<String, List<String>> categoryMap = new LinkedHashMap<>();
+        for (String category : inspectionTaskRepository.findDistinctCategories()) {
+            categoryMap.putIfAbsent(category, new ArrayList<>());
+        }
+
+        // 1. 获取所有配置好的巡检任务
+        List<InspectionTask> allTasks = inspectionTaskRepository.findAll();
+        Map<String, InspectionTask> taskMap = allTasks.stream()
+                .collect(Collectors.toMap(InspectionTask::getTaskKey, task -> task));
+
+        // 2. 调用 Prometheus 获取数据并异步处理
+        return prometheusClientManager.getClient()
+                .fetchAllMetrics1(taskMap)
+                .thenApply(rawJsonMap -> processResults(categoryMap, taskMap, rawJsonMap))
+                .join();
+    }
+
+    public InspectionReport processResults(Map<String, List<String>> categoryMap, Map<String, InspectionTask> taskMap, Map<String, String> rawJsonMap) {
+        InspectionReport report = new InspectionReport();
+        int totalIssues = 0;
+
+        for (Map.Entry<String, InspectionTask> entry : taskMap.entrySet()) {
+            String taskKey = entry.getKey();
+            InspectionTask config = entry.getValue();
+            String jsonResult = rawJsonMap.get(taskKey);
+
+            if (jsonResult == null || jsonResult.isEmpty()) continue;
+
+            try {
+                // 1. 解析并根据配置的阈值过滤数据
+                List<MetricItem> filteredItems = parseAndFilter(jsonResult, "instance", config.getOperator(), config.getThreshold());
+
+                // 2. 创建结果包装类 (无论是否有异常,都建议放入结果集,以便前端显示“正常”状态)
+                // 如果你希望只有异常才显示,可以在这里加 if (!filteredItems.isEmpty())
+                InspectionTaskResult taskResult = new InspectionTaskResult();
+
+                // 将数据库中的 UI 配置拷贝到 Result 中供前端渲染
+                taskResult.setTaskKey(taskKey);
+                taskResult.setTitle(config.getTitle());
+                taskResult.setSubtitle(config.getSubtitle());
+                taskResult.setValueType(config.getValueType());
+                taskResult.setOperator(config.getOperator());
+                taskResult.setThreshold(config.getThreshold());
+                taskResult.setAdvice(config.getAdvice());
+                taskResult.setIcon(config.getIcon());
+                taskResult.setTextColor(config.getTextColor());
+                taskResult.setTagType(config.getTagType());
+                taskResult.setProgColor(config.getProgColor());
+                taskResult.setUnit(config.getUnit());
+                taskResult.setRatio(config.isRatio());
+                taskResult.setEmptyText(config.getEmptyText());
+
+                // 放入过滤后的指标列表
+                taskResult.setMetricItemList(filteredItems);
+
+                // 3. 存入 DTO 的 Map 结构
+                report.getResultMap().put(taskKey, taskResult);
+
+                // 4. 记录分类信息(用于前端分组显示)
+                categoryMap.get(config.getCategory()).add(taskKey);
+                //report.getCategoryMap().put(taskKey, config.getCategory());
+                //List<String> list = report.getCategoryMap().computeIfAbsent(config.getCategory(), v -> new ArrayList<>() );
+                //list.add(taskKey);
+                totalIssues += filteredItems.size();
+            } catch (Exception e) {
+                log.error("解析巡检项失败: " + taskKey, e);
+            }
+        }
+
+        // 更新汇总状态
+        report.setCategoryMap(categoryMap);
+        report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
+        report.setRiskLevel(totalIssues > 5 ? "High" : (totalIssues > 0 ? "Medium" : "Low"));
+        return report;
+    }
+
+    private List<MetricItem> parseAndFilter(String json, String nameLabel, String operator, double threshold) throws Exception {
+        List<MetricItem> filteredItems = new ArrayList<>();
+        JsonNode root = objectMapper.readTree(json);
+        JsonNode resultNode = root.path("data").path("result");
+
+        if (!resultNode.isArray()) return filteredItems;
+
+        for (JsonNode node : resultNode) {
+            // Prometheus 返回值格式为 [timestamp, "value"]
+            JsonNode valueNode = node.path("value");
+            if (valueNode.size() < 2) continue;
+
+            double val = valueNode.get(1).asDouble();
+
+            // 阈值过滤
+            boolean isAbnormal = compare(val, operator, threshold);
+            if (isAbnormal) {
+                JsonNode metric = node.path("metric");
+                // 提取实例名 (处理 IP:Port 格式)
+                String rawInstance = metric.path(nameLabel).asText("unknown");
+                String instance = rawInstance.contains(":") ? rawInstance.split(":")[0] : rawInstance;
+
+                // 提取监控对象名 (如容器名)
+                String name = metric.path("name").asText("");
+                if (name.isEmpty()) {
+                    name = metric.path("pod").asText(""); // 尝试获取 pod 名
+                }
+
+                String displayName = name.isEmpty() ? instance : String.format("%s (%s)", name, instance);
+
+                // 格式化数值 (保留两位小数)
+                double formattedVal = Math.round(val * 100.0) / 100.0;
+
+                filteredItems.add(new MetricItem(displayName, instance, formattedVal));
+            }
+        }
+
+        // 按数值降序排序,让最危险的排在前面
+        filteredItems.sort((a, b) -> b.getValue().compareTo(a.getValue()));
+        return filteredItems;
+    }
+
+    public static boolean compare(double actualValue, String operator, double threshold) {
+        if (operator == null) {
+            return false;
+        }
+        switch (operator.toLowerCase()) {
+            case "gt": // Greater Than
+                return actualValue > threshold;
+            case "lt": // Less Than
+                return actualValue < threshold;
+            case "eq": // Equal
+                // 注意:Double直接用==可能有精度问题,但在监控指标中(如 0.0 或 1.0)通常可以接受
+                // 如果追求严谨,可以使用 Math.abs(actualValue - threshold) < 0.00001
+                return actualValue == threshold;
+            case "ne": // Not Equal
+                return actualValue != threshold;
+            case "ge": // Greater or Equal
+                return actualValue >= threshold;
+            case "le": // Less or Equal
+                return actualValue <= threshold;
+            default:
+                // 如果操作符未知,默认不报错,防止误报
+                return false;
+        }
+    }
+}

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusAsyncClient.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusAsyncClient.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.mon;
+package cn.reghao.devops.mgr.ops.mon.service;
 
 
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import lombok.extern.slf4j.Slf4j;
 import lombok.extern.slf4j.Slf4j;

+ 1 - 1
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusClientManager.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusClientManager.java

@@ -1,4 +1,4 @@
-package cn.reghao.devops.mgr.ops.mon;
+package cn.reghao.devops.mgr.ops.mon.service;
 
 
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepository;
 import cn.reghao.devops.mgr.admin.db.repository.SiteOptionRepository;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;
 import cn.reghao.devops.mgr.admin.model.po.SiteOption;

+ 2 - 292
mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/PrometheusService.java → mgr/src/main/java/cn/reghao/devops/mgr/ops/mon/service/PrometheusService.java

@@ -1,9 +1,6 @@
-package cn.reghao.devops.mgr.ops.mon;
+package cn.reghao.devops.mgr.ops.mon.service;
 
 
 import cn.reghao.devops.mgr.ops.mon.model.*;
 import cn.reghao.devops.mgr.ops.mon.model.*;
-import cn.reghao.devops.mgr.ops.mon.model.dto.DailyReportDTO;
-import cn.reghao.devops.mgr.ops.mon.model.dto.MetricItem;
-import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.jutil.jdk.converter.DateTimeConverter;
 import cn.reghao.jutil.jdk.converter.DateTimeConverter;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.JsonNode;
@@ -37,7 +34,7 @@ import java.util.stream.Collectors;
 @Slf4j
 @Slf4j
 @Service
 @Service
 public class PrometheusService {
 public class PrometheusService {
-    private ObjectMapper objectMapper;
+    private final ObjectMapper objectMapper;
     private final PrometheusClientManager prometheusClientManager;
     private final PrometheusClientManager prometheusClientManager;
     private final Cache<String, Object> cache;
     private final Cache<String, Object> cache;
 
 
@@ -855,291 +852,4 @@ public class PrometheusService {
             log.error("{}", e.getMessage());
             log.error("{}", e.getMessage());
         }
         }
     }
     }
-
-    public DailyReportDTO getDailyReportData() {
-        // 定义查询任务
-        Map<String, String> tasks = new HashMap<>();
-        // 使用 put 方法依次添加原有的 PromQL 任务
-        tasks.put("container_cpu", """
-        topk(10,\s
-          sum(
-            label_replace(
-              increase(container_cpu_cfs_throttled_seconds_total[24h]),
-              "instance", "$1", "instance", "([^:]+):.*"
-            )
-          ) by (name, instance)
-        )
-        """);
-
-        tasks.put("container_mem", """
-        topk(10,\s
-          avg by (name, instance) (
-            label_replace(
-              (container_memory_working_set_bytes{name!=""} / container_spec_memory_limit_bytes > 0) * 100,
-              "instance", "$1", "instance", "([^:]+):.*"
-            )
-          )
-        )
-        """);
-
-        tasks.put("node_disk", """
-        avg_over_time(
-            label_replace(
-              irate(node_disk_io_time_seconds_total[10m]),
-              "instance", "$1", "instance", "([^:]+):.*"
-            )[24h:1m]
-          )
-        """);
-
-        tasks.put("node_inode", """
-        topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)
-        """);
-
-        tasks.put("node_fd", """
-        topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)
-        """);
-
-        tasks.put("node_disk_usage", """
-        topk(10, (1 - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100)
-        """);
-
-        // 24h 内 TCP 正常连接的最大并发数
-        tasks.put("net_tcp_est_max", """
-        max_over_time(label_replace(node_netstat_Tcp_CurrEstab, "state", "ESTABLISHED", "", "")[24h:])
-        """);
-
-        // 24h 内 TCP 等待关闭连接的最大堆积数
-        tasks.put("net_tcp_tw_max", """
-        max_over_time(label_replace(node_sockstat_TCP_tw, "state", "TIME_WAIT", "", "")[24h:])
-        """);
-
-        // TCP 全连接队列溢出 (ListenOverflows) - 24h 增量
-        tasks.put("net_tcp_overflow", """
-        topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))
-        """);
-
-        // TCP 丢弃计数 (TcpExt_ListenDrops) - 24h 增量
-        tasks.put("net_tcp_drops", """
-        topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))
-        """);
-        tasks.put("node_oom", "increase(node_vmstat_oom_kill[24h])");
-        tasks.put("node_clock", "abs(node_timex_offset_seconds)");
-        tasks.put("node_ro_fs", "node_filesystem_readonly{mountpoint='/'}");
-        tasks.put("node_zombie", "node_processes_state{state='Z'}");
-        // 逻辑:计算 node 级别每秒上下文切换次数
-        tasks.put("node_context_switch", """
-                topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))
-                """);
-
-        List<InspectionTask> allTasks = getAllTasks();
-        Map<String, InspectionTask> taskMap = allTasks.stream()
-                .collect(Collectors.toMap(
-                        InspectionTask::getTaskKey,
-                        task -> task
-                ));
-
-
-        return prometheusClientManager.getClient()
-                .fetchAllMetrics1(taskMap)
-                .thenApply(result -> {
-                    return processResults1(taskMap, result);
-                }) // 这里 processResults1 需要改为返回 DailyReportDTO
-                .join();
-    }
-
-    // 定义全局风险阈值
-    private static final double MEM_RISK_THRESHOLD = 30.0;      // 内存超过85%需注意
-    private static final double CPU_THROTTLE_THRESHOLD = 1000.0; // 24h节流超过10秒需注意
-    private static final double DISK_IO_THRESHOLD = 1.0;      // IO Wait超过50ms需注意
-    private static final double INODE_RISK_THRESHOLD = 10.0;    // Inode 超过 80% 需注意
-    private static final double FD_RISK_THRESHOLD = 10.0; // 句柄占用超过 80% 需注意
-    private static final double DISK_USAGE_THRESHOLD = 80.0; // 磁盘空间超过 85% 需注意
-    private static final double TCP_EST_THRESHOLD = 5000.0; // 单机并发超过5000需注意(根据业务调整)
-    private static final double TCP_TW_THRESHOLD = 50.0; // TIME_WAIT 超过 5000 需注意
-    private static final double NET_DROP_THRESHOLD = 1.0;   // 只要有 1 个溢出或丢弃就需注意
-    private static final double OOM_KILL_THRESHOLD = 1.0;      // 24h 内发生过 OOM
-    private static final double CLOCK_SKEW_THRESHOLD = 0.5;    // 时钟偏移超过 500ms
-    private static final double READONLY_FS_THRESHOLD = 1.0;   // 存在只读文件系统
-    private static final double ZOMBIE_PROCS_THRESHOLD = 5.0;  // 僵尸进程过多
-    // 假设是 8 核机器,总切换数超过 50,000 需注意
-    private static final double CONTEXT_SWITCH_THRESHOLD = 1000.0;
-
-    public static List<InspectionTask> getAllTasks() {
-        List<InspectionTask> tasks = new ArrayList<>();
-        return tasks;
-    }
-
-    public DailyReportDTO processResults1(Map<String, InspectionTask> taskMap, Map<String, String> results) {
-        DailyReportDTO report = new DailyReportDTO();
-        try {
-            // 1. 解析并筛选 CPU 节流 (只保留显著受限的容器)
-            report.setCpuThrottled(
-                    parseAndFilter(results.get("container_cpu"), "name", taskMap.get("container_cpu").getThreshold())
-            );
-            report.getAdvices().put("container_cpu", taskMap.get("container_cpu").getAdvice());
-
-            // 2. 解析并筛选 内存风险 (只保留接近 Limit 的容器)
-            report.setMemRisk(
-                    parseAndFilter(results.get("container_mem"), "name", taskMap.get("container_mem").getThreshold())
-            );
-            report.getAdvices().put("container_mem", taskMap.get("container_mem").getAdvice());
-
-            // 3. 解析并筛选 磁盘 IO (只保留高负载节点)
-            report.setDiskIo(
-                    parseAndFilter(results.get("node_disk"), "instance", taskMap.get("node_disk").getThreshold())
-            );
-            report.getAdvices().put("node_disk", taskMap.get("node_disk").getAdvice());
-
-            report.setInodeRisk(
-                    parseAndFilter(results.get("node_inode"), "instance", taskMap.get("node_inode").getThreshold())
-            );
-            report.getAdvices().put("node_inode", taskMap.get("node_inode").getAdvice());
-
-            // 2. 新增:解析并筛选文件句柄风险
-            report.setFdRisk(
-                    parseAndFilter(results.get("node_fd"), "instance", taskMap.get("node_fd").getThreshold())
-            );
-            report.getAdvices().put("node_fd", taskMap.get("node_fd").getAdvice());
-
-            report.setDiskUsageRisk(
-                    parseAndFilter(results.get("node_disk_usage"), "instance", taskMap.get("node_disk_usage").getThreshold())
-            );
-            report.getAdvices().put("node_disk_usage", taskMap.get("node_disk_usage").getAdvice());
-
-            // 解析 24h TCP EST 峰值
-            report.setNetEstMax(
-                    parseAndFilter(results.get("net_tcp_est_max"), "state", taskMap.get("net_tcp_est_max").getThreshold())
-            );
-            report.getAdvices().put("net_tcp_est_max", taskMap.get("net_tcp_est_max").getAdvice());
-
-            // 解析并筛选 24h TCP TIME_WAIT 风险
-            report.setNetTwMax(
-                    parseAndFilter(results.get("net_tcp_tw_max"), "state", taskMap.get("net_tcp_tw_max").getThreshold())
-            );
-            report.getAdvices().put("net_tcp_tw_max", taskMap.get("net_tcp_tw_max").getAdvice());
-
-            // 筛选全连接队列溢出和丢弃
-            report.setNetOverflows(
-                    parseAndFilter(results.get("net_tcp_overflow"), "instance", taskMap.get("net_tcp_overflow").getThreshold())
-            );
-            report.getAdvices().put("net_tcp_overflow", taskMap.get("net_tcp_overflow").getAdvice());
-
-            report.setNetDrops(
-                    parseAndFilter(results.get("net_tcp_drops"), "instance", taskMap.get("net_tcp_drops").getThreshold())
-            );
-            report.getAdvices().put("net_tcp_drops", taskMap.get("net_tcp_drops").getAdvice());
-
-            // 2. 解析并筛选上下文切换风险
-            report.setContextSwitchRisk(
-                    parseAndFilter(results.get("node_context_switch"), "instance", taskMap.get("node_context_switch").getThreshold())
-            );
-            report.getAdvices().put("node_context_switch", taskMap.get("node_context_switch").getAdvice());
-
-            // 2. OOM Kill 事件 (24h 增量)
-            report.setOomEvents(
-                    parseAndFilter(results.get("node_oom"), "instance", taskMap.get("node_oom").getThreshold())
-            );
-            report.getAdvices().put("node_oom", taskMap.get("node_oom").getAdvice());
-
-            // 3. 时钟偏移 (绝对值)
-            report.setClockSkewRisk(
-                    parseAndFilter(results.get("node_clock"), "instance", taskMap.get("node_clock").getThreshold())
-            );
-            report.getAdvices().put("node_clock", taskMap.get("node_clock").getAdvice());
-
-            // 4. 只读文件系统 (状态值)
-            report.setReadOnlyFsRisk(
-                    parseAndFilter(results.get("node_ro_fs"), "instance", taskMap.get("node_ro_fs").getThreshold())
-            );
-            report.getAdvices().put("node_ro_fs", taskMap.get("node_ro_fs").getAdvice());
-
-            // 5. 僵尸进程/阻塞进程
-            report.setZombieRisk(
-                    parseAndFilter(results.get("node_zombie"), "instance", taskMap.get("node_zombie").getThreshold())
-            );
-            report.getAdvices().put("node_zombie", taskMap.get("node_zombie").getAdvice());
-
-            // 计算汇总状态
-            long totalIssues = report.getCpuThrottled().size() +
-                    report.getMemRisk().size() +
-                    report.getDiskIo().size() +
-                    report.getInodeRisk().size() +
-                    report.getFdRisk().size() +
-                    report.getDiskUsageRisk().size() +
-                    report.getNetEstMax().size() +
-                    report.getNetTwMax().size() +
-                    report.getNetOverflows().size() +
-                    report.getNetDrops().size() +
-                    report.getContextSwitchRisk().size() +
-                    report.getOomEvents().size() +
-                    report.getClockSkewRisk().size() +
-                    report.getReadOnlyFsRisk().size() +
-                    report.getZombieRisk().size();
-
-            report.setStatusSummary(totalIssues > 0 ? "发现 " + totalIssues + " 项待处理异常" : "所有指标正常");
-        } catch (Exception e) {
-            log.error("{}", e.getMessage());
-        }
-
-        return report;
-    }
-
-    private List<MetricItem> parseAndFilter(String json, String nameLabel, double threshold) throws Exception {
-        List<MetricItem> filteredItems = new ArrayList<>();
-        JsonNode resultNode = objectMapper.readTree(json).path("data").path("result");
-
-        if (resultNode.isArray()) {
-            for (JsonNode node : resultNode) {
-                double val = node.path("value").get(1).asDouble();
-
-                // 核心逻辑:只有超过阈值的才加入报告
-                if (val >= threshold) {
-                    String name = node.path("metric").path(nameLabel).asText("unknown");
-                    String instance = node.path("metric").path("instance").asText("unknown");
-                    String name1 = String.format("%s_%s", instance, name);
-                    String job = node.path("metric").path("job").asText("unknown");
-                    if ("node-exporter".equals(job)) {
-                        name1 = instance.split(":")[0];
-                        instance = name1;
-                    }
-                    filteredItems.add(new MetricItem(name1, instance, Math.round(val * 100.0) / 100.0));
-                }
-            }
-        }
-        // 按数值倒序排列,最重要的放在最上面
-        filteredItems.sort((a, b) -> b.getValue().compareTo(a.getValue()));
-        return filteredItems;
-    }
-
-    private List<MetricItem> parsePrometheusJson(String json, String nameLabel) throws Exception {
-        List<MetricItem> items = new ArrayList<>();
-        JsonNode root = objectMapper.readTree(json);
-        JsonNode resultNode = root.path("data").path("result");
-
-        if (resultNode.isArray()) {
-            for (JsonNode node : resultNode) {
-                JsonNode metric = node.path("metric");
-                // Prometheus 返回的 value 是 [timestamp, "value"]
-                JsonNode valueArray = node.path("value");
-                if (valueArray.isMissingNode()) {
-                    // 如果是 range query, 取 values 数组的最后一个
-                    valueArray = node.path("values").get(node.path("values").size() - 1);
-                }
-
-                String name = metric.path(nameLabel).asText("unknown");
-                String instance = metric.path("instance").asText("unknown");
-                Double val = valueArray.get(1).asDouble();
-                items.add(new MetricItem(name, instance, Math.round(val * 100.0) / 100.0));
-            }
-        }
-        return items;
-    }
-
-    public static void main(String[] args) throws Exception {
-        //PrometheusService prometheusService = new PrometheusService(appProperties, null);
-        /*prometheusService.generateContainerReport1();
-        prometheusService.generateContainerReport();*/
-        //prometheusService.generatePillarReport();
-        //prometheusService.generateDailyReport();
-    }
 }
 }

+ 90 - 73
mgr/src/test/java/devops/AppConfigTest.java

@@ -10,6 +10,7 @@ import cn.reghao.devops.mgr.ops.builder.model.po.AppBuilding;
 import cn.reghao.devops.mgr.ops.builder.model.po.AppDeploying;
 import cn.reghao.devops.mgr.ops.builder.model.po.AppDeploying;
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.devops.mgr.ops.mon.model.po.InspectionTask;
 import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
 import cn.reghao.devops.mgr.ops.mon.repository.InspectionTaskRepository;
+import cn.reghao.devops.mgr.ops.mon.service.PrometheusService;
 import lombok.extern.slf4j.Slf4j;
 import lombok.extern.slf4j.Slf4j;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Test;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -66,108 +67,124 @@ public class AppConfigTest {
 
 
     @Autowired
     @Autowired
     InspectionTaskRepository inspectionTaskRepository;
     InspectionTaskRepository inspectionTaskRepository;
+    @Autowired
+    PrometheusService prometheusService;
     @Test
     @Test
     public void initInspectionTask() {
     public void initInspectionTask() {
-        List<InspectionTask> list = getAllTasks();
-        inspectionTaskRepository.saveAll(list);
-    }
-
-    public List<InspectionTask> getAllTasks() {
         List<InspectionTask> tasks = new ArrayList<>();
         List<InspectionTask> tasks = new ArrayList<>();
 
 
-        // --- 类别 A: 计算资源与调度 (Compute) ---
-        tasks.add(new InspectionTask("cpuThrottled", "计算资源", "容器CPU节流",
-                "topk(10, sum(label_replace(increase(container_cpu_cfs_throttled_seconds_total[24h]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")) by (name, instance))",
-                1000.0, "建议:调大容器CPU Limit或优化线程池。"));
+        // --- 类别 A: 计算资源 (Compute) ---
+        // 逻辑:24H 累计增量
+        tasks.add(new InspectionTask("cpuThrottled", "计算资源", "CPU 性能节流", "24H Throttled",
+                "topk(10, sum(increase(container_cpu_cfs_throttled_seconds_total[24h])) by (name, instance))",
+                "INCREASE", "gt", 1000.0, "建议:调大容器CPU Limit或优化线程池。", "el-icon-cpu", "warning-text", "warning", "#e6a23c", "s", false, "无节流限制"));
 
 
-        tasks.add(new InspectionTask("nodeLoadRisk", "计算资源", "节点CPU负载饱和度",
-                "node_load5 / count by (instance) (node_cpu_seconds_total{mode=\"idle\"})",
-                1.2, "建议:Load高于核心数,说明进程正在排队,请检查系统瓶颈。"));
+        // 逻辑:24H 平均负载
+        tasks.add(new InspectionTask("nodeLoadRisk", "计算资源", "节点负载饱和度", "24H Avg Load",
+                "avg_over_time((node_load5 / count by (instance) (node_cpu_seconds_total{mode=\"idle\"}))[24h:5m])",
+                "AVG", "gt", 1.2, "建议:全天平均Load高于核心数,说明进程长期排队,请检查系统瓶颈。", "el-icon-odometer", "warning-text", "warning", "#e6a23c", "", false, "系统负载极低"));
 
 
-        tasks.add(new InspectionTask("contextSwitchRisk", "计算资源", "上下文切换风险",
-                "topk(10, avg_over_time(rate(node_context_switches_total[5m])[24h:1m]))",
-                50000.0, "建议:减少线程竞争或优化锁逻辑。"));
+        // 逻辑:24H 平均切换频率
+        tasks.add(new InspectionTask("contextSwitchRisk", "计算资源", "上下文切换", "24H Avg CS",
+                "avg_over_time(rate(node_context_switches_total[5m])[24h:1m])",
+                "AVG", "gt", 50000.0, "建议:减少线程竞争或优化锁逻辑。", "el-icon-refresh", "warning-text", "warning", "#e6a23c", "次/s", false, "调度平稳"));
 
 
-        tasks.add(new InspectionTask("zombieRisk", "计算资源", "僵尸进程风险",
+        // 逻辑:当前瞬时状态
+        tasks.add(new InspectionTask("zombieRisk", "计算资源", "僵尸进程", "Z States",
                 "node_processes_state{state='Z'}",
                 "node_processes_state{state='Z'}",
-                5.0, "建议:修复父进程回收逻辑,防止PID泄露。"));
+                "INSTANT", "gt", 5.0, "建议:修复父进程回收逻辑,防止PID泄露。", "el-icon-stopwatch", "warning-text", "warning", "#e6a23c", "个", false, "无僵尸进程"));
+
+        // 逻辑:24H 平均 CPU 使用率
+        tasks.add(new InspectionTask("nodeCpuUsage", "计算资源", "节点CPU利用率", "24H Avg Usage",
+                "avg_over_time((100 - (irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * 100))[24h:5m])",
+                "AVG", "gt", 85.0, "建议:全天平均CPU占用过高,请检查是否有异常进程抢占资源。", "el-icon-monitor", "danger-text", "danger", "#f56c6c", "%", true, "利用率正常"));
 
 
-        // --- 类别 B: 内存与运行时 (Memory) ---
-        tasks.add(new InspectionTask("memRisk", "内存指标", "容器内存风险",
-                "topk(10, avg by (name, instance) (label_replace((container_memory_working_set_bytes{name!=\"\"} / container_spec_memory_limit_bytes > 0) * 100, \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")))",
-                85.0, "建议:检查内存泄露或增加内存配额。"));
+        // 逻辑:24H 平均 IO 等待
+        tasks.add(new InspectionTask("cpuWaitIo", "计算资源", "CPU I/O等待频率", "24H Avg iowait",
+                "avg_over_time((irate(node_cpu_seconds_total{mode=\"iowait\"}[5m]) * 100)[24h:5m])",
+                "AVG", "gt", 10.0, "建议:全天IO等待比例偏高,请排查高频写日志或慢磁盘问题。", "el-icon-time", "warning-text", "warning", "#e6a23c", "%", true, "磁盘 I/O 无延迟"));
 
 
-        tasks.add(new InspectionTask("memPsiRisk", "内存指标", "内存紧缩压力(PSI)",
-                "rate(node_pressure_memory_some_seconds_total[5m])",
-                0.1, "建议:系统正在频繁页面置换,请检查高内存占用进程。"));
 
 
-        tasks.add(new InspectionTask("oomEvents", "内存指标", "OOM事件",
+        // --- 类别 B: 内存指标 (Memory) ---
+        // 逻辑:当前水位
+        tasks.add(new InspectionTask("nodeMemUsage", "内存指标", "节点内存利用率", "Physical Mem",
+                "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
+                "INSTANT", "gt", 90.0, "建议:当前物理内存严重不足,可能会触发整机级别的OOM。", "el-icon-set-up", "danger-text", "danger", "#f56c6c", "%", true, "内存充足"));
+
+        tasks.add(new InspectionTask("memRisk", "内存指标", "容器内存水位线", "> 85% Usage",
+                "topk(10, (container_memory_working_set_bytes{name!=\"\"} / container_spec_memory_limit_bytes) * 100)",
+                "INSTANT", "gt", 85.0, "建议:容器接近内存限制,请检查泄露或增加配额。", "el-icon-box", "danger-text", "danger", "#f56c6c", "%", true, "水位正常"));
+
+        // 逻辑:24H 累计增量
+        tasks.add(new InspectionTask("oomEvents", "内存指标", "OOM Kill 事件", "24H Kernel OOM",
                 "increase(node_vmstat_oom_kill[24h])",
                 "increase(node_vmstat_oom_kill[24h])",
-                1.0, "建议:分析dmesg定位被杀进程,优化内存分配。"));
+                "INCREASE", "gt", 1.0, "建议:过去24小时发生过OOM,请分析dmesg定位被杀进程。", "el-icon-warning", "danger-text", "danger", "#f56c6c", "次", false, "无内存强杀"));
 
 
-        tasks.add(new InspectionTask("pidLimitRisk", "内存指标", "PID进程数限制",
+        tasks.add(new InspectionTask("pidLimitRisk", "内存指标", "PID 进程数限制", "Forks / Max",
                 "(node_forks_total / node_processes_max) * 100",
                 "(node_forks_total / node_processes_max) * 100",
-                80.0, "建议:PID快用完了,请检查是否存在大量短时进程或进程泄露。"));
+                "INSTANT", "gt", 80.0, "建议:当前PID池占用过高,请检查进程泄露。", "el-icon-connection", "primary-text", "primary", "#409eff", "%", true, "池空间充足"));
 
 
-        // --- 类别 C: 存储与文件系统 (Storage) ---
-        tasks.add(new InspectionTask("diskUsageRisk", "存储文件", "磁盘空间风险",
-                "topk(10, (1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100)",
-                80.0, "建议:清理日志或扩容磁盘。"));
 
 
-        tasks.add(new InspectionTask("diskPredictRisk", "存储文件", "磁盘存满预测(24h)",
-                "predict_linear(node_filesystem_avail_bytes{mountpoint=\"/\"}[6h], 24 * 3600) < 0",
-                0.0, "建议:磁盘预计在24小时内写满,请立即处理。"));
+        // --- 类别 C: 存储文件 (Storage) ---
+        // 逻辑:当前瞬时水位
+        tasks.add(new InspectionTask("diskUsageRisk", "存储文件", "根分区空间", "( / ) Usage",
+                "(1 - node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"}) * 100",
+                "INSTANT", "gt", 80.0, "建议:当前磁盘占用过高,请清理日志或扩容。", "el-icon-pie-chart", "danger-text", "danger", "#f56c6c", "%", true, "空间充沛"));
 
 
-        tasks.add(new InspectionTask("inodeRisk", "存储文件", "Inode使用率",
-                "topk(10, (1 - node_filesystem_files_free / node_filesystem_files) * 100)",
-                80.0, "建议:清理大量极小文件(如临时日志)。"));
+        // 逻辑:区间趋势预测
+        tasks.add(new InspectionTask("diskPredictRisk", "存储文件", "磁盘存满预测", "Predict 24H",
+                "predict_linear(node_filesystem_avail_bytes{mountpoint=\"/\"}[6h], 24 * 3600)",
+                "INSTANT", "lt", 0.0, "建议:根据最近6h趋势,磁盘预计在24小时内写满,请立即处理。", "el-icon-magic-stick", "danger-text", "danger", "#f56c6c", "", false, "无写满风险"));
 
 
-        tasks.add(new InspectionTask("diskIo", "存储文件", "磁盘IO负载",
-                "avg_over_time(label_replace(irate(node_disk_io_time_seconds_total[10m]), \"instance\", \"$1\", \"instance\", \"([^:]+):.*\")[24h:1m])",
-                1.0, "建议:排查高I/O进程,检查存储后端健康度。"));
+        tasks.add(new InspectionTask("inodeRisk", "存储文件", "inode 使用率", "Index Nodes",
+                "(1 - node_filesystem_files_free / node_filesystem_files) * 100",
+                "INSTANT", "gt", 80.0, "建议:inode即将耗尽,请清理大量小文件。", "el-icon-files", "warning-text", "warning", "#e6a23c", "%", true, "索引节点充足"));
 
 
-        tasks.add(new InspectionTask("fdRisk", "存储文件", "文件句柄风险",
-                "topk(10, (node_filefd_allocated / node_filefd_maximum) * 100)",
-                80.0, "建议:检查FD泄露,必要时调大ulimit。"));
+        // 逻辑:24H 平均响应
+        tasks.add(new InspectionTask("diskIo", "存储文件", "磁盘 IO 响应", "24H Avg Wait",
+                "avg_over_time(irate(node_disk_io_time_seconds_total[10m])[24h:5m])",
+                "AVG", "gt", 1.0, "建议:过去24小时磁盘IO平均响应过慢,检查存储后端健康度。", "el-icon-receiving", "warning-text", "warning", "#e6a23c", "ms", false, "响应极快"));
 
 
-        tasks.add(new InspectionTask("readOnlyFsRisk", "存储文件", "只读分区风险",
+        // 逻辑:当前硬状态
+        tasks.add(new InspectionTask("readOnlyFsRisk", "存储文件", "文件系统只读", "RO Status",
                 "node_filesystem_readonly{mountpoint='/'}",
                 "node_filesystem_readonly{mountpoint='/'}",
-                1.0, "建议:硬件故障触发只读挂载,请检修硬件。"));
+                "INSTANT", "ne", 0.0, "建议:检测到只读挂载,可能存在硬件故障,请立即检修。", "el-icon-lock", "danger-text", "danger", "#f56c6c", "", false, "挂载正常"));
+
 
 
-        // --- 类别 D: 网络协议栈 (Network) ---
-        tasks.add(new InspectionTask("conntrackRisk", "网络协议", "连接跟踪表(Conntrack)",
-                "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) * 100",
-                80.0, "建议:连接表快满了,请优化内核参数或清理连接。"));
+        // --- 类别 D: 网络协议 (Network) ---
+        // 逻辑:24H 峰值
+        tasks.add(new InspectionTask("netEstMax", "网络协议", "TCP EST 数量", "24H Max EST",
+                "max_over_time(node_netstat_Tcp_CurrEstab[24h])",
+                "MAX", "gt", 5000.0, "建议:过去24小时连接数曾达峰值,请评估业务并发及连接泄露。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "未超限"));
 
 
-        tasks.add(new InspectionTask("netEstMax", "网络协议", "TCP连接峰值",
-                "max_over_time(label_replace(node_netstat_Tcp_CurrEstab, \"state\", \"ESTABLISHED\", \"\", \"\")[24h:])",
-                5000.0, "建议:评估业务并发,检查长连接泄露。"));
+        tasks.add(new InspectionTask("netTwMax", "网络协议", "TCP TIME_WAIT", "24H Max TW",
+                "max_over_time(node_sockstat_TCP_tw[24h])",
+                "MAX", "gt", 5000.0, "建议:全天TW连接数较高,请开启tw_reuse或检查客户端连接池。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "未超限"));
 
 
-        tasks.add(new InspectionTask("netTwMax", "网络协议", "TIME_WAIT堆积",
-                "max_over_time(label_replace(node_sockstat_TCP_tw, \"state\", \"TIME_WAIT\", \"\", \"\")[24h:])",
-                5000.0, "建议:开启tw_reuse或检查压测工具。"));
+        // 逻辑:24H 累计增量
+        tasks.add(new InspectionTask("netOverflows", "网络协议", "TCP 队列溢出", "24H Listen Overflow",
+                "increase(node_netstat_TcpExt_ListenOverflows[24h])",
+                "INCREASE", "gt", 1.0, "建议:全天曾出现溢出,应用处理过慢,请调大backlog或优化代码。", "el-icon-warning-outline", "danger-text", "danger", "#f56c6c", "次", false, "无溢出"));
 
 
-        tasks.add(new InspectionTask("netBandwidthRisk", "网络协议", "网卡带宽利用率",
-                "((rate(node_network_receive_bytes_total[5m]) + rate(node_network_transmit_bytes_total[5m])) * 8) / node_network_speed_bytes > 0",
-                0.8, "建议:带宽接近极限,检查是否有大文件同步。"));
+        tasks.add(new InspectionTask("netDrops", "网络协议", "TCP 数据包丢弃", "24H Packet Drops",
+                "increase(node_netstat_TcpExt_ListenDrops[24h])",
+                "INCREASE", "gt", 1.0, "建议:全天曾有数据包丢弃,请检查网络负载或内核缓冲区。", "el-icon-circle-close", "danger-text", "danger", "#f56c6c", "次", false, "无丢弃"));
 
 
-        tasks.add(new InspectionTask("tcpRetransRisk", "网络协议", "TCP重传率",
-                "rate(node_netstat_Tcp_RetransSegs[5m]) / rate(node_netstat_Tcp_OutSegs[5m]) * 100",
-                1.0, "建议:重传率偏高,检查链路质量或对端压力。"));
 
 
-        tasks.add(new InspectionTask("netOverflows", "网络协议", "全连接队列溢出",
-                "topk(10, increase(node_netstat_TcpExt_ListenOverflows[24h]))",
-                1.0, "建议:应用处理过慢,请调大backlog或优化逻辑。"));
+        // --- 类别 G: 基础环境 & 监控存活 ---
+        tasks.add(new InspectionTask("nodeExporterAlive", "基础环境", "NodeExporter 状态", "24H Availability",
+                "min_over_time(up{job=\"node-exporter\"}[24h])",
+                "MIN", "eq", 0.0, "建议:检测到过去24小时内采集器曾离线,请检查宿主机服务稳定性。", "el-icon-monitor", "danger-text", "danger", "#f56c6c", "离线", false, "全天运行正常"));
 
 
-        tasks.add(new InspectionTask("netDrops", "网络协议", "TCP丢弃",
-                "topk(10, increase(node_netstat_TcpExt_ListenDrops[24h]))",
-                1.0, "建议:检查网络负载或内核缓冲区大小。"));
+        tasks.add(new InspectionTask("cadvisorAlive", "基础环境", "cAdvisor 状态", "24H Availability",
+                "min_over_time(up{job=\"cadvisor\"}[24h])",
+                "MIN", "eq", 0.0, "建议:检测到过去24小时内容器采集器曾离线,请检查 Docker 服务状态。", "el-icon-ship", "danger-text", "danger", "#f56c6c", "离线", false, "全天运行正常"));
 
 
-        // --- 类别 E: 基础环境 (Infrastructure) ---
-        tasks.add(new InspectionTask("clockSkewRisk", "基础环境", "时钟偏移风险",
-                "abs(node_timex_offset_seconds)",
-                0.5, "建议:检查NTP/Chrony同步状态。"));
+        // 逻辑:24H 绝对值最大偏差
+        tasks.add(new InspectionTask("clockSkewRisk", "基础环境", "时钟偏移 (NTP)", "24H Max Offset",
+                "max_over_time(abs(node_timex_offset_seconds)[24h])",
+                "MAX", "gt", 0.5, "建议:全天最大时钟偏移超标,请检查NTP/Chrony同步状态。", "el-icon-alarm-clock", "warning-text", "warning", "#e6a23c", "s", false, "同步精确"));
 
 
-        return tasks;
+        inspectionTaskRepository.saveAll(tasks);
     }
     }
 }
 }

+ 1 - 50
mgr/src/test/java/devops/DockerTest.java

@@ -24,43 +24,6 @@ public class DockerTest {
         rootLogger.setLevel(Level.INFO);
         rootLogger.setLevel(Level.INFO);
     }
     }
 
 
-    @Test
-    public void dockerBuildTest() throws Exception {
-        setLogLevel();
-        DockerImpl docker = new DockerImpl();
-        /*String repoTag = "file:12345678";
-        String compileHome = "/home/reghao/code/aha/tnb/file/file-service";
-        docker.build(repoTag, compileHome);*/
-
-        String image = "node:16.20.2-buster-slim";
-        String sourceCodeDir = "/home/reghao/Downloads/0/devopsapp";
-        sourceCodeDir = "/home/reghao/Downloads/0/iquizoo.admin/";
-        String nodeModulesDir = "/home/reghao/Downloads/0/node_modules";
-        String buildCmd = "npm run build";
-        String buildCmd1 = "npm i && npm run build";
-        String buildCmd2 = "npm run test";
-        String buildCmd3 = "cd /app && npm --registry http://registry.npm.taobao.org i && npm run uat";
-        String buildCmd4 = "npm --registry http://registry.npm.taobao.org i";
-
-        DockerContainerConfig config = new DockerContainerConfig(image);
-        config.setVolumes(new DockerContainerConfig.Volumes());
-        config.setCmd(List.of("sh", "-c", buildCmd3));
-
-        Map<String, String> map = config.getVolumes().getMap();
-        map.put("/node_modules", nodeModulesDir);
-        map.put("/app", sourceCodeDir);
-
-        long start = 0;
-        start = System.currentTimeMillis();
-        docker.runAndRm(config);
-        log.info("cost {} ms", System.currentTimeMillis()-start);
-
-        /*config.setCmd(List.of("sh", "-c", "cd /front && " + buildCmd1));
-        start = System.currentTimeMillis();
-        docker.runAndRm(config);
-        log.info("cost {} ms", System.currentTimeMillis()-start);*/
-    }
-
     /**
     /**
      * 解析 docker 容器启动参数
      * 解析 docker 容器启动参数
      *
      *
@@ -71,7 +34,7 @@ public class DockerTest {
     @Test
     @Test
     public void parseContainerArgsTest() {
     public void parseContainerArgsTest() {
         String image = "node:16.20.2-buster-slim";
         String image = "node:16.20.2-buster-slim";
-        DockerContainerConfig config = new DockerContainerConfig(image);
+        DockerContainerConfig config = new DockerContainerConfig();
         config.setVolumes(new DockerContainerConfig.Volumes());
         config.setVolumes(new DockerContainerConfig.Volumes());
         //config.getHostConfig().setInit(true);
         //config.getHostConfig().setInit(true);
 
 
@@ -82,16 +45,4 @@ public class DockerTest {
         DockerContainerConfig config1 = JsonConverter.jsonToObject(json1, DockerContainerConfig.class);
         DockerContainerConfig config1 = JsonConverter.jsonToObject(json1, DockerContainerConfig.class);
         System.out.println();
         System.out.println();
     }
     }
-
-    @Test
-    public void dockerConfigTest() {
-        DockerContainerConfig config = new DockerContainerConfig();
-
-        DockerContainerConfig.Volumes volumes = new DockerContainerConfig.Volumes();
-        volumes.getMap().put("/app1", "/app1");
-        volumes.getMap().put("/app2", "/app2");
-        config.setVolumes(volumes);
-
-        System.out.println(JsonConverter.objectToJson(config));
-    }
 }
 }