|
|
@@ -1,11 +1,7 @@
|
|
|
package cn.reghao.devops.mgr.ops.srv.mon;
|
|
|
|
|
|
import cn.reghao.devops.mgr.config.AppProperties;
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.dto.ContainerHealthReport;
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.model.AnalysisResult;
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.model.CpuThresholdConfig;
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.model.HostGroup;
|
|
|
-import cn.reghao.devops.mgr.ops.srv.mon.model.MetricRecord;
|
|
|
+import cn.reghao.devops.mgr.ops.srv.mon.model.*;
|
|
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
|
import com.fasterxml.jackson.databind.JsonNode;
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
@@ -13,7 +9,6 @@ import com.github.benmanes.caffeine.cache.Cache;
|
|
|
import freemarker.template.Configuration;
|
|
|
import freemarker.template.Template;
|
|
|
import freemarker.template.TemplateException;
|
|
|
-import lombok.Data;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
import org.springframework.ui.freemarker.FreeMarkerTemplateUtils;
|
|
|
@@ -86,102 +81,6 @@ public class PrometheusService {
|
|
|
public void generateDailyReport() {
|
|
|
// 定义查询任务
|
|
|
Map<String, String> tasks = Map.of(
|
|
|
- "container_count", """
|
|
|
- avg_over_time((1 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))[24h:1m]) * 100
|
|
|
- """,
|
|
|
- "top_cpu_containers", """
|
|
|
- max_over_time((1 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))[24h:1m]) * 100
|
|
|
- """,
|
|
|
- "top_cpu_containers1", """
|
|
|
- max_over_time((1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)[24h:1m]) * 100
|
|
|
- """,
|
|
|
- "top_cpu_containers2", """
|
|
|
- max_over_time(rate(container_cpu_usage_seconds_total{name!=""}[5m])[24h:1m])
|
|
|
- """,
|
|
|
- "top_cpu_containers3", """
|
|
|
- max_over_time(container_memory_usage_bytes{name!=""}[24h:1m]) / 1024 / 1024
|
|
|
- """,
|
|
|
- "top_cpu_containers4", """
|
|
|
- increase(container_oom_events_total[24h]) > 0
|
|
|
- """
|
|
|
- );
|
|
|
-
|
|
|
- // 异步执行
|
|
|
- promClient.fetchAllMetrics(tasks).thenAccept(results -> {
|
|
|
- //processResults(results);
|
|
|
- System.out.println("所有数据采集完成,开始渲染报表...");
|
|
|
- }).join(); // 如果是在定时任务主线程,可以用 join 等待完成
|
|
|
- }
|
|
|
-
|
|
|
- private OperationReportDTO processResults(Map<String, String> results) {
|
|
|
- OperationReportDTO report = new OperationReportDTO();
|
|
|
- Map<String, HostInfo> hostMap = new HashMap<>();
|
|
|
- try {
|
|
|
- // 1. 解析 CPU 指标
|
|
|
- if (results.containsKey("node_cpu")) {
|
|
|
- parseToHostMap(results.get("node_cpu"), hostMap, "cpu");
|
|
|
- }
|
|
|
-
|
|
|
- // 2. 填充内存数据
|
|
|
- if (results.containsKey("node_mem")) {
|
|
|
- parseToHostMap(results.get("node_mem"), hostMap, "mem");
|
|
|
- }
|
|
|
-
|
|
|
- // 3. 填充容器数量
|
|
|
- if (results.containsKey("container_count")) {
|
|
|
- parseToHostMap(results.get("container_count"), hostMap, "count");
|
|
|
- }
|
|
|
-
|
|
|
- // 4. 解析 Top 5 容器排行 (注意这里 Key 的对齐)
|
|
|
- // 修正:使用 generateDailyReport 中定义的 "top_cpu_containers"
|
|
|
- String topCpuJson = results.get("top_cpu_containers");
|
|
|
- if (topCpuJson != null) {
|
|
|
- List<ContainerInfo> topContainers = parseTopContainers(topCpuJson);
|
|
|
- report.setTopContainers(topContainers);
|
|
|
- }
|
|
|
-
|
|
|
- report.setHostList(new ArrayList<>(hostMap.values()));
|
|
|
- // 这里建议加上容器总数统计
|
|
|
- report.setContainerCount(report.getTopContainers() != null ? report.getTopContainers().size() : 0);
|
|
|
-
|
|
|
- report.setStartTime(LocalDateTime.now().minusDays(1).format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
|
|
|
- report.setEndTime(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
|
|
|
-
|
|
|
- // 解析趋势数据
|
|
|
- if (results.containsKey("cpu_trend")) {
|
|
|
- parseTrendData(results.get("cpu_trend"), report);
|
|
|
- }
|
|
|
- } catch (Exception e) {
|
|
|
- }
|
|
|
-
|
|
|
- Map<String, Object> root = new HashMap<>();
|
|
|
- root.put("hostGroupList", report);
|
|
|
-
|
|
|
- /*try {
|
|
|
- // 6. 渲染最终的复合模板
|
|
|
- String templatePath = "daily_report.ftl";
|
|
|
- String htmlContent = renderHtml(templatePath, root);
|
|
|
- Path outputPath = Paths.get("/home/reghao/Downloads", "daily_report_" + LocalDate.now() + ".html");
|
|
|
- if (Files.notExists(outputPath.getParent())) {
|
|
|
- Files.createDirectories(outputPath.getParent());
|
|
|
- }
|
|
|
- Files.writeString(outputPath, htmlContent, StandardCharsets.UTF_8);
|
|
|
- System.out.println("✅ 报表已成功保存至: " + outputPath.toAbsolutePath());
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("{}", e.getMessage());
|
|
|
- }*/
|
|
|
-
|
|
|
- return report;
|
|
|
- }
|
|
|
-
|
|
|
- public void generateDailyReport1() {
|
|
|
- // 定义查询任务
|
|
|
- Map<String, String> tasks = Map.of(
|
|
|
- "container_count", "count by (instance) (container_last_seen{image!=''})",
|
|
|
- "top_cpu_containers", "topk(5, sum by (name) (rate(container_cpu_usage_seconds_total{image!=''}[5m]) * 100))"
|
|
|
- );
|
|
|
-
|
|
|
- tasks = Map.of(
|
|
|
"node_cpu_core", """
|
|
|
count by(instance) (node_cpu_seconds_total{mode="idle"})
|
|
|
""",
|
|
|
@@ -226,79 +125,11 @@ public class PrometheusService {
|
|
|
|
|
|
// 异步执行
|
|
|
promClient.fetchAllMetrics(tasks).thenAccept(results -> {
|
|
|
- processResults0(results);
|
|
|
- System.out.println("所有数据采集完成,开始渲染报表...");
|
|
|
+ processResults(results);
|
|
|
}).join(); // 如果是在定时任务主线程,可以用 join 等待完成
|
|
|
}
|
|
|
|
|
|
- public List<HostGroup> generateUnifiedReport(Map<String, MetricRecord> mergedMap) {
|
|
|
- // 2. 建立 HostGroup 层级关系
|
|
|
- Map<String, HostGroup> hostGroupMap = new LinkedHashMap<>();
|
|
|
- mergedMap.values().forEach(record -> {
|
|
|
- // 提取 IP (172.16.45.66:9100 -> 172.16.45.66)
|
|
|
- String ip = record.getInstance().split(":")[0];
|
|
|
- HostGroup group = hostGroupMap.computeIfAbsent(ip, k -> {
|
|
|
- HostGroup hg = new HostGroup();
|
|
|
- hg.setHostIp(ip);
|
|
|
- return hg;
|
|
|
- });
|
|
|
-
|
|
|
- if (record.getContainer() == null) {
|
|
|
- group.setHostRecord(record);
|
|
|
- } else {
|
|
|
- group.getContainerRecords().add(record);
|
|
|
- }
|
|
|
- });
|
|
|
-
|
|
|
- // 3. 核心:建立宿主机与容器的量纲关联 (统一为 Node 视角)
|
|
|
- hostGroupMap.values().forEach(group -> {
|
|
|
- MetricRecord host = group.getHostRecord();
|
|
|
- List<MetricRecord> containers = group.getContainerRecords();
|
|
|
-
|
|
|
- if (host != null && !containers.isEmpty()) {
|
|
|
- // 获取宿主机总核数 (从 node_cpu_core 指标中解析,此处假设 record 已包含该值)
|
|
|
- double totalCores = host.getCpuCore() > 0 ? host.getCpuCore() : 8.0;
|
|
|
-
|
|
|
- // 计算容器对宿主机的“实际贡献值”
|
|
|
- // 容器利用率 (C_util) = (Used_Cores / Limit_Cores) * 100
|
|
|
- // 贡献 Node 的百分比 = C_util * (Limit_Cores / Node_Total_Cores)
|
|
|
- double totalContainerContributionToNode = containers.stream()
|
|
|
- .mapToDouble(c -> c.getAvgValue() * (c.getCpuLimit() / totalCores))
|
|
|
- .sum();
|
|
|
-
|
|
|
- // 系统隐性损耗 = 宿主机总利用率 - 容器贡献总和
|
|
|
- group.setSystemOverhead(Math.max(0, host.getAvgValue() - totalContainerContributionToNode));
|
|
|
-
|
|
|
- for (MetricRecord container : group.getContainerRecords()) {
|
|
|
- // 1. 获取该容器的 Limit 核数 (需从 container_cpu_limit 查询中匹配)
|
|
|
- double limitCores = container.getCpuLimit();
|
|
|
-
|
|
|
- // 2. 计算:相对宿主机的百分比 = 相对Limit百分比 * (Limit核数 / 总核数)
|
|
|
- double relativeToHost = container.getAvgValue() * (limitCores / totalCores);
|
|
|
- double relativeToHostMax = container.getMaxValue() * (limitCores / totalCores);
|
|
|
-
|
|
|
- // 将这两个值存入 record 供模板使用
|
|
|
- container.setRelativeToHostAvg(relativeToHost);
|
|
|
- container.setRelativeToHostMax(relativeToHostMax);
|
|
|
- }
|
|
|
-
|
|
|
- // 诊断结论
|
|
|
- StringBuilder diag = new StringBuilder();
|
|
|
- if (group.getSystemOverhead() > 15.0) {
|
|
|
- diag.append(String.format("⚠️ 宿主机非容器损耗(内核/IO)高达 %.1f%%。 ", group.getSystemOverhead()));
|
|
|
- }
|
|
|
- if (host.getMaxValue() > 85.0) {
|
|
|
- diag.append("🚨 宿主机峰值接近瓶颈。 ");
|
|
|
- }
|
|
|
- group.setRelationshipDiagnosis(diag.length() == 0 ? "✅ 资源分配健康" : diag.toString());
|
|
|
- }
|
|
|
- });
|
|
|
-
|
|
|
- // 4. 传给模板
|
|
|
- return new ArrayList<>(hostGroupMap.values());
|
|
|
- }
|
|
|
-
|
|
|
- private void processResults0(Map<String, String> rawResults) {
|
|
|
+ private void processResults(Map<String, String> rawResults) {
|
|
|
// 使用 Map 存储合并后的对象,Key 是 "instance:container"
|
|
|
Map<String, MetricRecord> mergedMap = new HashMap<>();
|
|
|
// 遍历所有的任务结果 (node_cpu_avg, node_cpu_max, container_cpu_avg 等)
|
|
|
@@ -349,14 +180,17 @@ public class PrometheusService {
|
|
|
}
|
|
|
});
|
|
|
// 4. 将合并后的结果转换为 List 并执行分析
|
|
|
- List<MetricRecord> finalRecords = new ArrayList<>(mergedMap.values());
|
|
|
- System.out.println("数据对齐完成,共计 " + finalRecords.size() + " 条指标记录。");
|
|
|
+ //List<MetricRecord> finalRecords = new ArrayList<>(mergedMap.values());
|
|
|
+ System.out.println("数据对齐完成,共计 " + mergedMap.size() + " 条指标记录。");
|
|
|
|
|
|
+ generateReport1(mergedMap);
|
|
|
+ generateReport2(mergedMap);
|
|
|
+ }
|
|
|
|
|
|
- //List<HostGroup> hostGroupList0 = buildHierarchyAndAnalyze(mergedMap);
|
|
|
- List<HostGroup> hostGroupList = generateUnifiedReport(mergedMap);
|
|
|
+ private void generateReport1(Map<String, MetricRecord> mergedMap) {
|
|
|
+ List<MetricRecord> finalRecords = new ArrayList<>(mergedMap.values());
|
|
|
// 5. 调用之前的分析逻辑
|
|
|
- /*List<AnalysisResult> results = finalRecords.stream()
|
|
|
+ List<AnalysisResult> results = finalRecords.stream()
|
|
|
.map(this::getAnalysisResult)
|
|
|
.filter(res -> !"NORMAL".equals(res.getStatus())) // 过滤掉正常的
|
|
|
.collect(Collectors.toList());
|
|
|
@@ -375,16 +209,14 @@ public class PrometheusService {
|
|
|
return list;
|
|
|
}
|
|
|
)
|
|
|
- ));*/
|
|
|
+ ));
|
|
|
|
|
|
Map<String, Object> root = new HashMap<>();
|
|
|
- //root.put("hostMap", groupMap);
|
|
|
root.put("date", new SimpleDateFormat("yyyy-MM-dd").format(new Date()));
|
|
|
- root.put("hostGroupList", hostGroupList);
|
|
|
-
|
|
|
+ root.put("hostMap", groupMap);
|
|
|
try {
|
|
|
// 6. 渲染最终的复合模板(左右布局那个)
|
|
|
- String templatePath = "host_group_report.ftl";
|
|
|
+ String templatePath = "report.ftl";
|
|
|
String htmlContent = renderHtml(templatePath, root);
|
|
|
Path outputPath = Paths.get("/home/reghao/Downloads", "report_" + LocalDate.now() + ".html");
|
|
|
if (Files.notExists(outputPath.getParent())) {
|
|
|
@@ -397,59 +229,39 @@ public class PrometheusService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- private List<HostGroup> buildHostGroups(Map<String, MetricRecord> mergedMap) {
|
|
|
- Map<String, HostGroup> hostGroups = new HashMap<>();
|
|
|
-
|
|
|
- // 1. 第一次遍历:初始化宿主机
|
|
|
- mergedMap.values().stream().filter(r -> r.getContainer() == null).forEach(r -> {
|
|
|
- HostGroup g = new HostGroup();
|
|
|
- g.setHostIp(r.getInstance());
|
|
|
- g.setHostRecord(r);
|
|
|
- hostGroups.put(r.getInstance(), g);
|
|
|
- });
|
|
|
+ private void generateReport2(Map<String, MetricRecord> mergedMap) {
|
|
|
+ List<HostGroup> hostGroupList = generateUnifiedReport(mergedMap);
|
|
|
+ Map<String, Object> root = new HashMap<>();
|
|
|
+ root.put("date", new SimpleDateFormat("yyyy-MM-dd").format(new Date()));
|
|
|
+ root.put("hostGroupList", hostGroupList);
|
|
|
|
|
|
- // 2. 第二次遍历:挂载容器并折算贡献度
|
|
|
- mergedMap.values().stream().filter(r -> r.getContainer() != null).forEach(r -> {
|
|
|
- HostGroup g = hostGroups.get(r.getInstance());
|
|
|
- if (g != null) {
|
|
|
- g.getContainerRecords().add(r);
|
|
|
- // 关键:计算该容器对宿主机的实际贡献 = (容器利用率 * 容器配额核心数) / 宿主机总核数
|
|
|
- // 但因为我们没有直接查配额核心数,目前最稳妥的办法是仅做展示,
|
|
|
- // 损耗诊断建议基于:HostAvg - Sum(Container核心数)/TotalCores
|
|
|
- // 此处为了简化,我们假设容器利用率是相对于 Limit 的。
|
|
|
+ try {
|
|
|
+ // 6. 渲染最终的复合模板(左右布局那个)
|
|
|
+ String templatePath = "report_v1.ftl";
|
|
|
+ String htmlContent = renderHtml(templatePath, root);
|
|
|
+ Path outputPath = Paths.get("/home/reghao/Downloads", "report_v1_" + LocalDate.now() + ".html");
|
|
|
+ if (Files.notExists(outputPath.getParent())) {
|
|
|
+ Files.createDirectories(outputPath.getParent());
|
|
|
}
|
|
|
- });
|
|
|
-
|
|
|
- // 3. 执行关系分析
|
|
|
- hostGroups.values().forEach(this::analyzeRelationship);
|
|
|
- return new ArrayList<>(hostGroups.values());
|
|
|
- }
|
|
|
-
|
|
|
- private void analyzeRelationship(HostGroup group) {
|
|
|
- MetricRecord host = group.getHostRecord();
|
|
|
- if (host == null) return;
|
|
|
-
|
|
|
- double totalCores = host.getCpuCore() > 0 ? host.getCpuCore() : 1.0;
|
|
|
-
|
|
|
- // 这里需要注意:因为容器 avg 是相对于 limit 的百分比
|
|
|
- // 如果没有采集到 limit 核心数,sum(avg) 是没有物理意义的。
|
|
|
- // 建议:在实际运维中,我们关注的是宿主机整体水位。
|
|
|
-
|
|
|
- StringBuilder sb = new StringBuilder();
|
|
|
- if (host.getAvgValue() > 80) {
|
|
|
- sb.append("🚨 宿主机整体负载极高,请检查资源分配。");
|
|
|
- } else {
|
|
|
- sb.append("✅ 节点运行状态平稳。");
|
|
|
+ Files.writeString(outputPath, htmlContent, StandardCharsets.UTF_8);
|
|
|
+ System.out.println("✅ 报表已成功保存至: " + outputPath.toAbsolutePath());
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("{}", e.getMessage());
|
|
|
}
|
|
|
- group.setRelationshipDiagnosis(sb.toString());
|
|
|
}
|
|
|
|
|
|
- public List<HostGroup> buildHierarchyAndAnalyze0(Map<String, MetricRecord> mergedMap) {
|
|
|
- // 1. 初步按 instance 分组
|
|
|
- Map<String, HostGroup> groups = new HashMap<>();
|
|
|
-
|
|
|
+ public List<HostGroup> generateUnifiedReport(Map<String, MetricRecord> mergedMap) {
|
|
|
+ // 2. 建立 HostGroup 层级关系
|
|
|
+ Map<String, HostGroup> hostGroupMap = new LinkedHashMap<>();
|
|
|
mergedMap.values().forEach(record -> {
|
|
|
- HostGroup group = groups.computeIfAbsent(record.getInstance(), k -> new HostGroup());
|
|
|
+ // 提取 IP (172.16.45.66:9100 -> 172.16.45.66)
|
|
|
+ String ip = record.getInstance().split(":")[0];
|
|
|
+ HostGroup group = hostGroupMap.computeIfAbsent(ip, k -> {
|
|
|
+ HostGroup hg = new HostGroup();
|
|
|
+ hg.setHostIp(ip);
|
|
|
+ return hg;
|
|
|
+ });
|
|
|
+
|
|
|
if (record.getContainer() == null) {
|
|
|
group.setHostRecord(record);
|
|
|
} else {
|
|
|
@@ -457,136 +269,54 @@ public class PrometheusService {
|
|
|
}
|
|
|
});
|
|
|
|
|
|
- // 2. 深度分析每一组的关系
|
|
|
- groups.values().forEach(group -> {
|
|
|
+ // 3. 核心:建立宿主机与容器的量纲关联 (统一为 Node 视角)
|
|
|
+ hostGroupMap.values().forEach(group -> {
|
|
|
MetricRecord host = group.getHostRecord();
|
|
|
List<MetricRecord> containers = group.getContainerRecords();
|
|
|
|
|
|
if (host != null && !containers.isEmpty()) {
|
|
|
- // 计算容器均值总和
|
|
|
- double sumContainerAvg = containers.stream().mapToDouble(MetricRecord::getAvgValue).sum();
|
|
|
- // 计算系统损耗 (宿主机利用率 - 容器利用率总和)
|
|
|
- group.setSystemOverhead(Math.max(0, host.getAvgValue() - sumContainerAvg));
|
|
|
-
|
|
|
- // 计算峰值共振:宿主机峰值 / 容器峰值总和
|
|
|
- double sumContainerMax = containers.stream().mapToDouble(MetricRecord::getMaxValue).sum();
|
|
|
- group.setPeakCohesion(sumContainerMax > 0 ? host.getMaxValue() / sumContainerMax : 0);
|
|
|
-
|
|
|
- // 3. 自动生成诊断结论
|
|
|
- StringBuilder diagnosis = new StringBuilder();
|
|
|
- if (group.getSystemOverhead() > 20.0) {
|
|
|
- diagnosis.append(String.format("⚠️ 系统隐性损耗过高(%.1f%%),请检查宿主机原生进程。 ", group.getSystemOverhead()));
|
|
|
- }
|
|
|
- if (host.getMaxValue() > 80.0 && group.getPeakCohesion() > 0.8) {
|
|
|
- diagnosis.append("🚨 探测到明显的容器并发冲撞,建议交错执行高负载任务。 ");
|
|
|
- }
|
|
|
- if (diagnosis.length() == 0) diagnosis.append("✅ 宿主与容器负载分配比例健康。");
|
|
|
-
|
|
|
- group.setRelationshipDiagnosis(diagnosis.toString());
|
|
|
- }
|
|
|
- });
|
|
|
-
|
|
|
- return new ArrayList<>(groups.values());
|
|
|
- }
|
|
|
- public List<HostGroup> buildHierarchyAndAnalyze(Map<String, MetricRecord> mergedMap) {
|
|
|
- // 1. 使用纯 IP (不带端口) 作为分组的 Key
|
|
|
- Map<String, HostGroup> groups = new HashMap<>();
|
|
|
-
|
|
|
- mergedMap.values().forEach(record -> {
|
|
|
- // 提取 IP 部分,例如 "192.168.1.10:9100" -> "192.168.1.10"
|
|
|
- String rawInstance = record.getInstance();
|
|
|
- String ipAddress = rawInstance.contains(":") ? rawInstance.split(":")[0] : rawInstance;
|
|
|
-
|
|
|
- HostGroup group = groups.computeIfAbsent(ipAddress, k -> {
|
|
|
- HostGroup newGroup = new HostGroup();
|
|
|
- newGroup.setHostIp(ipAddress); // 建议在 HostGroup 中增加该字段
|
|
|
- return newGroup;
|
|
|
- });
|
|
|
+ // 获取宿主机总核数 (从 node_cpu_core 指标中解析,此处假设 record 已包含该值)
|
|
|
+ double totalCores = host.getCpuCore() > 0 ? host.getCpuCore() : 8.0;
|
|
|
|
|
|
- // 判定归属:根据 container 字段是否为空
|
|
|
- if (record.getContainer() == null) {
|
|
|
- // 来自 node_exporter 的宿主机数据
|
|
|
- group.setHostRecord(record);
|
|
|
- } else {
|
|
|
- // 来自 cadvisor 的容器数据
|
|
|
- group.getContainerRecords().add(record);
|
|
|
- }
|
|
|
- });
|
|
|
+ // 计算容器对宿主机的“实际贡献值”
|
|
|
+ // 容器利用率 (C_util) = (Used_Cores / Limit_Cores) * 100
|
|
|
+ // 贡献 Node 的百分比 = C_util * (Limit_Cores / Node_Total_Cores)
|
|
|
+ double totalContainerContributionToNode = containers.stream()
|
|
|
+ .mapToDouble(c -> c.getAvgValue() * (c.getCpuLimit() / totalCores))
|
|
|
+ .sum();
|
|
|
|
|
|
- // 2. 深度分析每一组的关系
|
|
|
- groups.values().forEach(group -> {
|
|
|
- MetricRecord host = group.getHostRecord();
|
|
|
- List<MetricRecord> containers = group.getContainerRecords();
|
|
|
+ // 系统隐性损耗 = 宿主机总利用率 - 容器贡献总和
|
|
|
+ group.setSystemOverhead(Math.max(0, host.getAvgValue() - totalContainerContributionToNode));
|
|
|
|
|
|
- // 只有当宿主机数据存在时才计算损耗
|
|
|
- if (host != null) {
|
|
|
- double sumContainerAvg = containers.stream().mapToDouble(MetricRecord::getAvgValue).sum();
|
|
|
+ for (MetricRecord container : group.getContainerRecords()) {
|
|
|
+ // 1. 获取该容器的 Limit 核数 (需从 container_cpu_limit 查询中匹配)
|
|
|
+ double limitCores = container.getCpuLimit();
|
|
|
|
|
|
- // 计算系统隐性损耗:宿主机总量 - 容器总量
|
|
|
- // 注意:如果容器很多,sum 可能略大于 host(采样时间差),需用 Math.max(0, ...)
|
|
|
- group.setSystemOverhead(Math.max(0, host.getAvgValue() - sumContainerAvg));
|
|
|
+ // 2. 计算:相对宿主机的百分比 = 相对Limit百分比 * (Limit核数 / 总核数)
|
|
|
+ double relativeToHost = container.getAvgValue() * (limitCores / totalCores);
|
|
|
+ double relativeToHostMax = container.getMaxValue() * (limitCores / totalCores);
|
|
|
|
|
|
- // 诊断逻辑
|
|
|
- StringBuilder diagnosis = new StringBuilder();
|
|
|
- if (group.getSystemOverhead() > 20.0) {
|
|
|
- diagnosis.append(String.format("⚠️ 宿主机非容器损耗较高(%.1f%%)。 ", group.getSystemOverhead()));
|
|
|
+ // 将这两个值存入 record 供模板使用
|
|
|
+ container.setRelativeToHostAvg(relativeToHost);
|
|
|
+ container.setRelativeToHostMax(relativeToHostMax);
|
|
|
}
|
|
|
|
|
|
- if (containers.isEmpty()) {
|
|
|
- diagnosis.append("ℹ️ 该节点当前未发现运行中的业务容器。");
|
|
|
- } else if (diagnosis.length() == 0) {
|
|
|
- diagnosis.append("✅ 宿主与容器负载分配正常。");
|
|
|
+ // 诊断结论
|
|
|
+ StringBuilder diag = new StringBuilder();
|
|
|
+ if (group.getSystemOverhead() > 15.0) {
|
|
|
+ diag.append(String.format("⚠️ 宿主机非容器损耗(内核/IO)高达 %.1f%%。 ", group.getSystemOverhead()));
|
|
|
+ }
|
|
|
+ if (host.getMaxValue() > 85.0) {
|
|
|
+ diag.append("🚨 宿主机峰值接近瓶颈。 ");
|
|
|
}
|
|
|
- group.setRelationshipDiagnosis(diagnosis.toString());
|
|
|
+ group.setRelationshipDiagnosis(diag.length() == 0 ? "✅ 资源分配健康" : diag.toString());
|
|
|
}
|
|
|
});
|
|
|
|
|
|
- return new ArrayList<>(groups.values());
|
|
|
+ // 4. 传给模板
|
|
|
+ return new ArrayList<>(hostGroupMap.values());
|
|
|
}
|
|
|
|
|
|
- /*public String analyze(MetricRecord record) {
|
|
|
- StringBuilder report = new StringBuilder();
|
|
|
- boolean isContainer = record.getContainer() != null;
|
|
|
-
|
|
|
- // 1. 统一单位描述
|
|
|
- String unit = isContainer ? "核" : "%";
|
|
|
- String displayName = isContainer ? "容器:" + record.getContainer() : "宿主机:" + record.getInstance();
|
|
|
-
|
|
|
- report.append(String.format("【%s 报告】均值: %.2f%s, 峰值: %.2f%s\n",
|
|
|
- displayName, record.getAvgValue(), unit, record.getMaxValue(), unit));
|
|
|
-
|
|
|
- // 2. 针对性设定“防零处理”的底数 (Silence Threshold)
|
|
|
- // 宿主机分母至少 1%;容器分母至少 0.1 核
|
|
|
- double silenceThreshold = isContainer ? 0.1 : 1.0;
|
|
|
- double targetThreshold = isContainer ? CpuThresholdConfig.CONTAINER_THRESHOLD : CpuThresholdConfig.NODE_THRESHOLD;
|
|
|
-
|
|
|
- // 3. 判定:容量不足 (注意容器的 threshold 应该是它的 CPU Limit 核心数)
|
|
|
- if (record.getAvgValue() >= targetThreshold) {
|
|
|
- report.append(" -> [严重异常] 均值已触及水位线,资源严重不足!\n");
|
|
|
- }
|
|
|
-
|
|
|
- // 4. 判定:毛刺率 (Spike Rate)
|
|
|
- else {
|
|
|
- double ratio = record.getMaxValue() / Math.max(record.getAvgValue(), silenceThreshold);
|
|
|
-
|
|
|
- // 判定阈值:如果峰值本身很小(比如宿主机 < 5% 或 容器 < 0.2核),则忽略毛刺
|
|
|
- double significantPeak = isContainer ? 0.2 : 5.0;
|
|
|
-
|
|
|
- if (record.getMaxValue() > significantPeak && ratio > 5.0) {
|
|
|
- report.append(String.format(" -> [预警] 瞬时毛刺严重(%.1fx)。", ratio));
|
|
|
- if (isContainer) {
|
|
|
- report.append("建议检查容器内部是否有突发短查询或频繁GC。\n");
|
|
|
- } else {
|
|
|
- report.append("建议检查宿主机是否有系统级任务或IO等待引起的CPU飙升。\n");
|
|
|
- }
|
|
|
- } else {
|
|
|
- report.append(" -> [正常] 运行平稳。\n");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return report.toString();
|
|
|
- }*/
|
|
|
-
|
|
|
public AnalysisResult getAnalysisResult(MetricRecord record) {
|
|
|
AnalysisResult res = new AnalysisResult();
|
|
|
// 基础信息设置
|
|
|
@@ -618,139 +348,6 @@ public class PrometheusService {
|
|
|
return res;
|
|
|
}
|
|
|
|
|
|
- /**
|
|
|
- * 模拟解析 Prometheus 返回的 JSON 结构
|
|
|
- * 生产环境建议使用 Jackson 或 Fastjson 遍历 data.result 数组
|
|
|
- */
|
|
|
- private Map<String, List<MetricRecord>> parsePrometheusJson(Map<String, String> results) {
|
|
|
- Map<String, List<MetricRecord>> parsedData = new HashMap<>();
|
|
|
- results.forEach((taskName, jsonContent) -> {
|
|
|
- List<MetricRecord> records = new ArrayList<>();
|
|
|
- try {
|
|
|
- JsonNode root = objectMapper.readTree(jsonContent);
|
|
|
- // Prometheus 标准响应路径: data -> result
|
|
|
- JsonNode resultNodes = root.path("data").path("result");
|
|
|
-
|
|
|
- if (resultNodes.isArray()) {
|
|
|
- for (JsonNode node : resultNodes) {
|
|
|
- MetricRecord record = new MetricRecord();
|
|
|
-
|
|
|
- // 1. 解析标签 (Metric Labels)
|
|
|
- JsonNode metric = node.path("metric");
|
|
|
- record.setInstance(metric.path("instance").asText("unknown"));
|
|
|
- // container 在 cAdvisor 中通常对应 'name' 标签,宿主机指标则没有此标签
|
|
|
- if (metric.has("name")) {
|
|
|
- record.setContainer(metric.path("name").asText());
|
|
|
- }
|
|
|
-
|
|
|
- // 2. 解析数值 (Value)
|
|
|
- // Prometheus value 格式为 [timestamp, "value_string"]
|
|
|
- JsonNode valueNode = node.path("value");
|
|
|
- if (valueNode.isArray() && valueNode.size() >= 2) {
|
|
|
- // 注意:Prometheus 返回的数值是字符串形式,需要转换
|
|
|
- double val = valueNode.get(1).asDouble();
|
|
|
-
|
|
|
- // 根据任务名决定填充到哪个字段(暂时存入,后续 mergeMetrics 会处理)
|
|
|
- if (taskName.contains("avg")) {
|
|
|
- record.setAvgValue(val);
|
|
|
- } else if (taskName.contains("max")) {
|
|
|
- record.setMaxValue(val);
|
|
|
- }
|
|
|
- // 设置指标名称便于识别
|
|
|
- record.setName(taskName);
|
|
|
- }
|
|
|
-
|
|
|
- records.add(record);
|
|
|
- }
|
|
|
- }
|
|
|
- parsedData.put(taskName, records);
|
|
|
- } catch (Exception e) {
|
|
|
- System.err.println("解析任务 [" + taskName + "] 失败: " + e.getMessage());
|
|
|
- parsedData.put(taskName, Collections.emptyList());
|
|
|
- }
|
|
|
- });
|
|
|
-
|
|
|
- return parsedData;
|
|
|
- }
|
|
|
-
|
|
|
- private void parseToHostMap(String json, Map<String, HostInfo> hostMap, String type) throws Exception {
|
|
|
- JsonNode root = objectMapper.readTree(json);
|
|
|
- JsonNode resultList = root.path("data").path("result");
|
|
|
-
|
|
|
- for (JsonNode node : resultList) {
|
|
|
- // 关键点:提取 IP (例如从 192.168.1.10:9100 提取 192.168.1.10)
|
|
|
- String rawInstance = node.path("metric").path("instance").asText();
|
|
|
- String ip = rawInstance.contains(":") ? rawInstance.split(":")[0] : rawInstance;
|
|
|
-
|
|
|
- double value = node.path("value").get(1).asDouble();
|
|
|
-
|
|
|
- // 如果 Map 里没有该 IP,则新建
|
|
|
- HostInfo host = hostMap.computeIfAbsent(ip, k -> {
|
|
|
- HostInfo h = new HostInfo();
|
|
|
- h.setName(k);
|
|
|
- h.setIp(k);
|
|
|
- return h;
|
|
|
- });
|
|
|
-
|
|
|
- // 根据类型赋值
|
|
|
- switch (type) {
|
|
|
- case "cpu" -> host.setCpuUsage(formatDouble(value));
|
|
|
- case "mem" -> host.setMemUsage(formatDouble(value));
|
|
|
- case "count" -> host.setContainerCount((int) value);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- private List<ContainerInfo> parseTopContainers(String json) throws Exception {
|
|
|
- List<ContainerInfo> list = new ArrayList<>();
|
|
|
- JsonNode resultList = objectMapper.readTree(json).path("data").path("result");
|
|
|
-
|
|
|
- for (JsonNode node : resultList) {
|
|
|
- ContainerInfo c = new ContainerInfo();
|
|
|
- c.setName(node.path("metric").path("name").asText());
|
|
|
- c.setHostIp(node.path("metric").path("instance").asText().split(":")[0]);
|
|
|
- c.setCpu(formatDouble(node.path("value").get(1).asDouble()));
|
|
|
- list.add(c);
|
|
|
- }
|
|
|
- return list;
|
|
|
- }
|
|
|
-
|
|
|
- private double formatDouble(double val) {
|
|
|
- return Math.round(val * 10.0) / 10.0; // 保留一位小数
|
|
|
- }
|
|
|
-
|
|
|
- private void parseTrendData(String json, OperationReportDTO report) {
|
|
|
- try {
|
|
|
- JsonNode root = objectMapper.readTree(json);
|
|
|
- // 趋势数据在 data.result[0].values 中
|
|
|
- JsonNode valuesNode = root.path("data").path("result").get(0).path("values");
|
|
|
-
|
|
|
- List<String> labels = new ArrayList<>();
|
|
|
- List<String> trends = new ArrayList<>();
|
|
|
- DateTimeFormatter formatter = DateTimeFormatter.ofPattern("HH:mm");
|
|
|
-
|
|
|
- for (JsonNode node : valuesNode) {
|
|
|
- // node 是一个数组: [1672531200, "15.5"]
|
|
|
- long timestamp = node.get(0).asLong();
|
|
|
- double value = node.get(1).asDouble();
|
|
|
-
|
|
|
- // 转换时间戳为 HH:mm 格式
|
|
|
- String timeLabel = LocalDateTime.ofInstant(Instant.ofEpochSecond(timestamp), ZoneId.systemDefault())
|
|
|
- .format(formatter);
|
|
|
-
|
|
|
- labels.add("'" + timeLabel + "'"); // 加引号是为了符合 JS 数组格式
|
|
|
- trends.add(String.valueOf(formatDouble(value)));
|
|
|
- }
|
|
|
-
|
|
|
- // 将 List 转为逗号分隔的字符串,直接交给 FreeMarker 渲染进 JS 数组
|
|
|
- report.setTimeLabels(String.join(",", labels));
|
|
|
- report.setAvgCpuTrend(String.join(",", trends));
|
|
|
-
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("趋势数据解析失败", e);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
/**
|
|
|
* 辅助方法:构建 query_range 的完整 URL
|
|
|
*/
|
|
|
@@ -1026,8 +623,8 @@ public class PrometheusService {
|
|
|
model.put("timeLabels", String.join(",", timeLabels));
|
|
|
|
|
|
// 6. 渲染与输出
|
|
|
- String htmlContent = renderHtml("container_report_v2.ftl", model);
|
|
|
- Path outputPath = Paths.get("/home/reghao/Downloads", "container_report_v2_" + LocalDate.now() + ".html");
|
|
|
+ String htmlContent = renderHtml("container_report_v1.ftl", model);
|
|
|
+ Path outputPath = Paths.get("/home/reghao/Downloads", "container_report_v1_" + LocalDate.now() + ".html");
|
|
|
Files.writeString(outputPath, htmlContent, StandardCharsets.UTF_8);
|
|
|
System.out.println("✅ 报表生成成功: " + outputPath.toAbsolutePath());
|
|
|
}
|
|
|
@@ -1111,11 +708,51 @@ public class PrometheusService {
|
|
|
return report;
|
|
|
}
|
|
|
|
|
|
+ /**
+  * Queries 24-hour average and peak memory usage (converted to MiB) for hosts
+  * and containers, then passes the responses to {@code processResults0}.
+  *
+  * <p>Blocks the calling thread until every query has completed.
+  *
+  * <p>NOTE(review): the {@code node_*} tasks previously reused the container
+  * query (and {@code node_mem_max} used {@code avg_over_time}), so host and
+  * container series were indistinguishable. They now read used memory from
+  * {@code node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes};
+  * confirm these node_exporter gauges are scraped in this deployment.
+  */
+ public void generateMemReport() {
+     // Query tasks, keyed by task name.
+     Map<String, String> tasks = Map.of(
+             "node_mem_avg", """
+                     avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[24h:1m]) / 1024 / 1024
+                     """,
+             "node_mem_max", """
+                     max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[24h:1m]) / 1024 / 1024
+                     """,
+             "container_mem_avg", """
+                     avg_over_time(container_memory_working_set_bytes{name!=""}[24h]) / 1024 / 1024
+                     """,
+             "container_mem_max", """
+                     max_over_time(container_memory_working_set_bytes{name!=""}[24h]) / 1024 / 1024
+                     """
+     );
+
+     // Run the queries asynchronously; join() waits for completion, which is
+     // acceptable when this is invoked from a scheduled task's main thread.
+     promClient.fetchAllMetrics(tasks)
+             .thenAccept(this::processResults0)
+             .join();
+ }
|
|
|
+
|
|
|
+ /**
+  * Renders the {@code mem_report.ftl} template and writes the resulting HTML to
+  * {@code /home/reghao/Downloads/mem_report_<date>.html}.
+  *
+  * <p>Failures are logged and not rethrown.
+  *
+  * @param rawResults query responses, presumably keyed by task name.
+  *                   NOTE(review): they are not yet parsed into the template
+  *                   model, so the template currently receives only the date.
+  */
+ private void processResults0(Map<String, String> rawResults) {
+     // Read the date once so the model entry and the file name cannot disagree.
+     LocalDate today = LocalDate.now();
+
+     Map<String, Object> root = new HashMap<>();
+     // Same "date" entry (yyyy-MM-dd) that the other report models in this class provide.
+     root.put("date", today.toString());
+
+     Path outputPath = Paths.get("/home/reghao/Downloads", "mem_report_" + today + ".html");
+     try {
+         String htmlContent = renderHtml("mem_report.ftl", root);
+         // createDirectories does nothing when the directory already exists,
+         // so no separate existence check is needed.
+         Files.createDirectories(outputPath.getParent());
+         Files.writeString(outputPath, htmlContent, StandardCharsets.UTF_8);
+         System.out.println("✅ 报表已成功保存至: " + outputPath.toAbsolutePath());
+     } catch (Exception e) {
+         // Pass the exception itself so the stack trace is logged, not just its message.
+         log.error("Failed to generate memory report {}", outputPath, e);
+     }
+ }
|
|
|
+
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
PrometheusService prometheusService = new PrometheusService();
|
|
|
prometheusService.generateContainerReport1();
|
|
|
prometheusService.generateContainerReport();
|
|
|
prometheusService.generatePillarReport();
|
|
|
- //prometheusService.generateDailyReport();
|
|
|
+ prometheusService.generateDailyReport();
|
|
|
}
|
|
|
}
|