| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
# GPU / VRAM monitoring and cleanup utilities built on pynvml, PyTorch and Ollama.
import logging
from datetime import datetime
import gc
import pynvml
import torch
from setting import ollama_client  # project-local Ollama client instance

# Module-level logger, following the standard `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
def get_gpu_memory_info(device_index=0):
    """Return the model name and memory stats (total/used/free) of one GPU.

    Args:
        device_index: NVML index of the GPU to query (default 0).

    Returns:
        dict with keys "model", "mem_total", "mem_used", "mem_free"; the
        memory values are human-readable strings such as "1024.00 MB".
        NOTE(review): "model" is whatever nvmlDeviceGetName returns — on
        older pynvml versions that is ``bytes``, not ``str``; confirm the
        installed version if callers serialise it.

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or queried.
    """
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return {
            "model": pynvml.nvmlDeviceGetName(handle),
            "mem_total": f"{info.total / 1024 ** 2:.2f} MB",
            "mem_used": f"{info.used / 1024 ** 2:.2f} MB",
            "mem_free": f"{info.free / 1024 ** 2:.2f} MB",
        }
    finally:
        # Always release the NVML session, even when a query above raises;
        # the original version leaked the session on any NVML error.
        pynvml.nvmlShutdown()
def get_torch_memory_usage():
    """Report the current CUDA device's PyTorch memory usage.

    Returns:
        dict with "reserved" and "allocated" (formatted MB strings) when
        CUDA is available, otherwise ``{"message": "CUDA N/A"}``.
    """
    if not torch.cuda.is_available():
        return {
            "message": "CUDA N/A",
        }
    dev = torch.cuda.current_device()
    mb = 1024 ** 2
    # "reserved" is the memory PyTorch has claimed from the driver;
    # "allocated" is the portion of it currently backing live tensors.
    return {
        "reserved": f"{torch.cuda.memory_reserved(dev) / mb:.2f} MB",
        "allocated": f"{torch.cuda.memory_allocated(dev) / mb:.2f} MB",
    }
# --- Deep GPU memory cleanup helper ---
def clear_gpu_memory():
    """Aggressively release cached GPU memory held by this process.

    No-op when CUDA is unavailable. Otherwise frees PyTorch's cached
    allocator blocks, collects CUDA IPC shared memory, forces a Python
    garbage-collection pass, and logs a timestamped completion message.
    """
    if not torch.cuda.is_available():
        return
    # Drop cached blocks held by PyTorch's CUDA caching allocator.
    torch.cuda.empty_cache()
    # Reclaim CUDA memory kept alive for inter-process (IPC) sharing.
    torch.cuda.ipc_collect()
    # Force a Python-level garbage collection pass.
    gc.collect()
    logger.info(f"[{datetime.now().strftime('%H:%M:%S')}] 🧹 GPU 显存深度清理完成")
def get_ollama_resource():
    """List the models currently loaded by Ollama, with GPU/CPU placement.

    Returns:
        list of dicts, one per running model, with keys 'model_name',
        'size' (GB string), 'size_vram' (GB string), 'gpu_percentage'
        (string, two decimals) and 'stat' (human-readable placement
        summary). Returns an empty list when no model is running or the
        Ollama service is unreachable — errors are logged, never raised
        (best-effort contract preserved from the original).
    """
    result = []
    gib = 1024 ** 3
    try:
        # Ask the Ollama server which models are currently loaded.
        response = ollama_client.ps()
        # .get() guards against a missing 'models' key instead of
        # relying on the broad except below to swallow a KeyError.
        for model in response.get('models') or []:
            name = model['name']
            size_vram = model.get('size_vram', 0)
            size = model.get('size', 0)
            # Share of the model weights resident in GPU memory.
            gpu_percentage = (size_vram / size) * 100 if size > 0 else 0
            if gpu_percentage >= 100:
                stat = "🚀 完全运行在 GPU 上"
            elif gpu_percentage > 0:
                stat = f"🌓 混合模式 (GPU 占比 {gpu_percentage:.2f}%)"
            else:
                stat = "🐌 完全运行在 CPU 上"
            result.append({
                'model_name': name,
                'size': f"{size / gib:.2f} GB",
                'size_vram': f"{size_vram / gib:.2f} GB",
                'gpu_percentage': f"{gpu_percentage:.2f}",
                'stat': stat,
            })
    except Exception:
        # Best-effort: keep the swallow-and-return-[] behaviour, but report
        # through the module logger (with traceback) instead of print(),
        # consistent with the rest of this module.
        logger.exception("无法连接到 Ollama 服务")
    return result
|