| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
# GPU / VRAM monitoring and cleanup utilities built on pynvml, PyTorch and Ollama.
import logging
from datetime import datetime
import gc
import pynvml
import torch
from setting import ollama_client  # project-local Ollama client instance

# Module-level logger, following the standard `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
def get_gpu_memory_info(device_index=0):
    """Return the model name and memory stats (total/used/free) of one GPU.

    Args:
        device_index: NVML index of the GPU to query (default 0).

    Returns:
        dict with keys "model", "mem_total", "mem_used", "mem_free"; the
        memory values are human-readable strings such as "1024.00 MB".
        NOTE(review): "model" is whatever nvmlDeviceGetName returns — on
        older pynvml versions that is ``bytes``, not ``str``; confirm the
        installed version if callers serialise it.

    Raises:
        pynvml.NVMLError: if NVML cannot be initialised or queried.
    """
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return {
            "model": pynvml.nvmlDeviceGetName(handle),
            "mem_total": f"{info.total / 1024 ** 2:.2f} MB",
            "mem_used": f"{info.used / 1024 ** 2:.2f} MB",
            "mem_free": f"{info.free / 1024 ** 2:.2f} MB",
        }
    finally:
        # Always release the NVML session, even when a query above raises;
        # the original version leaked the session on any NVML error.
        pynvml.nvmlShutdown()
def get_torch_memory_usage():
    """Report the current CUDA device's PyTorch memory usage.

    Returns:
        dict with "reserved" and "allocated" (formatted MB strings) when
        CUDA is available, otherwise ``{"message": "CUDA N/A"}``.
    """
    if not torch.cuda.is_available():
        return {
            "message": "CUDA N/A",
        }
    dev = torch.cuda.current_device()
    mb = 1024 ** 2
    # "reserved" is the memory PyTorch has claimed from the driver;
    # "allocated" is the portion of it currently backing live tensors.
    return {
        "reserved": f"{torch.cuda.memory_reserved(dev) / mb:.2f} MB",
        "allocated": f"{torch.cuda.memory_allocated(dev) / mb:.2f} MB",
    }
# --- Deep GPU memory cleanup helper ---
def clear_gpu_memory():
    """Aggressively release cached GPU memory held by this process.

    No-op when CUDA is unavailable. Otherwise frees PyTorch's cached
    allocator blocks, collects CUDA IPC shared memory, forces a Python
    garbage-collection pass, and logs a timestamped completion message.
    """
    if not torch.cuda.is_available():
        return
    # Drop cached blocks held by PyTorch's CUDA caching allocator.
    torch.cuda.empty_cache()
    # Reclaim CUDA memory kept alive for inter-process (IPC) sharing.
    torch.cuda.ipc_collect()
    # Force a Python-level garbage collection pass.
    gc.collect()
    logger.info(f"[{datetime.now().strftime('%H:%M:%S')}] 🧹 GPU 显存深度清理完成")
def get_ollama_resource():
    """List the models currently loaded by Ollama, with GPU/CPU placement.

    Returns:
        list of dicts, one per running model, with keys 'model_name',
        'size' (GB string), 'size_vram' (GB string), 'gpu_percentage'
        (string, two decimals) and 'stat' (human-readable placement
        summary). Returns an empty list when no model is running or the
        Ollama service is unreachable — errors are logged, never raised
        (best-effort contract preserved from the original).
    """
    result = []
    gib = 1024 ** 3
    try:
        # Ask the Ollama server which models are currently loaded.
        response = ollama_client.ps()
        # .get() guards against a missing 'models' key instead of
        # relying on the broad except below to swallow a KeyError.
        for model in response.get('models') or []:
            name = model['name']
            size_vram = model.get('size_vram', 0)
            size = model.get('size', 0)
            # Share of the model weights resident in GPU memory.
            gpu_percentage = (size_vram / size) * 100 if size > 0 else 0
            if gpu_percentage >= 100:
                stat = "🚀 完全运行在 GPU 上"
            elif gpu_percentage > 0:
                stat = f"🌓 混合模式 (GPU 占比 {gpu_percentage:.2f}%)"
            else:
                stat = "🐌 完全运行在 CPU 上"
            result.append({
                'model_name': name,
                'size': f"{size / gib:.2f} GB",
                'size_vram': f"{size_vram / gib:.2f} GB",
                'gpu_percentage': f"{gpu_percentage:.2f}",
                'stat': stat,
            })
    except Exception:
        # Best-effort: keep the swallow-and-return-[] behaviour, but report
        # through the module logger (with traceback) instead of print(),
        # consistent with the rest of this module.
        logger.exception("无法连接到 Ollama 服务")
    return result
|