import logging
import os

# HF_ENDPOINT must be set before transformers is imported so that all model
# downloads go through the mirror.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

logger = logging.getLogger(__name__)

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
start_time = time.time()
logger.info(f"⏳ Loading model {model_name} ...")

try:
    # 1. Define the 4-bit quantization config.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,  # fp16 recommended on a GTX 1650
        bnb_4bit_quant_type="nf4",             # NF4: higher-precision 4-bit quantization
        bnb_4bit_use_double_quant=True,        # double quantization to further cut VRAM
    )

    # 2. Load the model and tokenizer.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,  # pass the config object
        device_map="auto",                        # place layers on the GPU automatically
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"✅ Model {model_name} loaded in {time.time() - start_time:.2f}s")
except Exception as e:
    logger.error(f"❌ Failed to load model {model_name}: {e}")
    raise


def translate2zh(text):
    # Prompt tailored to translating Moondream-style image captions. It is kept
    # in Chinese because it instructs the model to answer in Chinese; it says:
    # "You are a professional image-caption translator. Translate the English
    # description below into natural, idiomatic Chinese. Output only the
    # result, with no explanation."
    prompt = f"你是一个专业的图像描述翻译官。请将下面这段英文描述翻译成自然、地道的中文,直接输出结果,不要解释:\n{text}"
    messages = [{"role": "user", "content": prompt}]
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

    # Generate.
    max_tokens = 128
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_tokens,
            do_sample=False,  # greedy decoding keeps translations deterministic
        )

    # Decode the full sequence, then keep only the assistant's reply.
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    final_result = response.split("assistant\n")[-1].strip()
    return final_result


def summarize():
    pass
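
# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original script): one
# way to exercise translate2zh when the file is run directly. The `caption`
# string is a made-up example input. Note that the model is loaded at import
# time above, so the first run will pay the download/load cost before this
# block executes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    caption = "A small dog is running across a grassy field at sunset."
    print(translate2zh(caption))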