import json
import os

from setting import OUTPUT_DIR

# Must be set before torch initializes CUDA: lets the allocator grow segments
# instead of fragmenting, reducing spurious OOMs on long batch runs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)

import torch
import service.pyav as pyav
import service.pygpu as pygpu
import service.ai_asr as ai_asr
import service.ai_image_ollama as ai_image

logger = logging.getLogger(__name__)


def process(audio_path, srt_path, video_path):
    """Run ASR on one audio file and render a subtitled video from it.

    Pipeline: speech recognition -> SRT file -> video with burned-in
    subtitles. A CUDA out-of-memory error skips the file instead of
    aborting the whole batch; any other exception is logged and swallowed
    so subsequent files can still be processed.

    Args:
        audio_path: Path of the source audio file.
        srt_path: Path where the generated subtitle file is written.
        video_path: Path where the rendered video is written.
    """
    try:
        # Speech recognition, then video rendering.
        ai_asr.generate_srt(audio_path, srt_path)
        pyav.generate_video(audio_path, srt_path, video_path)
    # NOTE(review): torch.OutOfMemoryError exists only in recent torch
    # releases (older ones expose torch.cuda.OutOfMemoryError) — confirm
    # against the pinned torch version.
    except torch.OutOfMemoryError:
        logger.error(f"❌ 显存溢出!跳过文件: {audio_path}")
    except Exception as e:
        logger.error(f"💥 处理 {audio_path} 时发生未知错误: {e}")
    finally:
        # Always reclaim GPU memory (success, OOM, or other failure) so the
        # next file starts with a clean allocator state.
        pygpu.clear_gpu_memory()


if __name__ == "__main__":
    audio = 'abc.mp3'
    srt = 'abc.mp3.srt'
    video = 'abc.mp3.mp4'
    # Use process() so the script path gets the same OOM handling and GPU
    # cleanup as batch processing, instead of calling the stages directly.
    process(audio, srt, video)

    video = 'abc.mp4'
    video_dict = pyav.get_video_info(video)
    # 0.3 is the scene-change detection threshold passed to pyav.
    start_scene_times = pyav.get_scene_times(video, 0.3)
    # Split the video by scene (disabled for now):
    # pyav.split_video_by_scenes(video, start_scene_times, OUTPUT_DIR)
    # Midpoint of each scene is the frame-sampling timestamp.
    target_times = pyav.calculate_mid_points(video, start_scene_times)
    # Extract one frame per scene with ffmpeg.
    scenes = pyav.extract_frames(video, target_times, OUTPUT_DIR)
    # Describe each extracted frame with the image model.
    scenes_result = ai_image.describe_frame(scenes)
    video_dict['scenes'] = scenes_result
    print(json.dumps(video_dict, indent=8, ensure_ascii=False))