Browse Source

一个基于 Python 的 AI 项目, 依赖 CUDA 和 fastapi

reghao 4 weeks ago
commit
28c7df8fc4
23 changed files with 1643 additions and 0 deletions
  1. 33 0
      .dockerignore
  2. 10 0
      .gitignore
  3. 23 0
      Dockerfile
  4. 17 0
      README.md
  5. 40 0
      ai_server.py
  6. 22 0
      docker-compose.yml
  7. 56 0
      main.py
  8. 120 0
      requirements.txt
  9. 0 0
      route/__init__.py
  10. 99 0
      route/audio.py
  11. 56 0
      route/file.py
  12. 25 0
      route/gpu.py
  13. 55 0
      route/image.py
  14. 89 0
      route/text.py
  15. 0 0
      service/__init__.py
  16. 84 0
      service/ai_asr.py
  17. 80 0
      service/ai_image_ollama.py
  18. 79 0
      service/ai_task.py
  19. 67 0
      service/ai_text.py
  20. 44 0
      service/ai_text_ollama.py
  21. 533 0
      service/pyav.py
  22. 100 0
      service/pygpu.py
  23. 11 0
      setting.py

+ 33 - 0
.dockerignore

@@ -0,0 +1,33 @@
+# 忽略 Python 虚拟环境
+venv/
+.venv/
+env/
+
+# 忽略缓存和编译文件
+**/__pycache__/
+*.py[cod]
+*$py.class
+.pytest_cache/
+.coverage
+htmlcov/
+
+# 忽略 Docker 和本地配置
+.git/
+.gitignore
+.dockerignore
+Dockerfile
+docker-compose.yml
+.env
+
+# 忽略视频处理产生的临时文件(重要!)
+scenes_cache/
+*.mp4
+*.jpg
+
+# 忽略 IDE 配置
+.vscode/
+.idea/
+
+ai_output/
+ai_upload/
+README.md

+ 10 - 0
.gitignore

@@ -0,0 +1,10 @@
+.idea/
+*logs*/
+venv/
+*.iml
+*.log
+*.db
+__pycache__
+*/__pycache__
+ai_output/
+ai_upload/

+ 23 - 0
Dockerfile

@@ -0,0 +1,23 @@
# CUDA base image: supplies the GPU runtime needed by torch/funasr inference.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

# ffmpeg is required by the audio/video pipeline (service/pyav.py);
# fonts-wqy-microhei provides CJK glyphs for subtitle rendering.
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    ffmpeg \
    curl \
    fonts-wqy-microhei \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first so the dependency layer is cached across code edits.
COPY ./requirements.txt /app/requirements.txt

RUN pip3 install --no-cache-dir --upgrade -r /app/requirements.txt

COPY . /app

EXPOSE 8010

# Exec form without `sh -c`: uvicorn runs as PID 1 and receives SIGTERM
# directly, so `docker stop` shuts it down cleanly instead of killing sh.
CMD ["uvicorn", "ai_server:app", "--host", "0.0.0.0", "--port", "8010"]

+ 17 - 0
README.md

@@ -0,0 +1,17 @@
+一个基于 Python 的 AI 项目, 依赖 CUDA, 通过在代码中加载 AI 模型和调用 ollama 提供的 AI 模型, 提供了以下功能:
+- 图像理解
+- 语音识别
+- 文本分析
+- 文本翻译
+
+## 依赖
+导出依赖
+```
+pip freeze > requirements.txt
+```
+> pip freeze 会导出当前环境下所有安装的包
+
+安装依赖
+```
+pip install -r requirements.txt
+```

+ 40 - 0
ai_server.py

@@ -0,0 +1,40 @@
import asyncio
import logging

# Configure logging before importing project modules so their module-level
# loggers inherit this format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
)

from contextlib import asynccontextmanager
from fastapi import FastAPI
from route import gpu, file, audio, text, image
import service.ai_task as ai_task
import service.ai_asr as pyasr

logger = logging.getLogger(__name__)
# Alternative: reuse uvicorn's logger
# logger = logging.getLogger("uvicorn.error")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup: launch ASR model loading in the background and start the GPU worker."""
    logger.info("🚀 服务已启动...")
    # Keep a reference on app.state: a bare create_task() result may be
    # garbage-collected before the coroutine finishes (asyncio docs warn
    # to retain task references for fire-and-forget tasks).
    app.state.asr_init_task = asyncio.create_task(pyasr.init_funasr())
    await ai_task.start_worker()
    yield
    logger.info("🛑 服务已停止")


app = FastAPI(title="GPU Worker Server", lifespan=lifespan)

# Mount the sub-routers.
app.include_router(gpu.router)
app.include_router(file.router)
app.include_router(audio.router)
app.include_router(text.router)
app.include_router(image.router)


@app.get("/")
async def root():
    return {"message": "Welcome to pyai"}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8010)

+ 22 - 0
docker-compose.yml

@@ -0,0 +1,22 @@
+services:
+  pyai:
+    image: pyai:12345678
+    container_name: pyai
+    restart: always
+    # Host networking: the app reaches Ollama at 127.0.0.1:11434 (see OLLAMA_HOST below)
+    network_mode: host
+    volumes:
+      - /etc/localtime:/etc/localtime:ro
+      # NOTE(review): .dockerignore/.gitignore use singular "ai_upload"/"ai_output"
+      # while these mounts use plural "ai_uploads"/"ai_outputs" — verify against
+      # setting.py's UPLOAD_DIR/OUTPUT_DIR, otherwise uploads/outputs are not
+      # persisted on the host.
+      - /opt/docker/pyai/ai_uploads:/app/ai_uploads
+      - /opt/docker/pyai/ai_outputs:/app/ai_outputs
+      # Model cache persisted so models are not re-downloaded on recreate
+      - /opt/docker/pyai/model_cache/modelscope:/root/.cache/modelscope
+    environment:
+      - MODELSCOPE_CACHE=/root/.cache/modelscope
+      - OLLAMA_HOST=http://127.0.0.1:11434
+      - PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

+ 56 - 0
main.py

@@ -0,0 +1,56 @@
+import json
+import os
+
+from setting import OUTPUT_DIR
+
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s'
+)
+
+import torch
+import service.pyav as pyav
+import service.pygpu as pygpu
+import service.ai_asr as ai_asr
+import service.ai_image_ollama as ai_image
+
+logger = logging.getLogger(__name__)
+
def process(audio_path, srt_path, video_path):
    """Transcribe one audio file to SRT and render the subtitled video.

    OOM is treated as a per-file failure (log, clean up, skip); any other
    error is logged with its traceback. GPU memory is cleared after every
    file regardless of outcome so the next file starts with free VRAM.
    """
    try:
        ai_asr.generate_srt(audio_path, srt_path)
        pyav.generate_video(audio_path, srt_path, video_path)
    except torch.OutOfMemoryError:
        logger.error(f"❌ 显存溢出!跳过文件: {audio_path}")
        pygpu.clear_gpu_memory()  # reclaim immediately after the OOM
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"💥 处理 {audio_path} 时发生未知错误: {e}")
    finally:
        # Always clear so the next file starts with maximal free VRAM.
        pygpu.clear_gpu_memory()
+
+
def _run_demo():
    """Demo: ASR → subtitled video, then scene detection + frame analysis."""
    # --- Speech pipeline: transcribe and burn subtitles into a video ----
    audio = 'abc.mp3'
    srt = 'abc.mp3.srt'
    video = 'abc.mp3.mp4'
    ai_asr.generate_srt(audio, srt)
    pyav.generate_video(audio, srt, video)

    # --- Video scene pipeline -------------------------------------------
    video = 'abc.mp4'
    video_dict = pyav.get_video_info(video)
    start_scene_times = pyav.get_scene_times(video, 0.3)
    # Optionally split the video into per-scene clips:
    # pyav.split_video_by_scenes(video, start_scene_times, OUTPUT_DIR)
    # Pick one representative timestamp inside every scene...
    target_times = pyav.calculate_mid_points(video, start_scene_times)
    # ...extract a frame at each of them with ffmpeg...
    scenes = pyav.extract_frames(video, target_times, OUTPUT_DIR)
    # ...and describe each frame with the vision model.
    scenes_result = ai_image.describe_frame(scenes)
    video_dict['scenes'] = scenes_result
    print(json.dumps(video_dict, indent=8, ensure_ascii=False))


if __name__ == "__main__":
    _run_demo()

+ 120 - 0
requirements.txt

@@ -0,0 +1,120 @@
+accelerate==1.13.0
+aliyun-python-sdk-core==2.16.0
+aliyun-python-sdk-kms==2.16.5
+annotated-doc==0.0.4
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.13.0
+audioread==3.1.0
+bitsandbytes==0.49.2
+certifi==2026.2.25
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.2
+contourpy==1.3.2
+crcmod==1.7
+cryptography==46.0.6
+cuda-bindings==13.2.0
+cuda-pathfinder==1.5.1
+cuda-toolkit==13.0.2
+cycler==0.12.1
+decorator==5.2.1
+editdistance==0.8.1
+exceptiongroup==1.3.1
+fastapi==0.135.3
+filelock==3.25.2
+fonttools==4.62.1
+fsspec==2026.3.0
+funasr==1.3.1
+h11==0.16.0
+hf-xet==1.4.3
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==1.9.0
+hydra-core==1.3.2
+idna==3.11
+jaconv==0.5.0
+jamo==0.4.1
+jieba==0.42.1
+Jinja2==3.1.6
+jmespath==0.10.0
+joblib==1.5.3
+kaldiio==2.18.1
+kiwisolver==1.5.0
+lazy-loader==0.5
+librosa==0.11.0
+llvmlite==0.47.0
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+mdurl==0.1.2
+modelscope==1.35.3
+mpmath==1.3.0
+msgpack==1.1.2
+networkx==3.4.2
+numba==0.65.0
+numpy==2.2.6
+nvidia-cublas==13.1.0.3
+nvidia-cuda-cupti==13.0.85
+nvidia-cuda-nvrtc==13.0.88
+nvidia-cuda-runtime==13.0.96
+nvidia-cudnn-cu13==9.19.0.56
+nvidia-cufft==12.0.0.61
+nvidia-cufile==1.15.1.6
+nvidia-curand==10.4.0.35
+nvidia-cusolver==12.0.4.66
+nvidia-cusparse==12.6.3.3
+nvidia-cusparselt-cu13==0.8.0
+nvidia-ml-py==13.595.45
+nvidia-nccl-cu13==2.28.9
+nvidia-nvjitlink==13.0.88
+nvidia-nvshmem-cu13==3.4.5
+nvidia-nvtx==13.0.85
+ollama==0.6.1
+omegaconf==2.3.0
+opencv-python==4.13.0.92
+oss2==2.19.1
+packaging==26.0
+pillow==12.2.0
+platformdirs==4.9.4
+pooch==1.9.0
+protobuf==7.34.1
+psutil==7.2.2
+pycparser==3.0
+pycryptodome==3.23.0
+pydantic==2.12.5
+pydantic_core==2.41.5
+Pygments==2.20.0
+pynndescent==0.6.0
+pyparsing==3.3.2
+python-dateutil==2.9.0.post0
+python-multipart==0.0.22
+pytorch-wpe==0.0.1
+PyYAML==6.0.3
+regex==2026.4.4
+requests==2.33.1
+rich==14.3.3
+safetensors==0.7.0
+scikit-learn==1.7.2
+scipy==1.15.3
+sentencepiece==0.2.1
+shellingham==1.5.4
+six==1.17.0
+soundfile==0.13.1
+soxr==1.0.0
+starlette==1.0.0
+sympy==1.14.0
+tensorboardX==2.6.5
+threadpoolctl==3.6.0
+tokenizers==0.22.2
+torch==2.11.0
+torch-complex==0.4.4
+torchaudio==2.11.0
+tqdm==4.67.3
+transformers==5.5.0
+triton==3.6.0
+typer==0.24.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+umap-learn==0.5.11
+urllib3==2.6.3
+uvicorn==0.43.0

+ 0 - 0
route/__init__.py


+ 99 - 0
route/audio.py

@@ -0,0 +1,99 @@
+import logging
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException
+import os
+import uuid
+import shutil
+from fastapi import UploadFile, File
+from starlette.concurrency import run_in_threadpool
+import service.ai_task as ai_task
+import service.pyav as pyav
+from setting import UPLOAD_DIR, OUTPUT_DIR
+
+logger = logging.getLogger(__name__)
+
+# 创建路由对象,可以统一设置前缀 (prefix) 和 标签 (tags)
+router = APIRouter(
+    prefix="/api1/audio",
+    tags=["audio"]
+)
+
@router.post("/asr")
async def upload_audio(file: UploadFile = File(...)):
    """Accept an audio upload and queue it for ASR + subtitle-video rendering."""
    # One task_id is used for every derived artifact so they stay correlated.
    task_id = str(uuid.uuid4())[:8]
    # splitext keeps the old behavior for normal names ("a.mp3" -> "mp3") but
    # no longer treats a dot-less filename as its own extension.
    ext = os.path.splitext(file.filename or "")[1].lstrip(".") or "bin"
    save_path = os.path.join(UPLOAD_DIR, f"{task_id}.{ext}")

    # Copy the upload stream in a worker thread so the synchronous disk IO
    # does not block the main event loop.
    def save_file():
        with open(save_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    await run_in_threadpool(save_file)

    # Output paths derived from the same task_id, then enqueue for the worker.
    srt_path = os.path.join(OUTPUT_DIR, f"{task_id}.srt")
    video_path = os.path.join(OUTPUT_DIR, f"{task_id}.mp4")

    await ai_task.put_task(task_id, save_path, srt_path, video_path)
    return {
        "status": "queued",
        "task_id": task_id,
        "message": "文件已上传并加入 GPU 处理队列",
        "srt_preview_path": f"{OUTPUT_DIR}/{task_id}.srt"
    }
+
@router.get("/tasks")
async def get_queue_status():
    """Report how many jobs are currently waiting in the GPU queue."""
    queue_size = ai_task.get_tasks()
    return {"queue_size": queue_size}
+
@router.get("/result/{task_id}")
async def get_asr_result(task_id: str):
    """Collect every artifact produced for *task_id* and return them as JSON."""
    file_name = check_file_prefix(UPLOAD_DIR, task_id)
    if not file_name:
        raise HTTPException(status_code=404, detail="音频文件不存在")
    audio_path = f"{UPLOAD_DIR}/{file_name}"

    txt_path = f"{OUTPUT_DIR}/{task_id}.txt"
    srt_path = f"{OUTPUT_DIR}/{task_id}.srt"
    video_path = f"{OUTPUT_DIR}/{task_id}.mp4"

    # Fail fast with a specific message for whichever artifact is missing.
    for path, missing_msg in (
        (txt_path, "音频文本文件不存在"),
        (srt_path, "字幕文件不存在"),
        (video_path, "视频文件不存在"),
    ):
        if not os.path.exists(path):
            raise HTTPException(status_code=404, detail=missing_msg)

    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()

    info = pyav.get_media_info(audio_path)
    srt = pyav.parse_srt_to_list(srt_path)
    return {
        "task_id": task_id,
        "duration": info['duration'],
        "text": text,
        "srt": srt,
        "audio_url": f"/api1/file/audio/{file_name}",
        "video_url": f"/api1/file/video/{task_id}.mp4"
    }
+
+
def check_file_prefix(directory, prefix):
    """Return the name of the unique file in *directory* starting with *prefix*.

    Returns None when zero or more than one file matches, so callers can
    treat any ambiguity the same as "not found".
    """
    matches = list(Path(directory).glob(f"{prefix}*"))
    if len(matches) != 1:
        return None
    return matches[0].name

+ 56 - 0
route/file.py

@@ -0,0 +1,56 @@
+from fastapi import APIRouter, HTTPException
+from fastapi.responses import FileResponse
+import os
+from setting import UPLOAD_DIR, OUTPUT_DIR
+
+router = APIRouter(prefix="/api1/file", tags=["file"])
+
@router.get("/image/{filename}")
async def get_image(filename: str):
    """Serve an uploaded image by filename from UPLOAD_DIR.

    Path safety and existence checks are delegated to get_file().
    """
    file_path = os.path.join(UPLOAD_DIR, filename)
    return get_file(file_path)
+
+
@router.get("/audio/{filename}")
async def get_audio(filename: str):
    """Serve an uploaded audio file by filename from UPLOAD_DIR."""
    file_path = os.path.join(UPLOAD_DIR, filename)
    return get_file(file_path)
+
+
@router.get("/video/{filename}")
async def get_video(filename: str):
    """Serve a generated video by filename from OUTPUT_DIR."""
    file_path = os.path.join(OUTPUT_DIR, filename)
    return get_file(file_path)
+
+
def get_file(file_path):
    """Validate *file_path* and stream it back as a FileResponse.

    Rejects paths that resolve outside UPLOAD_DIR/OUTPUT_DIR (directory
    traversal) with 403, and missing files with 404.
    """
    real_path = os.path.realpath(file_path)
    allowed_roots = (os.path.realpath(UPLOAD_DIR), os.path.realpath(OUTPUT_DIR))
    # Compare whole path components, not raw string prefixes, so a sibling
    # directory such as "ai_upload_evil" cannot slip past the check.
    permitted = any(
        os.path.commonpath([real_path, root]) == root for root in allowed_roots
    )
    if not permitted:
        raise HTTPException(status_code=403, detail="拒绝访问该路径")

    if not os.path.exists(real_path):
        # Generic wording: this helper serves images and audio too, not only video.
        raise HTTPException(status_code=404, detail="文件不存在")

    # media_type is inferred from the file suffix by FileResponse.
    return FileResponse(real_path)

+ 25 - 0
route/gpu.py

@@ -0,0 +1,25 @@
+import logging
+from fastapi import APIRouter, HTTPException
+import service.pygpu as pygpu
+
+logger = logging.getLogger(__name__)
+
+# 创建路由对象,可以统一设置前缀 (prefix) 和 标签 (tags)
+router = APIRouter(
+    prefix="/api1/gpu",
+    tags=["gpu"]
+)
+
@router.get("/info")
async def gpu_info():
    """Aggregate GPU, torch and Ollama resource usage into one payload."""
    try:
        return {
            "gpu": pygpu.get_gpu_memory_info(0),
            "torch": pygpu.get_torch_memory_usage(),
            "ollama": pygpu.get_ollama_resource(),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

+ 55 - 0
route/image.py

@@ -0,0 +1,55 @@
+import logging
+import os
+import uuid
+from typing import List
+
+from fastapi import APIRouter, UploadFile, File, Form, HTTPException
+import shutil
+from starlette.concurrency import run_in_threadpool
+from setting import UPLOAD_DIR
+import service.ai_image_ollama as ai_image
+
+logger = logging.getLogger(__name__)
+
# Router for the image endpoints; prefix and tag apply to every route below.
router = APIRouter(
    prefix="/api1/image",
    tags=["image"]  # was ["video"]: copy-paste slip — this router serves images
)
+
@router.post("/analyze")
async def upload_image(
        file: UploadFile = File(...),
        prompts: List[str] = Form(...)
):
    """Save the uploaded image and run every prompt against the vision model."""
    if not file:
        raise HTTPException(status_code=400, detail="文件不能为空")

    if not prompts:
        raise HTTPException(status_code=400, detail="Prompts 不能为空")

    task_id = str(uuid.uuid4())[:8]
    # splitext keeps the old behavior for normal names ("a.jpg" -> "jpg") but
    # no longer treats a dot-less filename as its own extension.
    ext = os.path.splitext(file.filename or "")[1].lstrip(".") or "bin"
    save_filename = f"{task_id}.{ext}"
    save_path = os.path.join(UPLOAD_DIR, save_filename)

    # Synchronous copy, pushed to the threadpool so the event loop stays free.
    def save_file():
        with open(save_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    await run_in_threadpool(save_file)

    try:
        with open(save_path, 'rb') as f:
            image_bytes = f.read()
        # The Ollama call is synchronous; run it in the threadpool so one slow
        # inference does not block every other request on the event loop.
        result = await run_in_threadpool(ai_image.describe_image, prompts, image_bytes)
        return {
            "task_id": task_id,
            "model_name": result['model_name'],
            "image_url": f"/api1/file/image/{save_filename}",
            "results": result['results']
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"AI 推理失败: {str(e)}")

+ 89 - 0
route/text.py

@@ -0,0 +1,89 @@
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+import service.ai_text_ollama as ai_text
+
+logger = logging.getLogger(__name__)
+
+# 创建路由对象,可以统一设置前缀 (prefix) 和 标签 (tags)
+router = APIRouter(
+    prefix="/api1/text",
+    tags=["text"]
+)
+
# Request body for /translate.
class TranslationRequest(BaseModel):
    text: str


@router.post("/translate")
async def translate(request: TranslationRequest):
    """Translate the submitted text into Chinese via the Ollama text model."""
    if not request.text:
        raise HTTPException(status_code=400, detail="输入文本不能为空")

    try:
        translated = ai_text.translate2zh(request.text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"original": request.text, "translation": translated}
+
+
# Request body shared by /summarize and /tag.
class SummarizeRequest(BaseModel):
    text: str
    prompt: Optional[str] = ""  # optional extra prompt; empty string when absent


@router.post("/summarize")
async def get_summarize(request: SummarizeRequest):
    """Produce a short (≤256 chars) Chinese summary of the submitted text."""
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="输入文本不能为空")

    max_length = 256
    combined_prompt = f"""
            你是一个专业的视频内容分析师。请对以下识别结果进行精炼的中文总结。

            要求字数不超过{max_length}字,语气客观专业:
            {request.text}
            """
    try:
        result = ai_text.summarize(combined_prompt)
        return {
            "model_name": result['model_name'],
            "prompt": combined_prompt,
            "result": result['result']
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"总结失败: {str(e)}")
+
+
@router.post("/tag")
async def get_tag(request: SummarizeRequest):
    """Extract 1-10 Chinese keyword tags from the submitted text."""
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="输入文本不能为空")

    combined_prompt = f"""
            你是一个专业的视频内容分析师。请从下方文本中提取出 1-10 个关键词标签。

            限制条件:
            1. 标签必须是中文。
            2. 标签要包含:人物特征、动作行为、场景环境、氛围。
            3. 严禁输出任何解释性文字,只输出标签。
            4. 格式要求:标签之间用英文逗号分隔。

            文本内容:
            {request.text}

            标签结果:
            """
    try:
        result = ai_text.summarize(combined_prompt)
        return {
            "model_name": result['model_name'],
            "prompt": combined_prompt,
            "result": result['result']
        }
    except Exception as e:
        # Was "总结失败" — copied from /summarize; name the actual operation.
        raise HTTPException(status_code=500, detail=f"标签提取失败: {str(e)}")

+ 0 - 0
service/__init__.py


+ 84 - 0
service/ai_asr.py

@@ -0,0 +1,84 @@
+import logging
+import asyncio
+import re
+import time
+from typing import Optional
+
+import service.pyav as pyav
+import torch
+from funasr import AutoModel
+
+logger = logging.getLogger(__name__)
+asr_model: Optional[AutoModel] = None
+is_model_ready = False
+
+
def get_asr_model():
    """Return the module-level ASR model instance (None until loaded)."""
    # No `global` needed: this only reads the module variable.
    return asr_model


def check_ready():
    """True once init_funasr() has finished loading the model."""
    return is_model_ready
+
+
async def init_funasr():
    """Load the funasr model in a worker thread without blocking the event loop.

    No-op when already loaded; on failure the ready flag stays False so
    callers (the queue worker) keep waiting.
    """
    global asr_model, is_model_ready

    if is_model_ready:
        return

    logger.info("⏳ [ASR] 开始异步加载 funasr 模型...")
    start_time = time.time()

    try:
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() has been deprecated here since Python 3.10.
        loop = asyncio.get_running_loop()

        def load():
            # Heavy, synchronous model construction — runs in the default executor.
            return AutoModel(
                model="paraformer-zh",
                vad_model="fsmn-vad",
                vad_kwargs={"max_single_segment_time": 30000},
                punc_model="ct-punc",
                device="cuda:0" if torch.cuda.is_available() else "cpu",
                disable_update=True
            )

        asr_model = await loop.run_in_executor(None, load)
        is_model_ready = True
        logger.info(f"✅ [ASR] 模型加载成功!耗时 {(time.time() - start_time):.2f}s")
    except Exception as e:
        logger.error(f"❌ [ASR] 模型加载失败: {e}")
        is_model_ready = False
+
+
def get_text(audio_path):
    """Run ASR over one audio file; return its text and token timestamps."""
    started = time.time()
    logger.info("⏳ 开始进行音频识别...")
    result = asr_model.generate(input=[audio_path], cache={}, batch_size_s=300)
    logger.info(f"✅ 音频识别完成, 耗时 {(time.time() - started):.2f}秒")

    first = result[0]
    return {
        # Strip the spaces funasr inserts between tokens.
        'text': first['text'].replace(" ", ""),
        'timestamps': first['timestamp'],
    }
+
+
def generate_srt(audio_path, srt_path):
    """Transcribe *audio_path* and write a subtitle file to *srt_path*."""
    recognized = get_text(audio_path)
    # Split on CJK punctuation while keeping the punctuation tokens, so each
    # subtitle line ends at a natural sentence boundary.
    pieces = re.split(r"([。!?;,])", recognized['text'])
    srt_entries = pyav.get_precise_srt(pieces, recognized['timestamps'])
    pyav.save_srt_file(srt_entries, srt_path)

+ 80 - 0
service/ai_image_ollama.py

@@ -0,0 +1,80 @@
+import logging
+from setting import ollama_client
+
+logger = logging.getLogger(__name__)
+model_name = 'moondream'
+
def describe_frame(scene_list):
    """Run the analysis prompts against every sampled scene frame.

    Returns one dict per scene with the scene boundaries, the frame path,
    and the model's answer for each prompt. A failed prompt is logged and
    skipped; the scene entry is still returned.
    """
    # Build the prompt list once instead of once per scene (the original
    # rebuilt it inside the loop).
    prompts = [
                """
                Analyze the physical interaction between the individuals. Is there any intimate or sexual contact visible? Describe the positioning of their bodies and limbs objectively
                """,
                """
                Describe the clothing status of all individuals. Is there any visible nudity, undergarments, or partially exposed sensitive areas? Identify the specific body parts shown.
                """,
                """
                Observe the posture and movement. Does the scene depict a sexual act or a highly suggestive sexual position?provide a neutral description of the pose.
                """
    ]

    results = []
    logger.info(f"🚀 开始分析视频,共有 {len(scene_list)} 个场景待处理...")
    for scene_idx, scene in enumerate(scene_list):
        logger.info(f"\n🎬 正在处理场景 {scene_idx + 1}/{len(scene_list)} (时间点: {scene['frame_pos']}s)")
        frame_path = scene['frame_path']
        with open(frame_path, 'rb') as f:
            image_bytes = f.read()

        image_data = {
            "scene_start": scene['scene_start'],
            "scene_end": scene['scene_end'],
            "frame_pos": scene['frame_pos'],
            "frame_path": frame_path,
            "prompts": []
        }

        # Distinct index name: the original reused `i` here, shadowing the
        # scene-loop variable.
        for prompt_idx, p in enumerate(prompts):
            try:
                response = ollama_client.chat(
                    model=model_name,
                    messages=[{
                        'role': 'user',
                        'content': p,
                        'images': [image_bytes]
                    }]
                )
                image_data["prompts"].append({
                    "prompt": p,
                    "result": response['message']['content'].strip()
                })
            except Exception as e:
                logger.error(f"❌ Prompt {prompt_idx + 1} 推理失败: {e}")

        results.append(image_data)

    return results
+
+
def describe_image(prompts, image_bytes):
    """Ask the vision model each prompt about one image and collect the answers."""
    answers = []
    for p in prompts:
        response = ollama_client.chat(
            model=model_name,
            messages=[{
                'role': 'user',
                'content': p,
                'images': [image_bytes]
            }]
        )
        answers.append({
            "prompt": p,
            "result": response['message']['content'].strip()
        })

    return {
        "model_name": model_name,
        "results": answers
    }

+ 79 - 0
service/ai_task.py

@@ -0,0 +1,79 @@
+import logging
+import time
+import re
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+
+import service.pyav as pyav
+import service.pygpu as pygpu
+import service.ai_text_ollama as ai_text
+import service.ai_asr as pyasr
+from setting import OUTPUT_DIR
+
+logger = logging.getLogger(__name__)
+executor = ThreadPoolExecutor(max_workers=1)
+task_queue = asyncio.Queue()
+
# Keep a reference to the worker task: a bare create_task() result may be
# garbage-collected before the coroutine finishes (asyncio docs).
_worker_task = None


async def start_worker():
    """Spawn the queue consumer and retain a reference to it."""
    global _worker_task
    _worker_task = asyncio.create_task(gpu_worker())
+
+
# --- Background worker (consumer) ---
async def gpu_worker():
    """Consume the task queue forever: ASR → srt/txt → subtitled video.

    Waits until the ASR model is loaded, then processes one task at a time.
    All heavy synchronous work runs in the single-thread executor so the
    event loop keeps serving HTTP requests.
    """
    logger.info("🏃 Worker 开始监听任务队列")
    # get_running_loop() is the supported call inside a coroutine;
    # get_event_loop() is deprecated here since Python 3.10. Hoisted out
    # of the loop since it never changes.
    loop = asyncio.get_running_loop()
    while True:
        if not pyasr.check_ready():
            logger.info("休眠 10s 等待模型加载完成...")
            await asyncio.sleep(10)
            continue

        task_id, audio_path, srt_path, video_path = await task_queue.get()
        logger.info(f"⚡ 开始处理任务 [{task_id}]: {audio_path}")
        try:
            start_t = time.time()
            # Run the synchronous recognizer in the dedicated executor; while
            # the GPU works, FastAPI can still accept and enqueue new requests.
            result = await loop.run_in_executor(executor, pyasr.get_text, audio_path)

            text = result['text']
            # Split on CJK punctuation (keeping it) for natural subtitle lines.
            text_list = re.split(r"([。!?;,])", text)
            srt_list = pyav.get_precise_srt(text_list, result['timestamps'])
            pyav.save_srt_file(srt_list, srt_path)

            with open(f"{OUTPUT_DIR}/{task_id}.txt", "w", encoding="utf-8") as f:
                f.write(text)

            await loop.run_in_executor(
                executor,
                pyav.generate_video,
                audio_path,
                srt_path,
                video_path
            )
            logger.info(f"🎉 任务 [{task_id}] 完成,耗时: {time.time() - start_t:.2f}s")
        except Exception as e:
            logger.error(f"❌ 任务 [{task_id}] 失败: {str(e)}")
        finally:
            task_queue.task_done()
            # Free GPU memory between tasks so the next job starts clean.
            pygpu.clear_gpu_memory()
+
+
async def put_task(task_id, save_path, srt_path, video_path):
    """Queue one ASR job as a (task_id, audio, srt, video) tuple."""
    await task_queue.put((task_id, save_path, srt_path, video_path))


def get_tasks():
    """Number of jobs currently waiting in the queue."""
    return task_queue.qsize()


def translate_to_zh(text):
    """Thin wrapper over the Ollama translator."""
    return ai_text.translate2zh(text)

+ 67 - 0
service/ai_text.py

@@ -0,0 +1,67 @@
+import logging
+import os
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+
+import time
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import torch
+
+logger = logging.getLogger(__name__)
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
start_time = time.time()
logger.info(f"⏳ 开始加载 {model_name} 模型...")
try:
    # 4-bit NF4 quantization keeps the 1.5B model inside a small VRAM budget.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,  # fp16 compute suits GTX 1650-class cards
        bnb_4bit_quant_type="nf4",  # high-accuracy quantization type
        bnb_4bit_use_double_quant=True  # squeezes a bit more VRAM
    )

    # Load the model with the quantization config, auto-placed on the GPU.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"✅ {model_name} 模型加载成功!耗时 {(time.time() - start_time):.2f}秒")
except Exception as e:
    # A load failure is fatal for this module: log once at ERROR (the original
    # logged twice at INFO, the second line even carrying a success emoji).
    logger.error(f"❌ {model_name} 模型加载失败: {e}")
    raise
+
+
def translate2zh(text):
    """Translate an English image description into Chinese with the local LLM."""
    prompt = f"你是一个专业的图像描述翻译官。请将下面这段英文描述翻译成自然、地道的中文,直接输出结果,不要解释:\n{text}"

    chat = [{"role": "user", "content": prompt}]
    templated = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([templated], return_tensors="pt").to(model.device)

    # Greedy decoding: deterministic output is preferable for translation.
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=128,
            do_sample=False
        )

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Everything after the final "assistant\n" marker is the model's reply.
    return decoded.split("assistant\n")[-1].strip()
+
+
+def summarize():
+    # TODO: not implemented; route/text.py imports service.ai_text_ollama as
+    # ai_text and uses its summarize() instead of this one.
+    pass

+ 44 - 0
service/ai_text_ollama.py

@@ -0,0 +1,44 @@
+import logging
+import time
+from setting import ollama_client
+
+logger = logging.getLogger(__name__)
+model_name = "qwen2.5:1.5b"
+
def translate2zh(text):
    """Translate *text* into Chinese through the Ollama chat endpoint."""
    started = time.time()
    logger.info(f"⏳ 正在通过 Ollama 调用 {model_name}...")

    prompt = f"你是一个专业的图像描述翻译官。请将下面这段英文描述翻译成自然、地道的中文,直接输出结果,不要解释:\n{text}"
    try:
        # Ollama handles quantized loading and VRAM placement itself.
        response = ollama_client.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': prompt}],
        )
    except Exception as e:
        logger.error(f"❌ Ollama 调用失败: {e}")
        raise

    duration = time.time() - started
    logger.info(f"✅ 推理成功!耗时 {duration:.2f}秒")
    return response['message']['content']
+
+
def summarize(content):
    """One-shot summarization call; returns the model name and its reply."""
    # Low temperature keeps the summary stable across calls.
    response = ollama_client.chat(
        model=model_name,
        messages=[{'role': 'user', 'content': content}],
        options={"temperature": 0.3}
    )
    return {
        "model_name": model_name,
        "result": response['message']['content'].strip()
    }

+ 533 - 0
service/pyav.py

@@ -0,0 +1,533 @@
+import json
+import os
+import re
+import hashlib
+import logging
+import shutil
+import subprocess
+from pathlib import Path
+import cv2
+import librosa
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
def get_video_info(video_path):
    """Read basic metadata of a video file with OpenCV.

    Args:
        video_path: path to the video file.

    Returns:
        dict with duration (s), file size (bytes), resolution, an
        orientation flag and an empty ``scenes`` list, or ``None`` when
        the file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Release even on failure so the capture handle is not leaked.
        cap.release()
        return None

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard against streams that report fps == 0.
        duration = int(frame_count / fps) if fps > 0 else 0
        size_byte = int(os.path.getsize(video_path))

        # Orientation flag: 1 = landscape, 0 = portrait.
        horizontal = 1 if width >= height else 0
    finally:
        # Always release the capture, even if a property read raises.
        cap.release()

    return {
        "video_path": video_path,
        "duration": duration,
        "size_byte": size_byte,
        "width": width,
        "height": height,
        "horizontal": horizontal,
        "scenes": []
    }
+
+
def get_media_info(media_path):
    """Probe a media file with ffprobe and return basic metadata (incl. audio).

    Args:
        media_path: path to the media file.

    Returns:
        dict with duration, size, resolution, orientation, audio presence
        and audio stream details, or ``None`` when ffprobe fails.
    """
    # 1. Ask ffprobe for full stream/format information as JSON.
    cmd = [
        'ffprobe', '-v', 'quiet', '-print_format', 'json',
        '-show_streams', '-show_format', media_path
    ]

    try:
        result = subprocess.check_output(cmd, encoding='utf-8')
        data = json.loads(result)
    except Exception as e:
        # Use the module logger (not print) like the rest of this module.
        logger.error(f"ffprobe 解析失败: {e}")
        return None

    # 'streams'/'format' can be absent for unreadable files — default safely.
    streams = data.get('streams', [])
    video_stream = next((s for s in streams if s.get('codec_type') == 'video'), None)
    audio_stream = next((s for s in streams if s.get('codec_type') == 'audio'), None)

    # 2. Basic video information.
    width = int(video_stream.get('width', 0)) if video_stream else 0
    height = int(video_stream.get('height', 0)) if video_stream else 0
    duration = float(data.get('format', {}).get('duration', 0))
    size_byte = int(os.path.getsize(media_path))
    # Orientation flag: 1 = landscape, 0 = portrait.
    horizontal = 1 if width >= height else 0

    # 3. Assemble the result structure.
    info = {
        "media_path": media_path,
        "duration": round(duration, 2),
        "size_byte": size_byte,
        "width": width,
        "height": height,
        "horizontal": horizontal,
        "has_audio": audio_stream is not None,
        "audio_info": {
            "codec": audio_stream.get('codec_name'),
            "sample_rate": audio_stream.get('sample_rate'),
            "channels": audio_stream.get('channels'),
            "bit_rate": audio_stream.get('bit_rate')
        } if audio_stream else None,
        "scenes": []
    }

    return info
+
+
def get_scene_times(video_path, threshold=0.3):
    """Detect scene-change start times (in seconds) with FFmpeg.

    Runs the ``select`` scene-detection filter plus ``showinfo`` and parses
    the selected frames' ``pts_time`` values from FFmpeg's stderr log.

    Args:
        video_path: input video path.
        threshold: scene score threshold (0-1); higher means fewer cuts.

    Returns:
        Ascending list of scene start times; always starts at (or near) 0.0.

    Raises:
        RuntimeError: when FFmpeg is missing, fails, or its log cannot
            be read.
    """
    cmd = [
        'ffmpeg',
        '-hide_banner',
        '-i', video_path,
        '-threads', '0',
        # Select frame 0 plus every frame whose scene score exceeds the
        # threshold; showinfo logs each selected frame's timestamp.
        '-vf', f"select='eq(n,0)+gt(scene,{threshold})',showinfo",
        '-vsync', 'vfr',
        '-f', 'null', '-'  # detection only — nothing is written to disk
    ]

    scene_start_times = []
    try:
        # showinfo writes to stderr; stdout carries nothing useful here.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            encoding='utf-8'
        )

        try:
            for line in process.stderr:
                if "pts_time:" in line:
                    # showinfo prints timestamps with %.6g, so integral values
                    # appear WITHOUT a fractional part (e.g. "pts_time:0");
                    # the decimals must therefore be optional in the pattern.
                    match = re.search(r"pts_time:(\d+(?:\.\d+)?)", line)
                    if match:
                        time_val = float(match.group(1))
                        scene_start_times.append(time_val)
                        logger.info(f"检测到新场景起始点: {time_val}s")
        except Exception as e:
            process.kill()  # reader crashed — do not leave FFmpeg running
            raise RuntimeError(f"读取 FFmpeg 输出时发生错误: {e}")

        # Wait for completion and verify the exit status.
        process.wait()

        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, cmd)

    except FileNotFoundError:
        # Raised when the 'ffmpeg' binary is not on PATH.
        raise RuntimeError("系统未安装 FFmpeg 或未将其添加到环境变量 PATH 中")

    except subprocess.CalledProcessError as e:
        # FFmpeg itself failed (decode error, bad arguments, ...).
        raise RuntimeError(f"FFmpeg 处理视频失败,退出码: {e.returncode}")

    except Exception as e:
        raise RuntimeError(f"发生未知错误: {e}")

    if not scene_start_times:
        return [0.0]

    if scene_start_times[0] > 0.5:
        # Ensure the list covers the very beginning of the video.
        scene_start_times.insert(0, 0.0)

    return scene_start_times
+
+
def split_video_by_scenes(video_path, scene_start_times, output_dir="segment"):
    """Split a video into per-scene segments using stream copy (no re-encode).

    Args:
        video_path: source video path.
        scene_start_times: ascending scene start times in seconds.
        output_dir: directory for ``segment_NNN.mp4`` files; created if
            missing (FFmpeg does not create directories itself).
    """
    if not scene_start_times:
        logger.info("没有检测到场景,跳过分割。")
        return

    # Fix: ensure the output directory exists before FFmpeg writes into it.
    os.makedirs(output_dir, exist_ok=True)

    # Append a sentinel end marker so the loop can compute durations; the
    # last segment runs to the end of the file, so no -t is needed there
    # and the exact total duration is not required.
    times = scene_start_times + [None]
    for i in range(len(times) - 1):
        start_time = times[i]
        next_time = times[i + 1]

        output_file = f"{output_dir}/segment_{i:03d}.mp4"

        # -ss before -i gives fast keyframe-based seeking.
        cmd = [
            'ffmpeg', '-hide_banner', '-y',
            '-ss', str(start_time),
            '-i', video_path
        ]

        # All but the last segment get an explicit duration via -t.
        if next_time is not None:
            duration = next_time - start_time
            cmd.extend(['-t', str(duration)])

        # Stream copy: no re-encoding, so splitting is nearly instant.
        cmd.extend(['-c', 'copy', output_file])

        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
            logger.info(f"完成: {output_file} (起始点: {start_time}s)")
        except subprocess.CalledProcessError as e:
            # Log at error level (was info); e.stderr is bytes and may be None.
            stderr_text = e.stderr.decode() if e.stderr else ''
            logger.error(f"分割片段 {i} 失败: {stderr_text}")
+
+
def calculate_mid_points(video_path, scene_start_times):
    """Build per-scene records with a representative frame position.

    Each scene's frame is sampled at its midpoint, except for very short
    scenes (< 0.5s) where the start time is used.  The last scene ends at
    the total duration reported by ffprobe.

    Args:
        video_path: video file to probe for total duration.
        scene_start_times: ascending scene start times in seconds.

    Returns:
        list of {"scene_start", "scene_end", "frame_pos"} dicts (seconds,
        rounded to 3 decimals).

    Raises:
        RuntimeError: when ffprobe cannot report the video duration.
    """
    # Total duration via ffprobe (needed as the last scene's boundary).
    probe_cmd = [
        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1', video_path
    ]
    try:
        total_duration = float(subprocess.check_output(probe_cmd).decode().strip())
    except Exception as e:
        raise RuntimeError(f"无法获取视频时长: {e}")

    # Each scene ends where the next one starts; the final boundary is EOF.
    boundaries = list(scene_start_times) + [total_duration]

    scene_infos = []
    for idx, begin in enumerate(scene_start_times):
        finish = boundaries[idx + 1]
        span = finish - begin

        # Midpoint sampling, except for sub-0.5s scenes (use the start).
        frame_at = begin if span < 0.5 else (begin + finish) / 2

        scene_infos.append({
            "scene_start": round(begin, 3),
            "scene_end": round(finish, 3),
            "frame_pos": round(frame_at, 3)
        })

    return scene_infos
+
+
def extract_frames(video_path, scenes, output_dir="thumbnails"):
    """Extract one preview frame per scene with FFmpeg.

    Args:
        video_path: source video path.
        scenes: list of dicts carrying a ``frame_pos`` timestamp (seconds);
            mutated in place.
        output_dir: directory for the JPEG thumbnails (created if missing).

    Returns:
        The same ``scenes`` list with ``frame_path`` set to the absolute
        JPEG path, or ``None`` for frames that failed to extract.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Absolute paths so callers get usable locations regardless of CWD.
    abs_output_dir = os.path.abspath(output_dir)

    logger.info(f"开始执行抽帧任务,目标数量: {len(scenes)}")

    for i, scene in enumerate(scenes):
        ts = scene["frame_pos"]
        # File naming convention: index + timestamp.
        file_name = f"scene_{i + 1:03d}_{ts}s.jpg"
        output_file_path = os.path.join(abs_output_dir, file_name)

        # -ss before -i gives fast keyframe seeking; -y placed before the
        # output (conventional option order) to overwrite without prompting.
        cmd = [
            'ffmpeg', '-hide_banner', '-loglevel', 'error', '-y',
            '-ss', str(ts),
            '-i', video_path,
            '-frames:v', '1',
            '-q:v', '2',
            '-vf', 'scale=640:-1',  # downscaled preview: faster to encode
            output_file_path
        ]

        try:
            subprocess.run(cmd, check=True)
            # Record the absolute path on success.
            scene["frame_path"] = output_file_path

            if (i + 1) % 5 == 0 or (i + 1) == len(scenes):
                logger.info(f"进度: {i + 1}/{len(scenes)}")
        except subprocess.CalledProcessError:
            # Fix: failures were logged at info level — use error instead.
            logger.error(f"错误: 无法提取 {ts}s 处的帧")
            scene["frame_path"] = None

    logger.info(f"任务完成,存储路径: {abs_output_dir}")
    return scenes
+
+
+# subtitles 滤镜位于 filter_complex 的字符串内部,FFmpeg 会对其进行二次解析。如果路径包含 \、: 或空格,解析就会崩溃
+# 将 srt 文件临时改名为一个完全合法的名字并复制文件, 处理完成后再删除复制的文件
def get_safe_temp_srt(srt_path):
    """Derive a filter-safe temporary .srt path (SHA-256 of the resolved path).

    FFmpeg re-parses the ``subtitles`` filter argument, so paths containing
    backslashes, colons or spaces break it; a pure-hex filename placed in
    the same directory as the original is always safe to embed.

    Args:
        srt_path: original subtitle file path.

    Returns:
        pathlib.Path of the hash-named sibling ``.srt`` file.
    """
    resolved = Path(srt_path).resolve()
    # Hash the path string (not the file contents) — cheap and deterministic.
    digest = hashlib.sha256(str(resolved).encode('utf-8')).hexdigest()
    return resolved.parent / f"{digest}.srt"
+
+
def generate_video(audio_path, srt_path, video_output):
    """Render an audio file into a waveform video with burned-in subtitles.

    The SRT file is first copied to a hash-named sibling (see
    get_safe_temp_srt) because the ``subtitles`` filter argument is
    re-parsed by FFmpeg and breaks on paths with ``\\``, ``:`` or spaces;
    the copy is always removed in the ``finally`` block.

    Args:
        audio_path: input audio file.
        srt_path: UTF-8 SRT subtitle file.
        video_output: output video path (overwritten if it exists).

    Raises:
        RuntimeError: if FFmpeg exits non-zero; the message includes the
            last 20 lines of FFmpeg's log for diagnosis.
    """
    # Pre-process the srt path:
    # 1. resolve it to an absolute path
    # 2. normalize to forward slashes
    # 3. sidestep the subtitles-filter escaping (':' would need '\:') by
    #    using a hash-named copy instead of escaping in place
    temp_srt_path = get_safe_temp_srt(srt_path)
    shutil.copy(srt_path, temp_srt_path)

    font_name = 'WenQuanYi Micro Hei'
    font_size = 20
    font_color = '&H0000FFFF&'  # libass &HAABBGGRR& style color

    """调用 FFmpeg 合成视频"""
    # Encoder notes:
    # -rc vbr: variable bitrate mode
    # -cq 28: quality target — larger value, smaller file (24-32 typical)
    # -b:v 0: under cq control, let the encoder choose the bitrate freely
    command = [
        'ffmpeg', '-y',
        '-hide_banner',
        '-i', audio_path,
        '-filter_complex',
        f"[0:a]showwaves=s=854x480:mode=line:colors=0x00FFFF[v];"
        f"[v]subtitles={temp_srt_path}:charenc=UTF-8:force_style='FontName={font_name},FontSize={font_size},PrimaryColour={font_color},Alignment=2'[v_out]",
        '-map', '[v_out]',
        '-map', '0:a',
        '-c:v', 'libx264',  # CPU encoding gives a better compression ratio
        '-preset', 'veryfast',  # speed preset; 'medium' is smaller but slower
        '-crf', '28',  # quality: 23 is default; 28 is plenty for a 480p waveform
        '-pix_fmt', 'yuv420p',  # broad player compatibility
        '-c:a', 'aac',
        '-b:a', '128k',  # cap audio bitrate at 128k
        '-shortest',
        video_output
    ]

    try:
        # Launch FFmpeg with stderr merged into stdout — progress lines are
        # emitted on stderr, so one reader covers everything.
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into stdout, single reader
            text=True,
            encoding='utf-8',
            errors='replace'
        )

        # Rolling buffer of recent log lines for error reporting.
        error_log_buffer = []
        logger.info(f"🎬 开始合成视频: {video_output}")
        # Stream FFmpeg's output line by line until the process exits.
        while True:
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break

            if line:
                clean_line = line.strip()
                error_log_buffer.append(clean_line)
                # Keep only the 20 most recent lines.
                if len(error_log_buffer) > 20:
                    error_log_buffer.pop(0)

                # Progress lines look like "frame= 123 fps= 30 size= 512kB
                # time=00:00:05.12 ..."; only these are logged to avoid
                # flooding the log with every FFmpeg message.
                if "frame=" in clean_line or "size=" in clean_line:
                    logger.info(f"\rFFmpeg 进度: {clean_line}")

        # Check the final exit status.
        process.wait()

        if process.returncode != 0:
            # Attach the buffered tail of the log to the error.
            last_errors = "\n".join(error_log_buffer)
            raise RuntimeError(
                f"FFmpeg 执行失败 (退出码 {process.returncode})\n"
                f"--- 最后 20 行日志 ---\n{last_errors}"
            )

        logger.info(f"\n🚀 视频合成成功: {video_output}")
    finally:
        # Always delete the temporary srt copy, even on failure.
        if temp_srt_path.exists():
            temp_srt_path.unlink()
+
+
def get_precise_srt(text_list, timestamp_list, max_chars=20):
    """Align ASR text fragments with word-level timestamps into SRT entries.

    Args:
        text_list: alternating fragments [text, punct, text, punct, ...];
            each punctuation item is merged onto the preceding text.
            NOTE(review): assumed shape — confirm against the ASR
            post-processing that produces this list.
        timestamp_list: per-word [start_ms, end_ms] pairs (Paraformer-style;
            punctuation carries no timestamps — presumably, verify).
        max_chars: hard wrap width for a single subtitle line.

    Returns:
        list of {"line": int, "time": "start --> end", "text": str} dicts
        suitable for save_srt_file().
    """
    total_ts = len(timestamp_list)
    raw_parts = text_list
    sentences = []
    # Merge each punctuation fragment onto the preceding short sentence.
    for i in range(0, len(raw_parts) - 1, 2):
        sentences.append(raw_parts[i] + raw_parts[i + 1])
    if len(raw_parts) % 2 == 1:
        sentences.append(raw_parts[-1])

    ts_idx = 0
    line_count = 1

    srt_list = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence or ts_idx >= total_ts:
            continue

        # 2. Hard-split overlong sentences into max_chars-sized chunks.
        sub_sentences = [sentence[i:i + max_chars] for i in range(0, len(sentence), max_chars)]

        for s in sub_sentences:
            # Count how many characters in this chunk map to timestamps.
            # Paraformer timestamps usually exclude punctuation, so strip it
            # before counting.
            pure_words = re.sub(r'[^\w\u4e00-\u9fa5]', '', s)  # keep word/CJK chars only
            num_words = len(pure_words)

            if num_words == 0:
                continue

            # --- Guard: stop once the timestamp index runs out ---
            if ts_idx >= total_ts:
                break

            # Start time of this chunk (ms).
            start_t = timestamp_list[ts_idx][0]

            # Clamp the end index to the last available timestamp.
            end_pos = ts_idx + num_words - 1
            if end_pos >= total_ts:
                end_pos = total_ts - 1

            end_t = timestamp_list[end_pos][1]

            # Emit the SRT entry.
            srt_list.append({
                "line": line_count,
                "time": f"{format_time_srt(start_t)} --> {format_time_srt(end_t)}",
                "text": s
            })

            # Advance past the timestamps consumed by this chunk.
            ts_idx += num_words
            line_count += 1
    return srt_list
+
+
def format_time_srt(ms):
    """Convert milliseconds to the SRT timestamp format ``HH:MM:SS,mmm``.

    Args:
        ms: time in milliseconds; ints and floats are accepted (fractional
            milliseconds are truncated).

    Returns:
        Zero-padded ``HH:MM:SS,mmm`` string.
    """
    # Normalize first so float input does not format as "1.0:..." etc.
    ms = int(ms)
    s, ms = divmod(ms, 1000)
    m, s = divmod(s, 60)
    h, m = divmod(m, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"
+
+
def save_srt_file(srt_list, output_path):
    """Write SRT entries to a file.

    Args:
        srt_list: list of dicts with ``line``, ``time`` and ``text`` keys.
        output_path: destination path (e.g. 'output.srt'), UTF-8 encoded.
    """
    with open(output_path, 'w', encoding='utf-8') as fh:
        # Each cue is: index line, timing line, text line, blank separator.
        fh.writelines(
            f"{entry['line']}\n{entry['time']}\n{entry['text']}\n\n"
            for entry in srt_list
        )
+
+
def parse_srt_to_list(file_path):
    """Parse an SRT file back into ``[{line, time, text}, ...]`` records.

    Args:
        file_path: path to a UTF-8 SRT file.

    Returns:
        list of dicts with int ``line``, the timing string in ``time`` and
        the (possibly multi-line) cue text in ``text``.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        content = fh.read().strip()

    # Groups: cue index, "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line, and
    # the multi-line cue text; the lookahead stops at the next cue index
    # or at end of input.
    cue_re = re.compile(
        r'(\d+)\n'
        r'(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\n'
        r'([\s\S]*?)(?=\n\d+\n|\Z)'
    )

    return [
        {"line": int(idx), "time": timing, "text": body.strip()}
        for idx, timing, body in cue_re.findall(content)
    ]
+
+
def analyze_audio_energy(audio_path, segment_ms=100):
    """Log per-window RMS energy of an audio file to help pick a silence threshold.

    Args:
        audio_path: audio file path (decoded by librosa at 16 kHz mono).
        segment_ms: analysis window size in milliseconds.

    Returns:
        list of RMS values, one per window (empty for zero-length audio).
    """
    # 1. Load the audio at a fixed sample rate.
    sr = 16000
    y, _ = librosa.load(audio_path, sr=sr)

    # 2. Compute RMS energy per window.
    hop_length = int(sr * segment_ms / 1000)
    energy_list = []

    logger.info(f"{'时间 (秒)':<10} | {'能量值 (RMS)':<15} | {'状态估计'}")
    logger.info("-" * 45)

    for i in range(0, len(y), hop_length):
        segment = y[i: i + hop_length]
        if len(segment) == 0:
            break

        rms = np.sqrt(np.mean(segment ** 2))
        energy_list.append(rms)

        # Log progress and the per-window value.
        time_sec = i / sr
        status = "🤫 静音" if rms < 0.005 else "🗣️ 有声"
        logger.info(f"{time_sec:>8.2f}s | {rms:>15.6f} | {status}")

    # 3. Summary statistics and a suggested threshold.
    logger.info("-" * 45)
    if not energy_list:
        # Fix: max()/min() on an empty list would raise for empty audio.
        logger.info("音频为空,无能量数据")
        return energy_list
    logger.info(f"最大能量: {max(energy_list):.6f}")
    logger.info(f"最小能量: {min(energy_list):.6f}")
    logger.info(f"建议阈值: {np.percentile(energy_list, 20):.6f} (取前20%分位数作为参考)")
    # Return the raw values so callers can compute their own threshold.
    return energy_list

+ 100 - 0
service/pygpu.py

@@ -0,0 +1,100 @@
+import logging
+from datetime import datetime
+import gc
+import pynvml
+import torch
+from setting import ollama_client
+
+logger = logging.getLogger(__name__)
+
def get_gpu_memory_info(device_index=0):
    """Query total/used/free VRAM for one GPU via NVML.

    Args:
        device_index: NVML device index (default 0).

    Returns:
        dict with the device name and formatted memory figures in MB.
    """
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)

        result = {
            "model": pynvml.nvmlDeviceGetName(handle),
            "mem_total": f"{info.total / 1024 ** 2:.2f} MB",
            "mem_used": f"{info.used / 1024 ** 2:.2f} MB",
            "mem_free": f"{info.free / 1024 ** 2:.2f} MB",
        }
    finally:
        # Fix: always shut NVML down even when a query raises (e.g. a bad
        # device index), so repeated calls do not leak the NVML session.
        pynvml.nvmlShutdown()
    return result
+
+
def get_torch_memory_usage():
    """Report PyTorch CUDA memory usage for the current device.

    Returns:
        dict with ``reserved`` (total GPU memory held by PyTorch's caching
        allocator) and ``allocated`` (the part currently bound to tensors),
        or a placeholder message when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return {
                "message": "CUDA N/A",
            }

    dev = torch.cuda.current_device()
    # reserved: memory PyTorch actually holds from the driver;
    # allocated: the subset of reserved currently backing tensors.
    reserved_mb = torch.cuda.memory_reserved(dev) / 1024 ** 2
    allocated_mb = torch.cuda.memory_allocated(dev) / 1024 ** 2

    return {
        "reserved": f"{reserved_mb:.2f} MB",
        "allocated": f"{allocated_mb:.2f} MB"
    }
+
+
# --- Deep GPU memory cleanup helper ---
def clear_gpu_memory():
    """Deep-clean GPU memory: flush PyTorch caches, then force Python GC."""
    if torch.cuda.is_available():
        # Return cached allocator blocks to the driver and reclaim
        # inter-process shared segments.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    # Always run a Python-level collection, GPU or not.
    gc.collect()
    logger.info(f"[{datetime.now().strftime('%H:%M:%S')}] 🧹 GPU 显存深度清理完成")
+
+
def get_ollama_resource():
    """List models currently loaded by Ollama and their VRAM residency.

    Returns:
        list of dicts with model name, total size, VRAM-resident size, GPU
        share percentage and a human-readable status; empty when no model
        is running or the Ollama service is unreachable.
    """
    result = []
    try:
        # Ask Ollama which models are currently loaded.
        response = ollama_client.ps()
        if not response['models']:
            return result

        for model in response['models']:
            name = model['name']
            size_vram = model.get('size_vram', 0)
            size = model.get('size', 0)

            # Share of the model weights resident in VRAM.
            if size > 0:
                gpu_percentage = (size_vram / size) * 100
            else:
                gpu_percentage = 0

            if gpu_percentage >= 100:
                stat = "🚀 完全运行在 GPU 上"
            elif gpu_percentage > 0:
                stat = f"🌓 混合模式 (GPU 占比 {gpu_percentage:.2f}%)"
            else:
                stat = "🐌 完全运行在 CPU 上"

            result.append({
                'model_name': name,
                'size': f"{size / 1024 ** 3:.2f} GB",
                'size_vram': f"{size_vram / 1024 ** 3:.2f} GB",
                'gpu_percentage': f"{gpu_percentage:.2f}",
                'stat': stat
            })
    except Exception as e:
        # Fix: was print(); use the module logger like the rest of the file.
        logger.error(f"无法连接到 Ollama 服务: {e}")
    return result

+ 11 - 0
setting.py

@@ -0,0 +1,11 @@
import os
from ollama import Client

# Read the Ollama endpoint from the environment; the default targets a
# local daemon, while docker-compose overrides OLLAMA_HOST with the
# service name.
OLLAMA_URL = os.getenv("OLLAMA_HOST", "http://127.0.0.1:11434")
ollama_client = Client(host=OLLAMA_URL)

# Upload/output directories, created eagerly so route handlers can assume
# they exist.
UPLOAD_DIR = "ai_uploads"
OUTPUT_DIR = "ai_outputs"
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)