video-maker/make.py at master · guanlinyi/video-maker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
"""
图片 → 竖屏字幕短视频（带 AI 配音）

工作流程:
  1. 从 input/images/ 加载图片（按文件名排序）
  2. 从 input/subtitles.txt 加载字幕（每行对应一张图）
  3. 用 edge-tts 为每行字幕生成 AI 配音（微软晓晓女声）
  4. 各字幕 + 配音按顺序对齐到每张图
  5. 叠加半透明黑底字幕条 + 交叉淡入淡出转场
  6. 混入背景音乐 → 输出竖屏 MP4

用法:
  python make.py                          # 默认运行
  python make.py --preview                  # 预览不渲染
  python make.py --tts-voice xiaoyi         # 换配音声线
  python make.py --no-tts                   # 不要配音
  python make.py --no-music                 # 不要背景音乐
  python make.py --duration 20              # 调总时长
  python make.py --images my_pics/ --subs my_subtitles.txt
"""

import argparse
import asyncio
import shutil
import sys
import tempfile
from pathlib import Path

try:
    from local_tts import LocalTTS
    _HAS_LOCAL_TTS = True
except ImportError:
    _HAS_LOCAL_TTS = False

import edge_tts
from moviepy import (
    ImageClip,
    TextClip,
    ColorClip,
    CompositeVideoClip,
    concatenate_videoclips,
    AudioFileClip,
    CompositeAudioClip,
    vfx,
)

# ── Constants ────────────────────────────────────────
# Default input locations, resolved relative to the directory containing this script.
PROJECT_DIR = Path(__file__).parent
IMAGE_DIR = PROJECT_DIR / "input" / "images"
SUBTITLES_FILE = PROJECT_DIR / "input" / "subtitles.txt"
MUSIC_DIR = PROJECT_DIR / "input" / "music"

# Output frame size in pixels (portrait) and the frame rate passed to the encoder.
WIDTH, HEIGHT = 1160, 2112
FPS = 30
# Font file handed to TextClip; this is a Windows-specific path.
FONT_PATH = "C:/Windows/Fonts/msyh.ttc"

# Subtitle bar parameters
BAR_HEIGHT = 200           # height of the semi-transparent black bar at the bottom
FONT_SIZE = 48             # subtitle font size
TEXT_MAX_WIDTH = WIDTH - 240  # maximum text width (leaves left/right margins)

# Default TTS voice (Microsoft Xiaoxiao, a natural-sounding Chinese female voice)
DEFAULT_TTS_VOICE = "zh-CN-XiaoxiaoNeural"

# TTS cache directory (kept inside the project so audio is not regenerated on every run)
TTS_CACHE_DIR = PROJECT_DIR / ".tts_cache"


# ── 工具函数 ─────────────────────────────────────────


def load_images(image_dir: Path) -> list[Path]:
    """Return the image files found directly inside *image_dir*, sorted by name.

    Only regular files whose suffix (case-insensitive) is one of
    ``.jpg``, ``.jpeg``, ``.png`` or ``.webp`` are returned. Files are ordered
    by their stem; the full file name breaks ties so that e.g. ``a.jpg`` and
    ``a.png`` always come out in the same order.

    If the directory does not exist or contains no matching file, a hint is
    printed and the process exits with status 1.
    """
    exts = {".jpg", ".jpeg", ".png", ".webp"}

    files: list[Path] = []
    # A missing directory is reported the same way as an empty one instead of
    # surfacing a raw FileNotFoundError from iterdir().
    if image_dir.is_dir():
        files = sorted(
            (
                f
                for f in image_dir.iterdir()
                # Skip sub-directories that merely look like images by name.
                if f.is_file() and f.suffix.lower() in exts
            ),
            key=lambda f: (f.stem, f.name),
        )

    if not files:
        print(f"⚠️  在 {image_dir} 中没找到图片")
        print(f"   请放入 .jpg/.png/.webp 文件")
        sys.exit(1)
    return files


def load_subtitles(sub_path: Path, image_count: int) -> list[str]:
    """Load one subtitle per image from a UTF-8 text file.

    Blank lines are dropped and every remaining line is stripped of
    surrounding whitespace. The result always has exactly *image_count*
    entries: missing lines are padded with empty strings and surplus lines
    are discarded (a warning is printed in both cases).

    If *sub_path* does not exist, a warning is printed and a list of
    *image_count* empty strings is returned.
    """
    if not sub_path.exists():
        print(f"⚠️  字幕文件不存在: {sub_path}")
        print(f"   将使用空字幕")
        return [""] * image_count

    # "utf-8-sig" transparently drops a leading byte-order mark; str.strip()
    # does not treat U+FEFF as whitespace, so with plain "utf-8" the BOM would
    # end up inside the first subtitle (and in its TTS input).
    raw = sub_path.read_text(encoding="utf-8-sig")
    lines = [stripped for line in raw.splitlines() if (stripped := line.strip())]

    if len(lines) < image_count:
        print(f"⚠️  字幕行数 ({len(lines)}) < 图片数 ({image_count})，不足部分留空")
        lines += [""] * (image_count - len(lines))
    elif len(lines) > image_count:
        print(f"⚠️  字幕行数 ({len(lines)}) > 图片数 ({image_count})，多余行忽略")
        lines = lines[:image_count]

    return lines


def resize_to_fill(clip: ImageClip, target_w: int, target_h: int) -> ImageClip:
    """Scale *clip* so it covers ``target_w`` x ``target_h``, then center-crop.

    The scale factor is the larger of the two axis ratios, so the scaled clip
    is at least as large as the target on both axes; the overflow on the other
    axis is cropped equally from both sides.
    """
    scale = max(target_w / clip.w, target_h / clip.h)

    # Round (instead of truncating) and clamp to the target so floating-point
    # error can never leave the scaled clip one pixel smaller than the frame,
    # which would make the crop below produce an undersized image.
    new_w = max(target_w, round(clip.w * scale))
    new_h = max(target_h, round(clip.h * scale))
    # Pass the size explicitly: when both width= and height= are supplied, only
    # one of them is honoured and the other is re-derived from the aspect ratio.
    scaled = clip.resized(new_size=(new_w, new_h))

    x = (scaled.w - target_w) // 2
    y = (scaled.h - target_h) // 2
    return scaled.cropped(x1=x, y1=y, width=target_w, height=target_h)


# ── 字幕 + TTS ────────────────────────────────────────


def make_subtitle_layer(text: str, duration: float) -> CompositeVideoClip | None:
    """Build the subtitle overlay for one image.

    The overlay is a full-frame composite holding a semi-transparent black
    bar anchored to the bottom edge with white, centered caption text on top
    of it. Returns ``None`` when *text* is empty; otherwise the composite
    lasts *duration* seconds.
    """
    if not text:
        return None

    # Y coordinate of the bar's top edge (the bar sits flush with the bottom).
    bar_top = HEIGHT - BAR_HEIGHT

    # Semi-transparent black backdrop.
    backdrop = ColorClip(size=(WIDTH, BAR_HEIGHT), color=(0, 0, 0))
    backdrop = backdrop.with_opacity(0.55).with_duration(duration)
    backdrop = backdrop.with_position((0, bar_top))

    # White caption, wrapped to the text box and inset 10 px from the bar top.
    caption = TextClip(
        text=text,
        font=FONT_PATH,
        font_size=FONT_SIZE,
        color="white",
        stroke_color="black",
        stroke_width=1,
        text_align="center",
        method="caption",
        size=(TEXT_MAX_WIDTH, BAR_HEIGHT - 20),
    )
    caption = caption.with_duration(duration)
    caption = caption.with_position(("center", bar_top + 10))

    overlay = CompositeVideoClip([backdrop, caption], size=(WIDTH, HEIGHT))
    return overlay.with_duration(duration)


async def _generate_tts_async(
    text: str, output_path: Path, voice: str
) -> float:
    """生成单条 TTS 音频，返回时长（秒）"""
    if not text.strip():
        return 0.0

    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(str(output_path))

    # 用 ffprobe 获取时长
    import subprocess
    import json

    result = subprocess.run(
        [
            "ffprobe", "-v", "quiet",
            "-print_format", "json",
            "-show_entries", "format=duration",
            str(output_path),
        ],
        capture_output=True, text=True,
    )
    data = json.loads(result.stdout)
    return float(data["format"]["duration"])


def generate_tts(text: str, voice: str, cache_dir: Path) -> tuple[Path | None, float]:
    """
    生成 TTS 音频（带缓存），返回 (路径, 时长秒)
    - 如果文本为空，返回 (None, 0)
    - 如果已缓存，直接读取
    - 如果生成失败，返回 (None, 0)
    """
    if not text.strip():
        return None, 0.0

    # 缓存文件名 = voice + 文本的 hash
    import hashlib
    text_hash = hashlib.md5((voice + text).encode()).hexdigest()[:12]
    cache_path = cache_dir / f"{text_hash}.mp3"

    if cache_path.exists():
        # 读缓存
        import subprocess, json
        result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json",
             "-show_entries", "format=duration", str(cache_path)],
            capture_output=True, text=True,
        )
        data = json.loads(result.stdout)
        duration = float(data["format"]["duration"])
        return cache_path, duration

    # 生成
    cache_dir.mkdir(parents=True, exist_ok=True)
    try:
        duration = asyncio.run(_generate_tts_async(text, cache_path, voice))
        print(f"   🎤 TTS: 「{text}」 → {duration:.1f}s")
        return cache_path, duration
    except Exception as e:
        print(f"   ⚠️  TTS 生成失败「{text}」: {e}")
        return None, 0.0


# ── 主渲染函数 ──────────────────────────────────────────


def make_video(
    image_paths: list[Path],
    subtitles: list[str],
    output_path: Path,
    total_duration: float = 15.0,
    fade_duration: float = 0.4,
    music_path: Path | None = None,
    tts_voice: str | None = DEFAULT_TTS_VOICE,
    tts_cache_dir: Path = TTS_CACHE_DIR,
    local_tts: 'LocalTTS | None' = None,
    preview: bool = False,
):
    """Render the slideshow video (images + subtitles + narration + music).

    Args:
        image_paths: Images to show, in order. Must not be empty.
        subtitles: One subtitle per image (empty string = no subtitle).
        output_path: Destination video file; parent directories are created.
        total_duration: Requested total length in seconds, split evenly across
            the images. With ``local_tts`` it is extended to the summed
            narration length when that is longer.
        fade_duration: Length in seconds of the fade applied at clip borders.
        music_path: Optional background music file; ignored if it is missing.
        tts_voice: edge-tts voice name, or ``None`` to disable edge-tts.
        tts_cache_dir: Directory that holds generated narration audio.
        local_tts: Optional local engine; when given it takes precedence over
            ``tts_voice``. Only its ``synthesize(text, path)`` method is used,
            and its return value is treated as a duration in seconds.
        preview: When true, only print the plan and return without rendering.

    Raises:
        ValueError: If ``image_paths`` is empty.
    """
    n = len(image_paths)
    if n == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("image_paths must not be empty")
    per_image = total_duration / n

    # ── Preview ──
    if preview:
        print("\n📋 预览 ──────────────────────────────────")
        print(f"   图片数: {n}")
        print(f"   总时长: {total_duration}s")
        print(f"   每张图: {per_image:.1f}s")
        print(f"   淡入淡出: {fade_duration}s")
        print(f"   尺寸: {WIDTH}x{HEIGHT}")
        print(f"   输出: {output_path}")
        if tts_voice:
            print(f"   AI 配音: {tts_voice}")
        else:
            print(f"   AI 配音: 无")
        if music_path and music_path.exists():
            print(f"   背景音乐: {music_path.name}")
        else:
            print(f"   背景音乐: 无")
        print()
        for i, (img, sub) in enumerate(zip(image_paths, subtitles)):
            print(f"   [{i+1}] {img.name}  →  「{sub}」")
        print("──────────────────────────────────────")
        return

    import subprocess

    tts_clips: list[AudioFileClip | None] = [None] * n
    tts_durations = [0.0] * n
    bgm_source: AudioFileClip | None = None  # kept so the file reader can be closed

    try:
        # ═══════════════════════════════════════════════
        # Step 1: generate narration audio
        # ═══════════════════════════════════════════════
        if local_tts:
            print("\n🎤 使用本地 CosyVoice2 配音...")
            # The local engine writes straight into the cache directory, which
            # nothing else creates on this code path.
            tts_cache_dir.mkdir(parents=True, exist_ok=True)
            for i, text in enumerate(subtitles):
                if not text.strip():
                    continue
                wav_path = tts_cache_dir / f"local_tts_{i}.wav"
                dur = local_tts.synthesize(text, wav_path)
                if dur > 0:
                    tts_clips[i] = AudioFileClip(str(wav_path))
                    tts_durations[i] = dur
                    print(f"   🎤 配音[{i+1}]: 「{text}」 → {dur:.1f}s")
            # Stretch the video when the summed narration exceeds the request.
            total_tts = sum(tts_durations)
            if total_tts > total_duration:
                old = total_duration
                total_duration = total_tts
                per_image = total_duration / n
                print(f"   ⏱️  配音总长 {total_tts:.0f}s > 原设 {old:.0f}s，自动延长到 {total_duration:.0f}s")
                print(f"      每张图 {per_image:.1f}s")
        elif tts_voice:
            print("\n🎤 生成 AI 配音...")
            # NOTE(review): unlike the local branch, the total duration is not
            # extended here; narration longer than its image window overflows.
            # Confirm whether that asymmetry is intended.
            for i, text in enumerate(subtitles):
                audio_path, dur = generate_tts(text, tts_voice, tts_cache_dir)
                if audio_path and dur > 0:
                    tts_clips[i] = AudioFileClip(str(audio_path))

        # ═══════════════════════════════════════════════
        # Step 2: build the per-image video clips (image + subtitle)
        # ═══════════════════════════════════════════════
        print("\n🎞️  构建视频层...")
        clips = []
        for img_path, sub_text in zip(image_paths, subtitles):
            img = ImageClip(str(img_path)).with_duration(per_image)
            img = resize_to_fill(img, WIDTH, HEIGHT)

            sub_layer = make_subtitle_layer(sub_text, per_image)
            if sub_layer:
                layer = CompositeVideoClip([img, sub_layer], size=(WIDTH, HEIGHT))
            else:
                layer = img

            clips.append(layer.with_duration(per_image))

        # ═══════════════════════════════════════════════
        # Step 3: transitions
        # ═══════════════════════════════════════════════
        transitions = []
        for i, clip in enumerate(clips):
            # Every clip fades in except the first, and out except the last.
            if i > 0:
                clip = clip.with_effects([vfx.CrossFadeIn(fade_duration)])
            if i < n - 1:
                clip = clip.with_effects([vfx.CrossFadeOut(fade_duration)])
            transitions.append(clip)

        final_video = concatenate_videoclips(transitions, method="compose")

        # ═══════════════════════════════════════════════
        # Step 4: audio mix (narration + background music)
        # ═══════════════════════════════════════════════
        audio_parts = []

        # Narration: each line starts at the beginning of its image's window.
        # A line longer than the window still starts there (overflow allowed).
        for i, tts_clip in enumerate(tts_clips):
            if tts_clip is None:
                continue
            audio_parts.append(tts_clip.with_start(i * per_image))

        # Background music
        if music_path and music_path.exists():
            bgm_source = AudioFileClip(str(music_path))
            bgm = bgm_source
            if bgm.duration > final_video.duration:
                bgm = bgm.subclipped(0, final_video.duration)
            # Keep the music quiet so it does not drown out the narration.
            bgm = bgm.with_volume_scaled(0.35)
            audio_parts.append(bgm)

        if audio_parts:
            final_audio = CompositeAudioClip(audio_parts)
            final_video = final_video.with_audio(final_audio)

        # ═══════════════════════════════════════════════
        # Step 5: export
        # ═══════════════════════════════════════════════
        output_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"\n🎬 渲染中 ({n} 张图, {total_duration:.0f}s)...")
        # Prefer the NVENC hardware encoder when the ffmpeg on PATH lists it.
        try:
            nvenc_test = subprocess.run(
                ["ffmpeg", "-encoders"], capture_output=True, text=True
            )
            encoder_listing = nvenc_test.stdout
        except OSError:
            # No ffmpeg executable on PATH: fall back to software encoding
            # instead of aborting before the render starts.
            encoder_listing = ""
        video_codec = "h264_nvenc" if "h264_nvenc" in encoder_listing else "libx264"
        enc_preset = "p7" if video_codec == "h264_nvenc" else "medium"
        if video_codec == "libx264":
            print(f"   ⚠️  未检测到 NVENC，使用 CPU 编码（较慢）")

        final_video.write_videofile(
            str(output_path),
            fps=FPS,
            codec=video_codec,
            audio_codec="aac",
            threads=4,
            preset=enc_preset,
            logger=None,
        )
        print(f"\n✅ 完成！输出: {output_path}")
        print(f"   尺寸: {WIDTH}x{HEIGHT} | 时长: {final_video.duration:.1f}s | {output_path.stat().st_size / 1024:.0f}KB")
    finally:
        # Cleanup: release audio file readers even when rendering failed.
        for ac in tts_clips:
            if ac is not None:
                ac.close()
        if bgm_source is not None:
            bgm_source.close()


# ── CLI ────────────────────────────────────────────────

# Short names accepted by the --tts-voice option, mapped to the full voice
# identifiers that are passed on to edge-tts.
TTS_VOICES = {
    "xiaoxiao": "zh-CN-XiaoxiaoNeural",
    "xiaoyi": "zh-CN-XiaoyiNeural",
    "xiaohan": "zh-CN-XiaohanNeural",
    "xiaomeng": "zh-CN-XiaomengNeural",
    "xiaomo": "zh-CN-XiaomoNeural",
    "xiaorui": "zh-CN-XiaoruiNeural",
}


def main():
    """Command-line entry point: parse options, load inputs, call make_video.

    Exits with an argparse usage error when ``--duration`` is not positive or
    ``--fade`` is negative, and with status 1 when ``--local-tts`` is requested
    but the ``local_tts`` module could not be imported.
    """
    parser = argparse.ArgumentParser(
        description="图片 → 竖屏字幕短视频（带 AI 配音）",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "示例:\n"
            "  python make.py                              # 默认运行\n"
            "  python make.py --preview                      # 预览\n"
            "  python make.py --tts-voice xiaoyi             # 换女声声线\n"
            "  python make.py --no-tts                       # 不要配音\n"
            "  python make.py --no-music --no-tts            # 纯图片+字幕\n"
            "  python make.py --duration 20 --fade 0.3       # 调时长\n"
            "\n可用声线: " + ", ".join(TTS_VOICES.keys())
        ),
    )
    parser.add_argument("--images", default=str(IMAGE_DIR),
                        help=f"图片目录（默认: {IMAGE_DIR}）")
    parser.add_argument("--subs", default=str(SUBTITLES_FILE),
                        help=f"字幕文件（默认: {SUBTITLES_FILE}）")
    parser.add_argument("--music", default=None,
                        help="背景音乐路径（默认找 input/music/ 下的 mp3）")
    parser.add_argument("--no-music", action="store_true",
                        help="不使用背景音乐")
    parser.add_argument("--local-tts", action="store_true",
                        help="使用本地 CosyVoice2 声音克隆（无需联网）")
    parser.add_argument("--tts-voice", default="xiaoxiao",
                        choices=list(TTS_VOICES.keys()),
                        help="AI 配音声线（默认 xiaoxiao）")
    parser.add_argument("--no-tts", action="store_true",
                        help="不生成 AI 配音")
    parser.add_argument("--output", default=str(PROJECT_DIR / "output" / "output.mp4"),
                        help="输出视频路径")
    parser.add_argument("--duration", type=float, default=15.0,
                        help="视频总时长（秒，默认 15）")
    parser.add_argument("--fade", type=float, default=0.4,
                        help="淡入淡出时长（秒，默认 0.4）")
    parser.add_argument("--preview", action="store_true",
                        help="预览模式（不渲染）")
    args = parser.parse_args()

    # Reject values that would produce zero-length or negative-length clips.
    if args.duration <= 0:
        parser.error("--duration 必须大于 0")
    if args.fade < 0:
        parser.error("--fade 不能为负数")

    # ── Load inputs ──
    images = load_images(Path(args.images))
    subs = load_subtitles(Path(args.subs), len(images))

    # ── Background music ──
    music = None
    if not args.no_music:
        if args.music:
            music = Path(args.music)
        else:
            # Sort so the pick is deterministic (glob order is arbitrary);
            # mp3 files are still preferred over wav files.
            music_files = sorted(MUSIC_DIR.glob("*.mp3")) + sorted(MUSIC_DIR.glob("*.wav"))
            if music_files:
                music = music_files[0]
        if music and not music.exists():
            print(f"⚠️  音乐文件不存在: {music}")
            music = None

    # ── TTS ──
    local_tts_engine = None
    if args.local_tts:
        if not _HAS_LOCAL_TTS:
            print("❌ local_tts 模块未找到，请确认 CosyVoice 已部署")
            sys.exit(1)
        print("\n🔊 本地配音模式（CosyVoice2 声音克隆）")
        # Preview mode returns before any narration is synthesized, so do not
        # construct the engine when it would go unused.
        if not args.preview:
            local_tts_engine = LocalTTS()
        tts_voice = None
    elif args.no_tts:
        tts_voice = None
    else:
        tts_voice = TTS_VOICES[args.tts_voice]

    make_video(
        image_paths=images,
        subtitles=subs,
        output_path=Path(args.output),
        total_duration=args.duration,
        fade_duration=args.fade,
        music_path=music,
        tts_voice=tts_voice,
        local_tts=local_tts_engine,
        preview=args.preview,
    )


if __name__ == "__main__":
    main()