4 changes: 2 additions & 2 deletions projects/singularity_cinema/ROADMAP.md
@@ -15,9 +15,9 @@
* [x] Analyze multimodal data P0
* [x] Use multimodal data directly (charts, images, etc.) P0
* [ ] Support additional meme images P1
- [ ] Support more TTS voices by default P0
- [x] Support more TTS voices by default P0
- [ ] Support more LLM models, e.g. the Qwen and DeepSeek series P0
- [ ] Support switching among multiple subtitles within one segment to keep text from running too long P0
- [x] Support switching among multiple subtitles within one segment to keep text from running too long P0
- [x] Support text-to-video generation P1
- [ ] Support more complex foreground designs and background effects P0
- [ ] Up/down/left/right camera movement P0
111 changes: 110 additions & 1 deletion projects/singularity_cinema/agent.yaml
@@ -130,6 +130,8 @@ foreground:
voice: male

voices:
# Only a subset of the available Chinese and English voices is listed here. For more voices, run the "edge-tts --list-voices" command to see the full list.
# Chinese voices - Mandarin
male:
voice: zh-CN-YunjianNeural
rate: '+0%'
@@ -150,8 +152,115 @@ voices:
rate: '-5%'
pitch: '-50Hz'

xiaoyi:
voice: zh-CN-XiaoyiNeural
rate: '+0%'
pitch: '+0Hz'

yunjie:
voice: zh-CN-YunjieNeural
rate: '+0%'
pitch: '+0Hz'

yunxi:
voice: zh-CN-YunxiNeural
rate: '+0%'
pitch: '+0Hz'

yunxia:
voice: zh-CN-YunxiaNeural
rate: '+0%'
pitch: '+0Hz'

# Chinese voices - Dialect
hsiaochen:
voice: zh-TW-HsiaoChenNeural
rate: '+0%'
pitch: '+0Hz'

yunjhe:
voice: zh-TW-YunJheNeural
rate: '+0%'
pitch: '+0Hz'

xiaoni:
voice: zh-CN-shaanxi-XiaoniNeural
rate: '+0%'
pitch: '+0Hz'

xiaobei:
voice: zh-CN-liaoning-XiaobeiNeural
rate: '+0%'
pitch: '+0Hz'

wanlung:
voice: zh-HK-WanLungNeural
rate: '+0%'
pitch: '+0Hz'

hiugaai:
voice: zh-HK-HiuGaaiNeural
rate: '+0%'
pitch: '+0Hz'

# English voices
aria:
voice: en-US-AriaNeural
rate: '+0%'
pitch: '+0Hz'

sonia:
voice: en-GB-SoniaNeural
rate: '+0%'
pitch: '+0Hz'

guy:
voice: en-US-GuyNeural
rate: '+0%'
pitch: '+0Hz'

jenny:
voice: en-US-JennyNeural
rate: '+0%'
pitch: '+0Hz'

natasha:
voice: en-AU-NatashaNeural
rate: '+0%'
pitch: '+0Hz'

william:
voice: en-AU-WilliamMultilingualNeural
rate: '+0%'
pitch: '+0Hz'

clara:
voice: en-CA-ClaraNeural
rate: '+0%'
pitch: '+0Hz'

liam:
voice: en-CA-LiamNeural
rate: '+0%'
pitch: '+0Hz'

ava:
voice: en-US-AvaNeural
rate: '+0%'
pitch: '+0Hz'

andrew:
voice: en-US-AndrewNeural
rate: '+0%'
pitch: '+0Hz'

emma:
voice: en-US-EmmaMultilingualNeural
voice: en-US-EmmaNeural
rate: '+0%'
pitch: '+0Hz'

ana:
voice: en-US-AnaNeural
rate: '+0%'
pitch: '+0Hz'
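
# Hypothetical example (not part of this diff): any short name printed by
# the "edge-tts --list-voices" command can be wired up as another entry in
# the same shape, e.g.:
# brian:
#   voice: en-US-BrianMultilingualNeural
#   rate: '+0%'
#   pitch: '+0Hz'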

105 changes: 91 additions & 14 deletions projects/singularity_cinema/step12_generate_subtitle/agent.py
@@ -13,6 +13,71 @@

logger = get_logger()

PUNCTUATION_OVERFLOW_ALLOWANCE = 2
# Not a raw string: with r'...' the backslash in \' would itself end up in the set.
PUNCT_CHARS = ',。!?;:,.!?;:、()[]{}"\'——“”《》<>—'


def _is_punct(tok: str) -> bool:
return len(tok) == 1 and tok in PUNCT_CHARS


def _tokenize_text(text: str) -> List[str]:
if not text:
return []
# Split by whitespace or punctuation, keeping single-char punctuation as tokens
tokens = re.split(r'(\s+|[' + re.escape(PUNCT_CHARS) + r'])', text)
tokens = [t for t in tokens if t and not t.isspace()]
return tokens
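
# Behavior sketch (illustrative, not from the diff): the capture group keeps
# punctuation as standalone tokens while whitespace is dropped, e.g.
#   _tokenize_text('你好,hello world')  ->  ['你好', ',', 'hello', 'world']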


def _chunk_tokens(tokens: List[str], max_len: int) -> List[str]:
chunks = []
cur = ''
for t in tokens:
if len(t) > max_len:
# If a single token exceeds max_len, split it
for i in range(0, len(t), max_len):
sub = t[i:i + max_len]
if cur:
chunks.append(cur.strip())
cur = ''
chunks.append(sub)
continue

candidate = (cur + t).strip() if cur else t
if len(candidate) <= max_len:
cur = candidate + ' '
continue

# If t is punctuation and can be merged with previous chunk (allowing slight overflow)
if _is_punct(t) and cur and len(cur) + len(
t) <= max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
cur = (cur + t).strip() + ' '
continue

if cur:
chunks.append(cur.strip())
cur = t + ' '

if cur.strip():
chunks.append(cur.strip())
return chunks


def _clean_chunks(chunks: List[str], max_len: int) -> List[str]:
cleaned = []
for c in chunks:
c = c.strip()
while c and _is_punct(c[0]) and cleaned:
if len(cleaned[-1]) + 1 > max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
break
cleaned[-1] += c[0]
c = c[1:].lstrip()

if c:
cleaned.append(c)
return cleaned
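
# Illustrative only: _clean_chunks folds a chunk's leading punctuation back
# onto the previous chunk when the overflow allowance permits, e.g.
#   _clean_chunks(['你好', ',世界'], max_len=10)  ->  ['你好,', '世界']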


class GenerateSubtitle(CodeAgent):

@@ -38,22 +103,34 @@ async def execute_code(self, messages, **kwargs):
logger.info('Generating subtitles.')
for i, seg in enumerate(segments):
text = seg.get('content', '')
subtitle = None
if self.subtitle_translate:
subtitle = await self.translate_text(text,
self.subtitle_translate)
output_file = os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}.png')
if os.path.exists(output_file):
continue
self.create_bilingual_subtitle_image(
source=text,
target=subtitle,
output_file=output_file,
width=1720,
height=180)
text_chunks = self.split_text_to_chunks(text)
for j, chunk_text in enumerate(text_chunks):
subtitle = None
if self.subtitle_translate:
subtitle = await self.translate_text(
chunk_text, self.subtitle_translate)

output_file = os.path.join(
self.subtitle_dir, f'bilingual_subtitle_{i + 1}_{j}.png')
if os.path.exists(output_file):
continue

self.create_bilingual_subtitle_image(
source=chunk_text,
target=subtitle,
output_file=output_file,
width=1720,
height=180)
return messages

def split_text_to_chunks(self, text, max_len: int = 30):
"""
Split text into chunks of max_len, prioritizing splits at punctuation.
"""
tokens = _tokenize_text(text)
chunks = _chunk_tokens(tokens, max_len)
return _clean_chunks(chunks, max_len)
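
# Illustrative usage (not part of the diff):
#   chunks = self.split_text_to_chunks(seg_text, max_len=30)
# Each chunk stays within max_len characters (punctuation may overflow by
# PUNCTUATION_OVERFLOW_ALLOWANCE), and an unbroken run longer than max_len
# is hard-split every max_len characters.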

async def translate_text(self, text, to_lang):

prompt = f"""You are a professional translation expert specializing in accurately and fluently translating text into {to_lang}.
85 changes: 61 additions & 24 deletions projects/singularity_cinema/step14_compose_video/agent.py
@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import os
import re
from typing import Union

import json
import moviepy as mp
Expand All @@ -11,6 +14,24 @@

logger = get_logger()

_SUBTITLE_INDEX_RE = re.compile(
r'(\d+)(?=[^\d]*$)') # Match the last sequence of digits in the filename


def _get_subtitle_index_from_filename(path: str) -> Union[int, float]:
"""
Extract the trailing numeric index from a file path or filename.
Returns an int index; if parsing fails, returns float('inf') to sort invalid files to the end.
"""
name = os.path.basename(path)
m = _SUBTITLE_INDEX_RE.search(name)
if not m:
return float('inf')
try:
return int(m.group(1))
except (ValueError, TypeError):
return float('inf')
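
# For instance (illustrative):
#   _get_subtitle_index_from_filename('bilingual_subtitle_3_2.png')  ->  2
#   _get_subtitle_index_from_filename('no_digits.png')               ->  inf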


class ComposeVideo(CodeAgent):

@@ -285,27 +306,41 @@ def illustration_pos(t):
fg_clip = fg_clip.with_position(('center', 'center'))
fg_clip = fg_clip.with_duration(duration)
current_video_clips.append(fg_clip)
if self.config.use_subtitle:
if i < len(subtitle_paths
) and subtitle_paths[i] and os.path.exists(
subtitle_paths[i]):
subtitle_img = Image.open(subtitle_paths[i])
subtitle_w, subtitle_h = subtitle_img.size

# Validate subtitle dimensions
if subtitle_w <= 0 or subtitle_h <= 0:
logger.error(
f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {subtitle_paths[i]}'
)
else:
subtitle_clip = mp.ImageClip(
subtitle_paths[i], duration=duration)
subtitle_clip = subtitle_clip.resized(
(subtitle_w, subtitle_h))
subtitle_y = 900
subtitle_clip = subtitle_clip.with_position(
('center', subtitle_y))
current_video_clips.append(subtitle_clip)
if self.config.use_subtitle and i < len(
subtitle_paths) and subtitle_paths[i]:
current_segment_subs = subtitle_paths[i]
num_subs = len(current_segment_subs)
if num_subs > 0:
sub_duration = duration / num_subs
for j, sub_path in enumerate(current_segment_subs):
if sub_path and os.path.exists(sub_path):
subtitle_img = Image.open(sub_path)
subtitle_w, subtitle_h = subtitle_img.size

if subtitle_w <= 0 or subtitle_h <= 0:
logger.error(
f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {sub_path}'
)
else:
target_h = 180 # TODO: Define as a class constant
if subtitle_h > target_h:
scale = target_h / subtitle_h
new_w = max(1, int(subtitle_w * scale))
new_h = target_h
else:
new_w, new_h = subtitle_w, subtitle_h

subtitle_clip = mp.ImageClip(
sub_path, duration=sub_duration)
subtitle_clip = subtitle_clip.resized(
(new_w, new_h))

subtitle_y = 900
subtitle_clip = subtitle_clip.with_position(
('center', subtitle_y))
subtitle_clip = subtitle_clip.with_start(
j * sub_duration)
current_video_clips.append(subtitle_clip)
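
# Net effect (sketch): the N subtitle chunks of a segment each get
# duration / N seconds and are started back-to-back via
# with_start(j * sub_duration), so they play out sequentially over
# the segment's audio.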

# Add background as top layer (transparent PNG with decorative elements)
if background_path and os.path.exists(background_path):
@@ -442,9 +477,11 @@ async def execute_code(self, messages, **kwargs):
f'Scene{i+1}.mov'))
audio_paths.append(
os.path.join(self.tts_dir, f'segment_{i + 1}.mp3'))
subtitle_paths.append(
os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}.png'))
pattern = os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}_*.png')
sub_files = sorted(
glob.glob(pattern), key=_get_subtitle_index_from_filename)
subtitle_paths.append(sub_files)
video_paths.append(
os.path.join(self.videos_dir, f'video_{i + 1}.mp4'))
