diff --git a/projects/singularity_cinema/ROADMAP.md b/projects/singularity_cinema/ROADMAP.md
index b983a8f27..32fc1bde8 100644
--- a/projects/singularity_cinema/ROADMAP.md
+++ b/projects/singularity_cinema/ROADMAP.md
@@ -15,9 +15,9 @@
 * [x] Analyze multimodal data P0
 * [x] Use multimodal data directly (charts, images, etc.) P0
 * [ ] Support additional meme images P1
-- [ ] Support more TTS voices by default P0
+- [x] Support more TTS voices by default P0
 - [ ] Support more LLM models, e.g. the Qwen and DeepSeek series P0
-- [ ] Support switching between multiple subtitles within one segment to prevent overlong text P0
+- [x] Support switching between multiple subtitles within one segment to prevent overlong text P0
 - [x] Support text-to-video generation P1
 - [ ] Support more complex foreground designs and background effects P0
 - [ ] Up/down/left/right camera movement P0
diff --git a/projects/singularity_cinema/agent.yaml b/projects/singularity_cinema/agent.yaml
index ae1a57de9..918539125 100644
--- a/projects/singularity_cinema/agent.yaml
+++ b/projects/singularity_cinema/agent.yaml
@@ -130,6 +130,8 @@ foreground:
   voice: male
 voices:
+  # Only a subset of Chinese and English voices is provided. For more, run "edge-tts --list-voices" to see the full catalog.
+  # Chinese voices - Mandarin
   male:
     voice: zh-CN-YunjianNeural
     rate: '+0%'
     pitch: '+0Hz'
@@ -150,8 +152,115 @@ voices:
     rate: '-5%'
     pitch: '-50Hz'
 
+  xiaoyi:
+    voice: zh-CN-XiaoyiNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  yunjie:
+    voice: zh-CN-YunjieNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  yunxi:
+    voice: zh-CN-YunxiNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  yunxia:
+    voice: zh-CN-YunxiaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  # Chinese voices - Regional and dialect
+  hsiaochen:
+    voice: zh-TW-HsiaoChenNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  yunjhe:
+    voice: zh-TW-YunJheNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  xiaoni:
+    voice: zh-CN-shaanxi-XiaoniNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  xiaobei:
+    voice: zh-CN-liaoning-XiaobeiNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  wanlung:
+    voice: zh-HK-WanLungNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  hiugaai:
+    voice: zh-HK-HiuGaaiNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  # English voices
+  aria:
+    voice: en-US-AriaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  sonia:
+    voice: en-GB-SoniaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  guy:
+    voice: en-US-GuyNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  jenny:
+    voice: en-US-JennyNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  natasha:
+    voice: en-AU-NatashaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  william:
+    voice: en-AU-WilliamMultilingualNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  clara:
+    voice: en-CA-ClaraNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  liam:
+    voice: en-CA-LiamNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  ava:
+    voice: en-US-AvaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
+  andrew:
+    voice: en-US-AndrewNeural
+    rate: '+0%'
+    pitch: '+0Hz'
+
   emma:
-    voice: en-US-EmmaMultilingualNeural
+    voice: en-US-EmmaNeural
     rate: '+0%'
     pitch: '+0Hz'
+
+  ana:
+    voice: en-US-AnaNeural
+    rate: '+0%'
+    pitch: '+0Hz'
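Reviewer note: the `voices:` section above registers only a curated subset of the edge-tts catalog. The same list that `edge-tts --list-voices` prints can also be queried from Python; a minimal sketch, assuming the `edge_tts` package that backs the CLI is installed:

```python
# Sketch: discover additional voices for the `voices:` section above.
# Assumes `pip install edge-tts`; list_voices() is the async call behind
# the `edge-tts --list-voices` CLI mentioned in the config comment.
import asyncio

import edge_tts


async def main() -> None:
    for v in await edge_tts.list_voices():
        # Print Chinese and English voices in the same ShortName form
        # used by the config (e.g. zh-CN-XiaoyiNeural).
        if v['ShortName'].startswith(('zh-', 'en-')):
            print(v['ShortName'], v['Gender'])


if __name__ == '__main__':
    asyncio.run(main())
```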
diff --git a/projects/singularity_cinema/step12_generate_subtitle/agent.py b/projects/singularity_cinema/step12_generate_subtitle/agent.py
index 6be3edd31..0c0992f2c 100644
--- a/projects/singularity_cinema/step12_generate_subtitle/agent.py
+++ b/projects/singularity_cinema/step12_generate_subtitle/agent.py
@@ -13,6 +13,71 @@
 logger = get_logger()
 
+PUNCTUATION_OVERFLOW_ALLOWANCE = 2
+PUNCT_CHARS = ',。!?;:,.!?;:、()[]{}"\'——“”《》<>—'
+
+
+def _is_punct(tok: str) -> bool:
+    return len(tok) == 1 and tok in PUNCT_CHARS
+
+
+def _tokenize_text(text: str) -> List[str]:
+    if not text:
+        return []
+    # Split on whitespace or punctuation, keeping each single-character
+    # punctuation mark as its own token
+    tokens = re.split(r'(\s+|[' + re.escape(PUNCT_CHARS) + r'])', text)
+    tokens = [t for t in tokens if t and not t.isspace()]
+    return tokens
+
+
+def _chunk_tokens(tokens: List[str], max_len: int) -> List[str]:
+    chunks = []
+    cur = ''
+    for t in tokens:
+        if len(t) > max_len:
+            # A single token longer than max_len is split into fixed-size pieces
+            for i in range(0, len(t), max_len):
+                sub = t[i:i + max_len]
+                if cur:
+                    chunks.append(cur.strip())
+                    cur = ''
+                chunks.append(sub)
+            continue
+
+        candidate = (cur + t).strip() if cur else t
+        if len(candidate) <= max_len:
+            cur = candidate + ' '
+            continue
+
+        # Merge punctuation into the previous chunk, allowing a slight
+        # overflow past max_len
+        if _is_punct(t) and cur and len(cur) + len(
+                t) <= max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
+            cur = (cur + t).strip() + ' '
+            continue
+
+        if cur:
+            chunks.append(cur.strip())
+        cur = t + ' '
+
+    if cur.strip():
+        chunks.append(cur.strip())
+    return chunks
+
+
+def _clean_chunks(chunks: List[str], max_len: int) -> List[str]:
+    cleaned = []
+    for c in chunks:
+        c = c.strip()
+        # Move punctuation that opens a chunk onto the end of the previous one
+        while c and _is_punct(c[0]) and cleaned:
+            if len(cleaned[-1]) + 1 > max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
+                break
+            cleaned[-1] += c[0]
+            c = c[1:].lstrip()
+
+        if c:
+            cleaned.append(c)
+    return cleaned
+
 
 class GenerateSubtitle(CodeAgent):
 
@@ -38,22 +103,34 @@ async def execute_code(self, messages, **kwargs):
         logger.info('Generating subtitles.')
         for i, seg in enumerate(segments):
             text = seg.get('content', '')
-            subtitle = None
-            if self.subtitle_translate:
-                subtitle = await self.translate_text(text,
-                                                     self.subtitle_translate)
-            output_file = os.path.join(self.subtitle_dir,
-                                       f'bilingual_subtitle_{i + 1}.png')
-            if os.path.exists(output_file):
-                continue
-            self.create_bilingual_subtitle_image(
-                source=text,
-                target=subtitle,
-                output_file=output_file,
-                width=1720,
-                height=180)
+            text_chunks = self.split_text_to_chunks(text)
+            for j, chunk_text in enumerate(text_chunks):
+                subtitle = None
+                if self.subtitle_translate:
+                    subtitle = await self.translate_text(
+                        chunk_text, self.subtitle_translate)
+
+                output_file = os.path.join(
+                    self.subtitle_dir, f'bilingual_subtitle_{i + 1}_{j}.png')
+                if os.path.exists(output_file):
+                    continue
+
+                self.create_bilingual_subtitle_image(
+                    source=chunk_text,
+                    target=subtitle,
+                    output_file=output_file,
+                    width=1720,
+                    height=180)
         return messages
 
+    def split_text_to_chunks(self, text, max_len: int = 30):
+        """
+        Split text into chunks of at most max_len characters, preferring
+        splits at punctuation.
+        """
+        tokens = _tokenize_text(text)
+        chunks = _chunk_tokens(tokens, max_len)
+        return _clean_chunks(chunks, max_len)
+
     async def translate_text(self, text, to_lang):
         prompt = f"""You are a professional translation expert specializing in accurately and fluently translating text into {to_lang}.
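Reviewer note on the splitting strategy: tokens are accumulated greedily up to `max_len`, and punctuation may overflow a chunk slightly (by `PUNCTUATION_OVERFLOW_ALLOWANCE`) so that a chunk never opens with a stray mark. A simplified, self-contained sketch of that idea (it omits the oversize-token splitting and the `_clean_chunks` pass of the real helpers; the sample text and `max_len` are illustrative):

```python
# Simplified sketch of the greedy, punctuation-aware chunking above.
import re

PUNCT = ',。!?;:,.!?;:、'
ALLOWANCE = 2  # mirrors PUNCTUATION_OVERFLOW_ALLOWANCE


def split_chunks(text: str, max_len: int = 30) -> list:
    # Keep punctuation as separate tokens so chunks can break after it.
    parts = re.split('([' + re.escape(PUNCT) + r']|\s+)', text)
    tokens = [t for t in parts if t and not t.isspace()]
    chunks, cur = [], ''
    for tok in tokens:
        fits = len(cur) + len(tok) <= max_len
        # Punctuation may overflow slightly so it never opens a chunk.
        punct_ok = tok in PUNCT and len(cur) + len(tok) <= max_len + ALLOWANCE
        if cur and not fits and not punct_ok:
            chunks.append(cur)
            cur = tok
        else:
            cur += tok
    if cur:
        chunks.append(cur)
    return chunks


print(split_chunks('这是一个很长的句子,需要切分成多个字幕块,以防止文字超长,影响观感。', 12))
# -> ['这是一个很长的句子,', '需要切分成多个字幕块,', '以防止文字超长,影响观感。']
```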
+ """ + name = os.path.basename(path) + m = _SUBTITLE_INDEX_RE.search(name) + if not m: + return float('inf') + try: + return int(m.group(1)) + except (ValueError, TypeError): + return float('inf') + class ComposeVideo(CodeAgent): @@ -285,27 +306,41 @@ def illustration_pos(t): fg_clip = fg_clip.with_position(('center', 'center')) fg_clip = fg_clip.with_duration(duration) current_video_clips.append(fg_clip) - if self.config.use_subtitle: - if i < len(subtitle_paths - ) and subtitle_paths[i] and os.path.exists( - subtitle_paths[i]): - subtitle_img = Image.open(subtitle_paths[i]) - subtitle_w, subtitle_h = subtitle_img.size - - # Validate subtitle dimensions - if subtitle_w <= 0 or subtitle_h <= 0: - logger.error( - f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {subtitle_paths[i]}' - ) - else: - subtitle_clip = mp.ImageClip( - subtitle_paths[i], duration=duration) - subtitle_clip = subtitle_clip.resized( - (subtitle_w, subtitle_h)) - subtitle_y = 900 - subtitle_clip = subtitle_clip.with_position( - ('center', subtitle_y)) - current_video_clips.append(subtitle_clip) + if self.config.use_subtitle and i < len( + subtitle_paths) and subtitle_paths[i]: + current_segment_subs = subtitle_paths[i] + num_subs = len(current_segment_subs) + if num_subs > 0: + sub_duration = duration / num_subs + for j, sub_path in enumerate(current_segment_subs): + if sub_path and os.path.exists(sub_path): + subtitle_img = Image.open(sub_path) + subtitle_w, subtitle_h = subtitle_img.size + + if subtitle_w <= 0 or subtitle_h <= 0: + logger.error( + f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {sub_path}' + ) + else: + target_h = 180 # TODO: Define as a class constant + if subtitle_h > target_h: + scale = target_h / subtitle_h + new_w = max(1, int(subtitle_w * scale)) + new_h = target_h + else: + new_w, new_h = subtitle_w, subtitle_h + + subtitle_clip = mp.ImageClip( + sub_path, duration=sub_duration) + subtitle_clip = subtitle_clip.resized( + (new_w, new_h)) + + subtitle_y = 900 + subtitle_clip = subtitle_clip.with_position( + ('center', subtitle_y)) + subtitle_clip = subtitle_clip.with_start( + j * sub_duration) + current_video_clips.append(subtitle_clip) # Add background as top layer (transparent PNG with decorative elements) if background_path and os.path.exists(background_path): @@ -442,9 +477,11 @@ async def execute_code(self, messages, **kwargs): f'Scene{i+1}.mov')) audio_paths.append( os.path.join(self.tts_dir, f'segment_{i + 1}.mp3')) - subtitle_paths.append( - os.path.join(self.subtitle_dir, - f'bilingual_subtitle_{i + 1}.png')) + pattern = os.path.join(self.subtitle_dir, + f'bilingual_subtitle_{i + 1}_*.png') + sub_files = sorted( + glob.glob(pattern), key=_get_subtitle_index_from_filename) + subtitle_paths.append(sub_files) video_paths.append( os.path.join(self.videos_dir, f'video_{i + 1}.mp4'))