4 changes: 2 additions & 2 deletions projects/singularity_cinema/ROADMAP.md
@@ -15,9 +15,9 @@
* [x] Analyze multimodal data P0
* [x] Use multimodal data directly (charts, images, etc.) P0
* [ ] Support additional meme images P1
- [ ] Support more TTS voices by default P0
- [x] Support more TTS voices by default P0
- [ ] Support more LLM models, e.g. the Qwen and DeepSeek series P0
- [ ] Support switching among multiple subtitles within one segment to keep text from running too long P0
- [x] Support switching among multiple subtitles within one segment to keep text from running too long P0
- [x] Support text-to-video generation P1
- [ ] Support more complex foreground designs and background effects P0
- [ ] Up/down/left/right camera movement P0
111 changes: 110 additions & 1 deletion projects/singularity_cinema/agent.yaml
@@ -130,6 +130,8 @@ foreground:
voice: male

voices:
# Only a subset of the available Chinese and English voices is listed here. For more voices, run the "edge-tts --list-voices" command to see the full list.
# Chinese voices - Mandarin
male:
voice: zh-CN-YunjianNeural
rate: '+0%'
@@ -150,8 +152,115 @@ voices:
rate: '-5%'
pitch: '-50Hz'

xiaoyi:
voice: zh-CN-XiaoyiNeural
rate: '+0%'
pitch: '+0Hz'

yunjie:
voice: zh-CN-YunjieNeural
rate: '+0%'
pitch: '+0Hz'

yunxi:
voice: zh-CN-YunxiNeural
rate: '+0%'
pitch: '+0Hz'

yunxia:
voice: zh-CN-YunxiaNeural
rate: '+0%'
pitch: '+0Hz'

# Chinese voices - Dialect
hsiaochen:
voice: zh-TW-HsiaoChenNeural
rate: '+0%'
pitch: '+0Hz'

yunjhe:
voice: zh-TW-YunJheNeural
rate: '+0%'
pitch: '+0Hz'

xiaoni:
voice: zh-CN-shaanxi-XiaoniNeural
rate: '+0%'
pitch: '+0Hz'

xiaobei:
voice: zh-CN-liaoning-XiaobeiNeural
rate: '+0%'
pitch: '+0Hz'

wanlung:
voice: zh-HK-WanLungNeural
rate: '+0%'
pitch: '+0Hz'

hiugaai:
voice: zh-HK-HiuGaaiNeural
rate: '+0%'
pitch: '+0Hz'

# English voices
aria:
voice: en-US-AriaNeural
rate: '+0%'
pitch: '+0Hz'

sonia:
voice: en-GB-SoniaNeural
rate: '+0%'
pitch: '+0Hz'

guy:
voice: en-US-GuyNeural
rate: '+0%'
pitch: '+0Hz'

jenny:
voice: en-US-JennyNeural
rate: '+0%'
pitch: '+0Hz'

natasha:
voice: en-AU-NatashaNeural
rate: '+0%'
pitch: '+0Hz'

william:
voice: en-AU-WilliamMultilingualNeural
rate: '+0%'
pitch: '+0Hz'

clara:
voice: en-CA-ClaraNeural
rate: '+0%'
pitch: '+0Hz'

liam:
voice: en-CA-LiamNeural
rate: '+0%'
pitch: '+0Hz'

ava:
voice: en-US-AvaNeural
rate: '+0%'
pitch: '+0Hz'

andrew:
voice: en-US-AndrewNeural
rate: '+0%'
pitch: '+0Hz'

emma:
voice: en-US-EmmaMultilingualNeural
voice: en-US-EmmaNeural
rate: '+0%'
pitch: '+0Hz'

ana:
voice: en-US-AnaNeural
rate: '+0%'
pitch: '+0Hz'
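
# Hypothetical example (not part of this diff): any short name printed by
# the "edge-tts --list-voices" command can be wired up as another entry in
# the same shape, e.g.:
# brian:
#   voice: en-US-BrianMultilingualNeural
#   rate: '+0%'
#   pitch: '+0Hz'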

105 changes: 91 additions & 14 deletions projects/singularity_cinema/step12_generate_subtitle/agent.py
@@ -13,6 +13,71 @@

logger = get_logger()

PUNCTUATION_OVERFLOW_ALLOWANCE = 2
# Not a raw string: with r'...' the backslash in \' would itself end up in the set.
PUNCT_CHARS = ',。!?;:,.!?;:、()[]{}"\'——“”《》<>—'


def _is_punct(tok: str) -> bool:
return len(tok) == 1 and tok in PUNCT_CHARS


def _tokenize_text(text: str) -> List[str]:
if not text:
return []
# Split by whitespace or punctuation, keeping single-char punctuation as tokens
tokens = re.split(r'(\s+|[' + re.escape(PUNCT_CHARS) + r'])', text)
tokens = [t for t in tokens if t and not t.isspace()]
return tokens
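
# Behavior sketch (illustrative, not from the diff): the capture group keeps
# punctuation as standalone tokens while whitespace is dropped, e.g.
#   _tokenize_text('你好,hello world')  ->  ['你好', ',', 'hello', 'world']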


def _chunk_tokens(tokens: List[str], max_len: int) -> List[str]:
chunks = []
cur = ''
for t in tokens:
if len(t) > max_len:
# If a single token exceeds max_len, split it
for i in range(0, len(t), max_len):
sub = t[i:i + max_len]
if cur:
chunks.append(cur.strip())
cur = ''
chunks.append(sub)
continue

candidate = (cur + t).strip() if cur else t
if len(candidate) <= max_len:
cur = candidate + ' '
continue

# If t is punctuation and can be merged with previous chunk (allowing slight overflow)
if _is_punct(t) and cur and len(cur) + len(
t) <= max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
cur = (cur + t).strip() + ' '
continue

if cur:
chunks.append(cur.strip())
cur = t + ' '

if cur.strip():
chunks.append(cur.strip())
return chunks


def _clean_chunks(chunks: List[str], max_len: int) -> List[str]:
cleaned = []
for c in chunks:
c = c.strip()
while c and _is_punct(c[0]) and cleaned:
if len(cleaned[-1]) + 1 > max_len + PUNCTUATION_OVERFLOW_ALLOWANCE:
break
cleaned[-1] += c[0]
c = c[1:].lstrip()

if c:
cleaned.append(c)
return cleaned
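
# Illustrative only: _clean_chunks folds a chunk's leading punctuation back
# onto the previous chunk when the overflow allowance permits, e.g.
#   _clean_chunks(['你好', ',世界'], max_len=10)  ->  ['你好,', '世界']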


class GenerateSubtitle(CodeAgent):

@@ -38,22 +103,34 @@ async def execute_code(self, messages, **kwargs):
logger.info('Generating subtitles.')
for i, seg in enumerate(segments):
text = seg.get('content', '')
subtitle = None
if self.subtitle_translate:
subtitle = await self.translate_text(text,
self.subtitle_translate)
output_file = os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}.png')
if os.path.exists(output_file):
continue
self.create_bilingual_subtitle_image(
source=text,
target=subtitle,
output_file=output_file,
width=1720,
height=180)
text_chunks = self.split_text_to_chunks(text)
for j, chunk_text in enumerate(text_chunks):
subtitle = None
if self.subtitle_translate:
subtitle = await self.translate_text(
chunk_text, self.subtitle_translate)

output_file = os.path.join(
self.subtitle_dir, f'bilingual_subtitle_{i + 1}_{j}.png')
if os.path.exists(output_file):
continue

self.create_bilingual_subtitle_image(
source=chunk_text,
target=subtitle,
output_file=output_file,
width=1720,
height=180)
return messages

def split_text_to_chunks(self, text, max_len: int = 30):
"""
Split text into chunks of max_len, prioritizing splits at punctuation.
"""
tokens = _tokenize_text(text)
chunks = _chunk_tokens(tokens, max_len)
return _clean_chunks(chunks, max_len)
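
# Illustrative usage (not part of the diff):
#   chunks = self.split_text_to_chunks(seg_text, max_len=30)
# Each chunk stays within max_len characters (punctuation may overflow by
# PUNCTUATION_OVERFLOW_ALLOWANCE), and an unbroken run longer than max_len
# is hard-split every max_len characters.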

async def translate_text(self, text, to_lang):

prompt = f"""You are a professional translation expert specializing in accurately and fluently translating text into {to_lang}.
85 changes: 61 additions & 24 deletions projects/singularity_cinema/step14_compose_video/agent.py
@@ -1,5 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import glob
import os
import re
from typing import Union

import json
import moviepy as mp
Expand All @@ -11,6 +14,24 @@

logger = get_logger()

_SUBTITLE_INDEX_RE = re.compile(
r'(\d+)(?=[^\d]*$)') # Match the last sequence of digits in the filename


def _get_subtitle_index_from_filename(path: str) -> Union[int, float]:
"""
Extract the trailing numeric index from a file path or filename.
Returns an int index; if parsing fails, returns float('inf') to sort invalid files to the end.
"""
name = os.path.basename(path)
m = _SUBTITLE_INDEX_RE.search(name)
if not m:
return float('inf')
try:
return int(m.group(1))
except (ValueError, TypeError):
return float('inf')
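
# For instance (illustrative):
#   _get_subtitle_index_from_filename('bilingual_subtitle_3_2.png')  ->  2
#   _get_subtitle_index_from_filename('no_digits.png')               ->  inf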


class ComposeVideo(CodeAgent):

@@ -285,27 +306,41 @@ def illustration_pos(t):
fg_clip = fg_clip.with_position(('center', 'center'))
fg_clip = fg_clip.with_duration(duration)
current_video_clips.append(fg_clip)
if self.config.use_subtitle:
if i < len(subtitle_paths
) and subtitle_paths[i] and os.path.exists(
subtitle_paths[i]):
subtitle_img = Image.open(subtitle_paths[i])
subtitle_w, subtitle_h = subtitle_img.size

# Validate subtitle dimensions
if subtitle_w <= 0 or subtitle_h <= 0:
logger.error(
f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {subtitle_paths[i]}'
)
else:
subtitle_clip = mp.ImageClip(
subtitle_paths[i], duration=duration)
subtitle_clip = subtitle_clip.resized(
(subtitle_w, subtitle_h))
subtitle_y = 900
subtitle_clip = subtitle_clip.with_position(
('center', subtitle_y))
current_video_clips.append(subtitle_clip)
if self.config.use_subtitle and i < len(
subtitle_paths) and subtitle_paths[i]:
current_segment_subs = subtitle_paths[i]
num_subs = len(current_segment_subs)
if num_subs > 0:
sub_duration = duration / num_subs
for j, sub_path in enumerate(current_segment_subs):
if sub_path and os.path.exists(sub_path):
subtitle_img = Image.open(sub_path)
subtitle_w, subtitle_h = subtitle_img.size

if subtitle_w <= 0 or subtitle_h <= 0:
logger.error(
f'Invalid subtitle dimensions: {subtitle_w}x{subtitle_h} for {sub_path}'
)
else:
target_h = 180 # TODO: Define as a class constant
if subtitle_h > target_h:
scale = target_h / subtitle_h
new_w = max(1, int(subtitle_w * scale))
new_h = target_h
else:
new_w, new_h = subtitle_w, subtitle_h

subtitle_clip = mp.ImageClip(
sub_path, duration=sub_duration)
subtitle_clip = subtitle_clip.resized(
(new_w, new_h))

subtitle_y = 900
subtitle_clip = subtitle_clip.with_position(
('center', subtitle_y))
subtitle_clip = subtitle_clip.with_start(
j * sub_duration)
current_video_clips.append(subtitle_clip)
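
# Net effect (sketch): the N subtitle chunks of a segment each get
# duration / N seconds and are started back-to-back via
# with_start(j * sub_duration), so they play out sequentially over
# the segment's audio.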

# Add background as top layer (transparent PNG with decorative elements)
if background_path and os.path.exists(background_path):
@@ -442,9 +477,11 @@ async def execute_code(self, messages, **kwargs):
f'Scene{i+1}.mov'))
audio_paths.append(
os.path.join(self.tts_dir, f'segment_{i + 1}.mp3'))
subtitle_paths.append(
os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}.png'))
pattern = os.path.join(self.subtitle_dir,
f'bilingual_subtitle_{i + 1}_*.png')
sub_files = sorted(
glob.glob(pattern), key=_get_subtitle_index_from_filename)
subtitle_paths.append(sub_files)
video_paths.append(
os.path.join(self.videos_dir, f'video_{i + 1}.mp4'))
