FluidInference · Alex-Wengg · Jun 26, 2026 · Jun 24, 2026
diff --git a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift
@@ -69,8 +69,14 @@ public enum Supertonic3Constants {
     public static let defaultSpeed: Float = 1.05
 
     /// Default silence inserted between text chunks when synthesizing long
-    /// utterances. 0.3 s mirrors the reference CLI default.
-    public static let defaultSilenceDuration: Float = 0.3
+    /// utterances. The 70-char chunk cap (#669) splits a paragraph into many
+    /// chunks; the reference CLI's 0.3 s pad stacks on top of the model's own
+    /// trailing sentence silence, inflating natural ~0.5–1.0 s sentence pauses
+    /// to ~1.1–1.2 s (the "unintended pauses" of #736). 0.05 s keeps the seams
+    /// from butting tokens together while letting the model's intrinsic
+    /// sentence prosody come through. Override via the `silenceDuration`
+    /// parameter of `synthesize` (CLI `--silence`).
+    public static let defaultSilenceDuration: Float = 0.05
 
     /// Max characters per chunk when synthesizing long English/Latin text.
     /// Although `textTFixed = 128` would *fit* ~110 chars, the model's output

diff --git a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift
@@ -107,7 +107,8 @@ public actor Supertonic3Manager {
     ///   - speed: Speech-rate multiplier (default 1.05). Divides the
     ///     predicted duration vector.
     ///   - silenceDuration: Silence inserted between chunks when the text
-    ///     is split into multiple chunks. Default 0.3 s.
+    ///     is split into multiple chunks. Default 0.05 s — see
+    ///     `Supertonic3Constants.defaultSilenceDuration`.
     public func synthesize(
         text: String,
         language: String,

diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift
@@ -92,6 +92,7 @@ public struct TTS {
         var supertonicVoiceStylePath: String? = nil
         var supertonicTotalSteps: Int = Supertonic3Constants.defaultTotalSteps
         var supertonicSpeed: Float = Supertonic3Constants.defaultSpeed
+        var supertonicSilence: Float = Supertonic3Constants.defaultSilenceDuration
         // VectorEstimator build: fp16 | int8/int6/int4 (ANE-bucketed) |
         // dyn-int8/dyn-int6/dyn-int4 (dynamic CPU/GPU). Default fp16.
         var supertonicVE: Supertonic3VectorEstimator = .aneBucketed(.int4)
@@ -189,6 +190,11 @@ public struct TTS {
                     supertonicSpeed = v
                     i += 1
                 }
+            case "--silence":
+                if i + 1 < arguments.count, let v = Float(arguments[i + 1]) {
+                    supertonicSilence = v
+                    i += 1
+                }
             case "--alpha":
                 if i + 1 < arguments.count, let v = Float(arguments[i + 1]) {
                     styletts2Alpha = v
@@ -314,6 +320,7 @@ public struct TTS {
                 text: text, output: output, language: supertonicLanguage,
                 voiceStylePath: supertonicVoiceStylePath, voiceName: voice,
                 totalSteps: supertonicTotalSteps, speed: supertonicSpeed,
+                silenceDuration: supertonicSilence,
                 vectorEstimator: supertonicVE,
                 metricsPath: metricsPath, cpuOnly: cpuOnly)
         }
@@ -803,6 +810,7 @@ public struct TTS {
         text: String, output: String, language: String,
         voiceStylePath: String?, voiceName: String,
         totalSteps: Int, speed: Float,
+        silenceDuration: Float,
         vectorEstimator: Supertonic3VectorEstimator,
         metricsPath: String?, cpuOnly: Bool
     ) async {
@@ -844,7 +852,8 @@ public struct TTS {
             let tSynth0 = Date()
             let result = try await manager.synthesize(
                 text: text, language: language, style: style,
-                totalSteps: totalSteps, speed: speed)
+                totalSteps: totalSteps, speed: speed,
+                silenceDuration: silenceDuration)
             let tSynth1 = Date()
 
             let outURL = resolveInputURL(output)
@@ -925,6 +934,7 @@ public struct TTS {
                                      --lang en                  ISO-639-1 language code (default en)
                                      --total-steps 8            denoising step count (default 8)
                                      --speed 1.05               duration multiplier (default 1.05)
+                                     --silence 0.05             inter-chunk silence seconds (default 0.05)
                                      --cpu-only                 disable Neural Engine
               --lexicon, -l        Custom pronunciation lexicon file (KokoroAne --variant zh only):
                                      word  pinyin1 pinyin2   (e.g. zi4 jie2)