diff --git a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift index f3ee0f76b..b80a0cdb0 100644 --- a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift +++ b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift @@ -69,8 +69,14 @@ public enum Supertonic3Constants { public static let defaultSpeed: Float = 1.05 /// Default silence inserted between text chunks when synthesizing long - /// utterances. 0.3 s mirrors the reference CLI default. - public static let defaultSilenceDuration: Float = 0.3 + /// utterances. The 70-char chunk cap (#669) splits a paragraph into many + /// chunks; the reference CLI's 0.3 s pad stacks on top of the model's own + /// trailing sentence silence, inflating natural ~0.5–1.0 s sentence pauses + /// to ~1.1–1.2 s (the "unintended pauses" of #736). 0.05 s keeps the seams + /// from butting tokens together while letting the model's intrinsic + /// sentence prosody come through. Override via the synthesize parameter + /// (CLI `--silence`). + public static let defaultSilenceDuration: Float = 0.05 /// Max characters per chunk when synthesizing long English/Latin text. /// Although `textTFixed = 128` would *fit* ~110 chars, the model's output diff --git a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift index f34e19e26..c25961df4 100644 --- a/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift +++ b/Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift @@ -107,7 +107,8 @@ public actor Supertonic3Manager { /// - speed: Speech-rate multiplier (default 1.05). Divides the /// predicted duration vector. /// - silenceDuration: Silence inserted between chunks when the text - /// is split into multiple chunks. Default 0.3 s. + /// is split into multiple chunks. Default 0.05 s — see + /// `Supertonic3Constants.defaultSilenceDuration`. public func synthesize( text: String, language: String, diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift index 32ebdedca..287453c98 100644 --- a/Sources/FluidAudioCLI/Commands/TTSCommand.swift +++ b/Sources/FluidAudioCLI/Commands/TTSCommand.swift @@ -92,6 +92,7 @@ public struct TTS { var supertonicVoiceStylePath: String? = nil var supertonicTotalSteps: Int = Supertonic3Constants.defaultTotalSteps var supertonicSpeed: Float = Supertonic3Constants.defaultSpeed + var supertonicSilence: Float = Supertonic3Constants.defaultSilenceDuration // VectorEstimator build: fp16 | int8/int6/int4 (ANE-bucketed) | // dyn-int8/dyn-int6/dyn-int4 (dynamic CPU/GPU). Default fp16. var supertonicVE: Supertonic3VectorEstimator = .aneBucketed(.int4) @@ -189,6 +190,11 @@ public struct TTS { supertonicSpeed = v i += 1 } + case "--silence": + if i + 1 < arguments.count, let v = Float(arguments[i + 1]) { + supertonicSilence = v + i += 1 + } case "--alpha": if i + 1 < arguments.count, let v = Float(arguments[i + 1]) { styletts2Alpha = v @@ -314,6 +320,7 @@ public struct TTS { text: text, output: output, language: supertonicLanguage, voiceStylePath: supertonicVoiceStylePath, voiceName: voice, totalSteps: supertonicTotalSteps, speed: supertonicSpeed, + silenceDuration: supertonicSilence, vectorEstimator: supertonicVE, metricsPath: metricsPath, cpuOnly: cpuOnly) } @@ -803,6 +810,7 @@ public struct TTS { text: String, output: String, language: String, voiceStylePath: String?, voiceName: String, totalSteps: Int, speed: Float, + silenceDuration: Float, vectorEstimator: Supertonic3VectorEstimator, metricsPath: String?, cpuOnly: Bool ) async { @@ -844,7 +852,8 @@ public struct TTS { let tSynth0 = Date() let result = try await manager.synthesize( text: text, language: language, style: style, - totalSteps: totalSteps, speed: speed) + totalSteps: totalSteps, speed: speed, + silenceDuration: silenceDuration) let tSynth1 = Date() let outURL = resolveInputURL(output) @@ -925,6 +934,7 @@ public struct TTS { --lang en ISO-639-1 language code (default en) --total-steps 8 denoising step count (default 8) --speed 1.05 duration multiplier (default 1.05) + --silence 0.05 inter-chunk silence seconds (default 0.05) --cpu-only disable Neural Engine --lexicon, -l Custom pronunciation lexicon file (KokoroAne --variant zh only): word pinyin1 pinyin2 (e.g. zi4 jie2)