Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions Sources/FluidAudio/TTS/Supertonic3/Supertonic3Constants.swift
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,14 @@ public enum Supertonic3Constants {
public static let defaultSpeed: Float = 1.05

/// Default silence inserted between text chunks when synthesizing long
/// utterances. 0.3 s mirrors the reference CLI default.
public static let defaultSilenceDuration: Float = 0.3
/// utterances. The 70-char chunk cap (#669) splits a paragraph into many
/// chunks; the reference CLI's 0.3 s pad stacks on top of the model's own
/// trailing sentence silence, inflating natural ~0.5–1.0 s sentence pauses
/// to ~1.1–1.2 s (the "unintended pauses" of #736). 0.05 s keeps the seams
/// from butting tokens together while letting the model's intrinsic
/// sentence prosody come through. Override via the synthesize parameter
/// (CLI `--silence`).
public static let defaultSilenceDuration: Float = 0.05

/// Max characters per chunk when synthesizing long English/Latin text.
/// Although `textTFixed = 128` would *fit* ~110 chars, the model's output
Expand Down
3 changes: 2 additions & 1 deletion Sources/FluidAudio/TTS/Supertonic3/Supertonic3Manager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ public actor Supertonic3Manager {
/// - speed: Speech-rate multiplier (default 1.05). Divides the
/// predicted duration vector.
/// - silenceDuration: Silence inserted between chunks when the text
/// is split into multiple chunks. Default 0.3 s.
/// is split into multiple chunks. Default 0.05 s — see
/// `Supertonic3Constants.defaultSilenceDuration`.
public func synthesize(
text: String,
language: String,
Expand Down
12 changes: 11 additions & 1 deletion Sources/FluidAudioCLI/Commands/TTSCommand.swift
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ public struct TTS {
var supertonicVoiceStylePath: String? = nil
var supertonicTotalSteps: Int = Supertonic3Constants.defaultTotalSteps
var supertonicSpeed: Float = Supertonic3Constants.defaultSpeed
var supertonicSilence: Float = Supertonic3Constants.defaultSilenceDuration
// VectorEstimator build: fp16 | int8/int6/int4 (ANE-bucketed) |
// dyn-int8/dyn-int6/dyn-int4 (dynamic CPU/GPU). CLI default: ANE-bucketed int4.
var supertonicVE: Supertonic3VectorEstimator = .aneBucketed(.int4)
Expand Down Expand Up @@ -189,6 +190,11 @@ public struct TTS {
supertonicSpeed = v
i += 1
}
case "--silence":
if i + 1 < arguments.count, let v = Float(arguments[i + 1]) {
supertonicSilence = v
i += 1
}
case "--alpha":
if i + 1 < arguments.count, let v = Float(arguments[i + 1]) {
styletts2Alpha = v
Expand Down Expand Up @@ -314,6 +320,7 @@ public struct TTS {
text: text, output: output, language: supertonicLanguage,
voiceStylePath: supertonicVoiceStylePath, voiceName: voice,
totalSteps: supertonicTotalSteps, speed: supertonicSpeed,
silenceDuration: supertonicSilence,
vectorEstimator: supertonicVE,
metricsPath: metricsPath, cpuOnly: cpuOnly)
}
Expand Down Expand Up @@ -803,6 +810,7 @@ public struct TTS {
text: String, output: String, language: String,
voiceStylePath: String?, voiceName: String,
totalSteps: Int, speed: Float,
silenceDuration: Float,
vectorEstimator: Supertonic3VectorEstimator,
metricsPath: String?, cpuOnly: Bool
) async {
Expand Down Expand Up @@ -844,7 +852,8 @@ public struct TTS {
let tSynth0 = Date()
let result = try await manager.synthesize(
text: text, language: language, style: style,
totalSteps: totalSteps, speed: speed)
totalSteps: totalSteps, speed: speed,
silenceDuration: silenceDuration)
let tSynth1 = Date()

let outURL = resolveInputURL(output)
Expand Down Expand Up @@ -925,6 +934,7 @@ public struct TTS {
--lang en ISO-639-1 language code (default en)
--total-steps 8 denoising step count (default 8)
--speed 1.05 duration multiplier (default 1.05)
--silence 0.05 inter-chunk silence seconds (default 0.05)
--cpu-only disable Neural Engine
--lexicon, -l Custom pronunciation lexicon file (KokoroAne --variant zh only):
word pinyin1 pinyin2 (e.g. zi4 jie2)
Expand Down
Loading