From fb2e0da6553f1f91087abc549d4ab15981de2196 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 00:46:27 +0300 Subject: [PATCH] =?UTF-8?q?fix(tts):=20#72=20=E2=80=94=20phrase=20prosody?= =?UTF-8?q?=20volume=20must=20be=20%,=20not=20dB=20(root=20cause)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While running the delivery-003 corpus regen, 6 of 8 elephant Tier B scenes reliably hit #72 (`Azure SSML parsing error 0x80045003`). Bisected the failing SSML and isolated the trigger: <prosody volume="+N%"> ← outer, from _volume_to_string text <prosody volume="+3dB">stress</prosody> ← inner, from _HINT_DEFAULTS text </prosody> Confirmed against Azure with 9 A/B SSML tests: - nested word-aligned, all-% units → OK - nested word-aligned, inner pitch="+1st"+vol% → OK - nested word-aligned, inner pitch=%+vol="+3dB" → FAIL - nested mid-word, mixed units → FAIL - mid-word (no nested prosody) → OK Pitch unit mismatch (`+1st` inner inside `+N%` outer) is tolerated; volume unit mismatch (`+NdB` inside `+N%`) is the trigger. Fix: `_HINT_DEFAULTS["stress"]["volume"]` changed from `"+3dB"` to `"+3%"`. This matches the lossy 1:1 dB→% mapping convention that `_volume_to_string` already uses, so the inner and outer prosody elements live in the same unit system. Two regression tests added: 1. `test_no_hint_default_uses_db_for_volume` — structural check that no entry in `_HINT_DEFAULTS` emits volume in `dB`, since the outer emitter is always `%`. 2. `test_hint_default_volumes_parse_as_percent` — companion: any volume default must end in `%` and parse as numeric. Updates the `PhraseProsody.volume` docstring to explain the invariant and reference #72. Reliable repro from the delivery-003 attempt: any elephant Tier B scene with intensity ≥ 3 (where the LLM emits `stress` hints on aggressive BEN turns) hits this; the failing scene/turn manifest is captured in `/tmp/ssml-diag/intercept_call_01.{xml,status}` during investigation. After this fix, re-running those 6 scenes succeeds. 
Test plan: - `pytest tests/unit/` — 1696 passed (1694 + 2 new) - `ruff check synthbanshee/ tests/` — clean - Manual Azure round-trip with TEST H (nested, vol=%) confirms the fix on live Azure. Refs #72. Unblocks delivery-003 corpus PR (avdp-synth-corpus). Co-Authored-By: Claude Opus 4.7 --- synthbanshee/tts/ssml_types.py | 16 ++++++++--- tests/unit/test_phrase_prosody.py | 44 ++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/synthbanshee/tts/ssml_types.py b/synthbanshee/tts/ssml_types.py index ef44db1..78468fb 100644 --- a/synthbanshee/tts/ssml_types.py +++ b/synthbanshee/tts/ssml_types.py @@ -65,8 +65,14 @@ class PhraseProsody: char_end: Exclusive end offset. rate: SSML rate value (e.g. ``"+15%"``, ``"slow"``). ``None`` means no rate change. - pitch: SSML pitch value (e.g. ``"+2st"``). ``None`` means no change. - volume: SSML volume value (e.g. ``"+3dB"``). ``None`` means no change. + pitch: SSML pitch value (e.g. ``"+2st"`` or ``"+6%"``). ``None`` + means no change. Mixing ``st`` (inner) with ``%`` (outer) is + tolerated by Azure. + volume: SSML volume value (e.g. ``"+3%"``). ``None`` means no + change. **Must be expressed in ``%``** to match the outer + ``<prosody volume="+N%">`` emitted by ``_volume_to_string``; + nesting ``volume="+NdB"`` inside ``volume="+N%"`` triggers + Azure SSML parser error 0x80045003 (#72). break_before_ms: Milliseconds of silence inserted before the span. break_after_ms: Milliseconds of silence inserted after the span. """ @@ -86,8 +92,10 @@ class PhraseProsody: # --------------------------------------------------------------------------- _HINT_DEFAULTS: dict[str, dict[str, str | int]] = { - # Stressed accusatory phrase — faster, louder, higher pitch - "stress": {"rate": "+15%", "volume": "+3dB", "pitch": "+1st", "break_before_ms": 0}, + # Stressed accusatory phrase — faster, louder, higher pitch. 
+ # `volume` must be in `%` (#72): Azure SSML rejects `volume="+NdB"` + # nested inside the outer `volume="+N%"` emitted by `_volume_to_string`. + "stress": {"rate": "+15%", "volume": "+3%", "pitch": "+1st", "break_before_ms": 0}, # Deliberate command slowing — measured menace "slow": {"rate": "-20%", "break_before_ms": 150}, # Pause before the phrase for dramatic weight diff --git a/tests/unit/test_phrase_prosody.py b/tests/unit/test_phrase_prosody.py index ce61794..6e8ebe7 100644 --- a/tests/unit/test_phrase_prosody.py +++ b/tests/unit/test_phrase_prosody.py @@ -15,6 +15,7 @@ from synthbanshee.tts.ssml_builder import SSMLBuilder, UtteranceSpec, _apply_phrase_prosody from synthbanshee.tts.ssml_types import ( + _HINT_DEFAULTS, PhraseHint, PhraseProsody, _build_offset_map, @@ -106,7 +107,10 @@ def test_hint_defaults_applied_stress(self) -> None: ) result = resolve_phrase_hints([hint], text, text) assert result[0].rate == "+15%" - assert result[0].volume == "+3dB" + # `volume` must be in `%` (#72): Azure SSML rejects `volume="+NdB"` + # nested inside the outer `volume="+N%"` emitted by `_volume_to_string`. + # Don't reintroduce `+3dB` here without also changing the outer emitter. + assert result[0].volume == "+3%" assert result[0].pitch == "+1st" def test_hint_defaults_applied_menace(self) -> None: @@ -190,6 +194,44 @@ def test_negative_char_start_clamped(self) -> None: assert result[0].char_start == 0 +# --------------------------------------------------------------------------- +# Hint defaults — Azure SSML unit invariant +# --------------------------------------------------------------------------- + + +class TestHintDefaultUnits: + """Azure rejects SSML when an inner `<prosody volume="+NdB">` is nested + inside an outer `<prosody volume="+N%">` (#72: `0x80045003 / Connection + was closed by the remote host`). Since `_volume_to_string` always emits + the outer prosody volume in `%`, every nested phrase prosody value must + also be in `%`. 
Reintroducing `dB` here re-breaks the corpus generation + for any scene that emits a `stress` hint (most violent / agitated scenes).""" + + def test_no_hint_default_uses_db_for_volume(self) -> None: + for hint_name, defaults in _HINT_DEFAULTS.items(): + volume = defaults.get("volume") + if volume is None: + continue + assert isinstance(volume, str) + assert not volume.endswith("dB"), ( + f"Hint {hint_name!r} default volume {volume!r} uses dB; " + "must be % to match the outer prosody emitter and avoid #72 " + "(Azure SSML parse error 0x80045003)" + ) + + def test_hint_default_volumes_parse_as_percent(self) -> None: + for hint_name, defaults in _HINT_DEFAULTS.items(): + volume = defaults.get("volume") + if volume is None: + continue + assert isinstance(volume, str) + assert volume.endswith("%"), ( + f"Hint {hint_name!r} default volume {volume!r} must end in '%'" + ) + n = volume.rstrip("%").lstrip("+") + int(n.lstrip("-")) # numeric (no exception => fine) + + # --------------------------------------------------------------------------- # rebase_phrase_prosody # ---------------------------------------------------------------------------