Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/voice/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
ConversationItemAddedEvent,
ErrorEvent,
FunctionToolsExecutedEvent,
FunctionToolsExecutingEvent,
MetricsCollectedEvent,
RunContext,
SpeechCreatedEvent,
Expand Down Expand Up @@ -43,6 +44,7 @@
"UserStateChangedEvent",
"AgentStateChangedEvent",
"FunctionToolsExecutedEvent",
"FunctionToolsExecutingEvent",
"AgentFalseInterruptionEvent",
"TranscriptSynchronizer",
"io",
Expand Down
9 changes: 9 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
AgentFalseInterruptionEvent,
ErrorEvent,
FunctionToolsExecutedEvent,
FunctionToolsExecutingEvent,
MetricsCollectedEvent,
SpeechCreatedEvent,
UserInputTranscribedEvent,
Expand Down Expand Up @@ -1926,6 +1927,10 @@ def _on_first_frame(_: asyncio.Future[None]) -> None:
# messages in RunResult are ordered by the `created_at` field
def _tool_execution_started_cb(fnc_call: llm.FunctionCall) -> None:
    """Record a started tool call and announce it on the session.

    Args:
        fnc_call: The function call whose execution has just started.
    """
    # Attach the call to the speech handle first, then notify listeners,
    # so the emit always happens after the item has been recorded.
    speech_handle._item_added([fnc_call])
    executing_event = FunctionToolsExecutingEvent(function_call=fnc_call)
    self._session.emit("function_tools_executing", executing_event)

def _tool_execution_completed_cb(out: ToolExecutionOutput) -> None:
if out.fnc_call_out:
Expand Down Expand Up @@ -2396,6 +2401,10 @@ def _tool_execution_started_cb(fnc_call: llm.FunctionCall) -> None:
speech_handle._item_added([fnc_call])
self._agent._chat_ctx.items.append(fnc_call)
self._session._tool_items_added([fnc_call])
self._session.emit(
"function_tools_executing",
FunctionToolsExecutingEvent(function_call=fnc_call),
)

def _tool_execution_completed_cb(out: ToolExecutionOutput) -> None:
if out.fnc_call_out:
Expand Down
59 changes: 53 additions & 6 deletions livekit-agents/livekit/agents/voice/background_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from ..utils.aio import cancel_and_wait
from ..utils.audio import audio_frames_from_file
from .agent_session import AgentSession
from .events import AgentStateChangedEvent
from .events import AgentStateChangedEvent, FunctionToolsExecutedEvent, FunctionToolsExecutingEvent

_resource_stack = contextlib.ExitStack()
atexit.register(_resource_stack.close)
Expand Down Expand Up @@ -73,18 +73,21 @@ def __init__(
thinking_sound: NotGivenOr[
AudioSource | AudioConfig | list[AudioConfig] | None
] = NOT_GIVEN,
tool_calling_sound: NotGivenOr[
AudioSource | AudioConfig | list[AudioConfig] | None
] = NOT_GIVEN,
stream_timeout_ms: int = 200,
) -> None:
"""
Initializes the BackgroundAudio component with optional ambient and thinking sounds.
Initializes the BackgroundAudio component with optional ambient, thinking, and tool calling sounds.

This component creates and publishes a continuous audio track to a LiveKit room while managing
the playback of ambient and agent thinking sounds. It supports three types of audio sources:
the playback of ambient and agent "thinking" sounds. It supports three types of audio sources:
- A BuiltinAudioClip enum value, which will use a pre-defined sound from the package resources
- A file path (string) pointing to an audio file, which can be looped.
- An AsyncIterator that yields rtc.AudioFrame

When a list (or AudioConfig) is supplied, the component considers each sounds volume and probability:
When a list (or AudioConfig) is supplied, the component considers each sound's volume and probability:
- The probability value determines the chance that a particular sound is selected for playback.
- A total probability below 1.0 means there is a chance no sound will be selected (resulting in silence).

Expand All @@ -94,13 +97,21 @@ def __init__(
For AsyncIterator sources, ensure the iterator is infinite or looped.

thinking_sound (NotGivenOr[Union[AudioSource, AudioConfig, List[AudioConfig], None]], optional):
The sound to be played when the associated agent enters a “thinking” state. This can be a single
sound source or a list of AudioConfig objects (with volume and probability settings).
The sound to be played when the associated agent enters a "thinking" state (LLM processing).
This can be a single sound source or a list of AudioConfig objects (with volume and
probability settings). If tool_calling_sound is also provided, this sound will be stopped
when a tool starts executing.

tool_calling_sound (NotGivenOr[Union[AudioSource, AudioConfig, List[AudioConfig], None]], optional):
The sound to be played when a function tool starts executing. This allows for a different
sound during tool execution vs regular LLM thinking. When tool execution completes, the
sound will stop. If not provided, thinking_sound will continue playing during tool execution.

""" # noqa: E501

self._ambient_sound = ambient_sound if is_given(ambient_sound) else None
self._thinking_sound = thinking_sound if is_given(thinking_sound) else None
self._tool_calling_sound = tool_calling_sound if is_given(tool_calling_sound) else None

self._audio_source = rtc.AudioSource(48000, 1, queue_size_ms=_AUDIO_SOURCE_BUFFER_MS)
self._audio_mixer = rtc.AudioMixer(
Expand All @@ -116,6 +127,7 @@ def __init__(

self._ambient_handle: PlayHandle | None = None
self._thinking_handle: PlayHandle | None = None
self._tool_calling_handle: PlayHandle | None = None

def _select_sound_from_list(self, sounds: list[AudioConfig]) -> AudioConfig | None:
"""
Expand Down Expand Up @@ -266,6 +278,11 @@ async def start(

if self._agent_session:
self._agent_session.on("agent_state_changed", self._agent_state_changed)
if self._tool_calling_sound:
self._agent_session.on(
"function_tools_executing", self._function_tools_executing
)
self._agent_session.on("function_tools_executed", self._function_tools_executed)

if self._ambient_sound:
normalized = self._normalize_sound_source(
Expand Down Expand Up @@ -301,6 +318,13 @@ async def aclose(self) -> None:

if self._agent_session:
self._agent_session.off("agent_state_changed", self._agent_state_changed)
if self._tool_calling_sound:
self._agent_session.off(
"function_tools_executing", self._function_tools_executing
)
self._agent_session.off(
"function_tools_executed", self._function_tools_executed
)

self._room.off("reconnected", self._on_reconnected)

Expand Down Expand Up @@ -331,6 +355,29 @@ def _agent_state_changed(self, ev: AgentStateChangedEvent) -> None:
elif self._thinking_handle:
self._thinking_handle.stop()

def _function_tools_executing(self, ev: FunctionToolsExecutingEvent) -> None:
    """Swap the thinking sound for the tool-calling sound when a tool starts.

    Does nothing when no tool-calling sound is configured. Otherwise any
    thinking sound that is still playing is stopped, and the tool-calling
    sound is started in loop mode unless a previous tool-calling handle is
    still active.

    Args:
        ev: The "function_tools_executing" event; its payload is not read.
    """
    tool_sound = self._tool_calling_sound
    if not tool_sound:
        return

    # Silence the thinking sound so the two do not overlap.
    thinking = self._thinking_handle
    if thinking and not thinking.done():
        thinking.stop()

    # Only start a new looping playback when none is currently running.
    current = self._tool_calling_handle
    already_playing = bool(current) and not current.done()
    if not already_playing:
        self._tool_calling_handle = self.play(
            cast(Union[AudioSource, AudioConfig, list[AudioConfig]], tool_sound),
            loop=True,
        )

def _function_tools_executed(self, ev: FunctionToolsExecutedEvent) -> None:
    """Stop the tool-calling sound once tool execution has completed.

    Args:
        ev: The "function_tools_executed" event; its payload is not read.
    """
    handle = self._tool_calling_handle
    # Nothing to stop when no handle exists or playback already finished.
    if not handle or handle.done():
        return
    handle.stop()

@log_exceptions(logger=logger)
async def _play_task(
self, play_handle: PlayHandle, sound: AudioSource, volume: float, loop: bool
Expand Down
15 changes: 15 additions & 0 deletions livekit-agents/livekit/agents/voice/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ async def wait_for_playout(self) -> None:
"user_input_transcribed",
"conversation_item_added",
"agent_false_interruption",
"function_tools_executing",
"function_tools_executed",
"metrics_collected",
"speech_created",
Expand Down Expand Up @@ -155,6 +156,19 @@ class ConversationItemAddedEvent(BaseModel):
created_at: float = Field(default_factory=time.time)


class FunctionToolsExecutingEvent(BaseModel):
    """Event announcing that a function tool call is starting to execute.

    Listeners can use it to react as soon as a tool call starts, for
    example by playing a different sound while the tool runs.
    """

    # Discriminator literal; it matches the "function_tools_executing" event name.
    type: Literal["function_tools_executing"] = "function_tools_executing"
    function_call: FunctionCall
    """The function call that is about to be executed."""
    # Timestamp taken from time.time() when the event object is created.
    created_at: float = Field(default_factory=time.time)


class FunctionToolsExecutedEvent(BaseModel):
type: Literal["function_tools_executed"] = "function_tools_executed"
function_calls: list[FunctionCall]
Expand Down Expand Up @@ -233,6 +247,7 @@ class CloseEvent(BaseModel):
AgentFalseInterruptionEvent,
MetricsCollectedEvent,
ConversationItemAddedEvent,
FunctionToolsExecutingEvent,
FunctionToolsExecutedEvent,
SpeechCreatedEvent,
ErrorEvent,
Expand Down
Loading