diff --git a/.fern/replay.lock b/.fern/replay.lock index 536b6e4..fcd9525 100644 --- a/.fern/replay.lock +++ b/.fern/replay.lock @@ -6,5 +6,12089 @@ generations: timestamp: 2026-05-20T20:38:02.180Z cli_version: unknown generator_versions: {} -current_generation: a217c8ecfd919345831eebaca8295e292d65ebcf -patches: [] + - commit_sha: f652c69edbd1815c832fc9354c193090ac8dde8e + tree_hash: 6a32ee744683b30c1c77191210d46b16a2a78ca4 + timestamp: 2026-06-02T00:10:16.318Z + cli_version: unknown + generator_versions: + fernapi/fern-python-sdk: 4.37.0 +current_generation: f652c69edbd1815c832fc9354c193090ac8dde8e +patches: + - id: patch-6e30398b + content_hash: sha256:e99898e508e2d6cb9f134cc33e0b73c1c8acb845f5887924e0e38031a6e089c0 + original_commit: 6e30398b5dc6e8ff2681a442a4d6a49c7d866032 + original_message: "chore(agentkit): bump to v1.5.0 and expose v2.7 type aliases" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/vendors/__init__.py + patch_content: | + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 1942bce..5ceda66 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -1,13 +1,30 @@ + from .agent import ( + Agent, + + AgentConfig, + + AgentConfigUpdate, + + ConversationHistory, + + ConversationRole, + + ConversationSessionTurn, + + ConversationTurn, + + ConversationTurns, + StartAgentsRequestProperties, + + AvatarConfig, + + AvatarVendor, + GeofenceConfig, + + LlmConfig, + + LlmStyle, + + MllmConfig, + + MllmVendor, + RtcConfig, + + SttConfig, + + SttVendor, + + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + @@ -37,9 +54,14 @@ from .agent 
import ( + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + + SessionInfo, + + SessionListResponse, + + SessionSummary, + + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + @@ -57,8 +79,10 @@ from ..agent_management.types.agent_think_agent_management_request_on_speaking_a + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -112,6 +136,7 @@ from .vendors import ( + FishAudioTTS, + Gemini, + GeminiLive, + + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + @@ -132,14 +157,27 @@ from .vendors import ( + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + + XaiGrok, + + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + + "AgentConfig", + + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + + "LlmConfig", + + "LlmStyle", + + "SttConfig", + + "SttVendor", + + "TtsConfig", + + "MllmConfig", + + "MllmVendor", + + "AvatarConfig", + + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + @@ -147,6 +185,7 @@ __all__ = [ + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + @@ -181,6 +220,7 @@ __all__ = [ + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + @@ -197,6 +237,15 @@ __all__ = [ + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + + "SessionInfo", + + "SessionListResponse", + + 
"SessionSummary", + + "ConversationHistory", + + "ConversationTurn", + + "ConversationRole", + + "ConversationTurns", + + "ConversationSessionTurn", + + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + @@ -253,14 +302,19 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + + "is_generic_avatar", + + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 0320843..689eab1 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -11,9 +11,9 @@ from .base import ( + OpenAISampleRate, + SampleRate, + ) + -from .avatar import AkoolAvatar, AnamAvatar, HeyGenAvatar, LiveAvatarAvatar + +from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + @@ -82,8 +82,11 @@ __all__ = [ + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + + "XaiGrok", + + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + + "GenericAvatar", + ] + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + 
MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + 
validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + VertexAI, + XaiGrok, + XaiRealtime, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", 
+ "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + "AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + 
"OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + 
"MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + status: unresolved + - id: patch-9df782b4 + content_hash: sha256:84c08fe3239d2ecb0b0a3ddd33b0dce4e7b012125be797aa83ca12893363b565 + original_commit: 9df782b46d872599f103078e30c5ded2053f2517 + original_message: "feat(agentkit): update MLLM and LLM vendor wrappers for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + From 9df782b46d872599f103078e30c5ded2053f2517 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:57:54 -0400 + Subject: [PATCH] feat(agentkit): update MLLM and LLM vendor wrappers for v2.7 + + Adds xAI Grok Realtime and Vertex AI MLLM wrappers, and aligns MLLM + config serialization with the generated core types. LLM vendors now + accept typed greeting_configs and serialize them through the generated + model shape, including interruptable. 
+ --- + src/agora_agent/agentkit/vendors/llm.py | 31 ++++-- + src/agora_agent/agentkit/vendors/mllm.py | 118 +++++++++++++++++------ + 2 files changed, 113 insertions(+), 36 deletions(-) + + diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py + index 7465c9f..6f74b43 100644 + --- a/src/agora_agent/agentkit/vendors/llm.py + +++ b/src/agora_agent/agentkit/vendors/llm.py + @@ -1,9 +1,14 @@ + -from typing import Any, Dict, List, Optional + +from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + +from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + + StartAgentsRequestPropertiesLlmGreetingConfigs, + +) + from .base import BaseLLM + + +LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + @@ -15,6 +20,14 @@ def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]] + result.append(item) + return result + + + + +def _dump_optional_model(value: Any) -> Any: + + if hasattr(value, "model_dump"): + + return value.model_dump(exclude_none=True) + + if hasattr(value, "dict"): + + return value.dict(exclude_none=True) + + return value + + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + @@ -31,7 +44,7 @@ class OpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -74,7 +87,7 @@ class OpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -104,7 +117,7 @@ class AzureOpenAIOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: 
Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -150,7 +163,7 @@ class AzureOpenAI(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + @@ -177,7 +190,7 @@ class AnthropicOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -216,7 +229,7 @@ class Anthropic(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + @@ -246,7 +259,7 @@ class GeminiOptions(BaseModel): + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + - greeting_configs: Optional[Dict[str, Any]] = Field(default=None) + + 
greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + @@ -287,7 +300,7 @@ class Gemini(BaseLLM): + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + - config["greeting_configs"] = self.options.greeting_configs + + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index 5f6f940..cd6cd07 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,3 +1,4 @@ + +import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -22,9 +23,7 @@ class OpenAIRealtimeOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -53,18 +52,97 @@ class OpenAIRealtime(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if 
self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + +# xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + +# is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + + + +class XaiGrokOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="xAI API key") + + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + + + +class XaiGrok(BaseMLLM): + + """xAI 
Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + + + def __init__(self, **kwargs: Any): + + self.options = XaiGrokOptions(**kwargs) + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = dict(self.options.params or {}) + + if self.options.voice is not None: + + params["voice"] = self.options.voice + + if self.options.language is not None: + + params["language"] = self.options.language + + if self.options.sample_rate is not None: + + params["sample_rate"] = self.options.sample_rate + + + + config: Dict[str, Any] = { + + "vendor": "xai", + + "api_key": self.options.api_key, + + "url": self.options.url, + + "params": params, + + } + + + + if self.options.greeting_message is not None: + + config["greeting_message"] = self.options.greeting_message + + if self.options.input_modalities is not None: + + config["input_modalities"] = self.options.input_modalities + + if self.options.output_modalities is not None: + + config["output_modalities"] = self.options.output_modalities + + if self.options.messages is not None: + + config["messages"] = self.options.messages + + if self.options.failure_message is not None: + + config["failure_message"] = self.options.failure_message + + if self.options.turn_detection is not None: + + config["turn_detection"] = self.options.turn_detection + + + + return config + + + + + +class XaiRealtimeOptions(XaiGrokOptions): + + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + + + def __init__(self, **data: Any): + + warnings.warn( + + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**data) + + + + + +class XaiRealtime(XaiGrok): + + """Deprecated: use :class:`XaiGrok` instead.""" + + + + def __init__(self, **kwargs: Any): + + warnings.warn( + + "XaiRealtime is deprecated; use XaiGrok instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + super().__init__(**kwargs) + + + + + class VertexAIOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + @@ -81,28 +159,24 @@ class VertexAIOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + - params: Dict[str, Any] = { + - "model": self.options.model, + - "project_id": self.options.project_id, + - "location": self.options.location, + - "adc_credentials_string": self.options.adc_credentials_string, + - } + - + + # additional_params spread first so that explicit fields always win, + + # matching the TypeScript SDK. 
+ + params: Dict[str, Any] = dict(self.options.additional_params or {}) + + params["model"] = self.options.model + + params["project_id"] = self.options.project_id + + params["location"] = self.options.location + + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + - if self.options.additional_params is not None: + - params.update(self.options.additional_params) + + config: Dict[str, Any] = { + "vendor": "vertexai", + @@ -119,12 +193,8 @@ class VertexAI(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + @@ -145,9 +215,7 @@ class GeminiLiveOptions(BaseModel): + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + - predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + - max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + @@ -179,12 
+247,8 @@ class GeminiLive(BaseMLLM): + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + - if self.options.predefined_tools is not None: + - config["predefined_tools"] = self.options.predefined_tools + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + - if self.options.max_history is not None: + - config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/vendors/llm.py: | + from typing import Any, Dict, List, Optional, Union + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( + StartAgentsRequestPropertiesLlmGreetingConfigs, + ) + from .base import BaseLLM + + LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] + + + def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Ensure each MCP server has transport set (API requires it). 
Default to streamable_http.""" + result = [] + for s in servers: + item = dict(s) + if item.get("transport") is None: + item["transport"] = "streamable_http" + result.append(item) + return result + + + def _dump_optional_model(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + class OpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + model: str = Field(default="gpt-4o-mini", description="Model name") + base_url: Optional[str] = Field(default=None, description="Custom base URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class OpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # model is the default; explicit params entries extend/override it. + # This matches the TS SDK behaviour: { model, ...params }. 
+ params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + + # Named fields take precedence over anything in the generic params dict. + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.base_url or "https://api.openai.com/v1/chat/completions", + "params": params, + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.api_key is not None: + config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AzureOpenAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Azure OpenAI API key") + endpoint: str = Field(..., 
description="Azure endpoint URL") + deployment_name: str = Field(..., description="Azure deployment name") + api_version: str = Field(default="2024-08-01-preview", description="Azure API version") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + max_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class AzureOpenAI(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AzureOpenAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + url = ( + f"{self.options.endpoint}/openai/deployments/" + f"{self.options.deployment_name}/chat/completions" + f"?api-version={self.options.api_version}" + ) + config: Dict[str, Any] = { + "url": url, + "api_key": self.options.api_key, + "vendor": self.options.vendor or "azure", + "style": "openai", + "input_modalities": self.options.input_modalities or ["text"], + } + + # Named fields take precedence over anything in the generic params dict. 
+ params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if params: + config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class AnthropicOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anthropic API key") + model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + max_tokens: Optional[int] = Field(default=None, gt=0) + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + 
failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Anthropic(BaseLLM): + def __init__(self, **kwargs: Any): + self.options = AnthropicOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or "https://api.anthropic.com/v1/messages", + "api_key": self.options.api_key, + "params": params, + "style": "anthropic", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = 
self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + + + class GeminiOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google AI API key") + model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + url: Optional[str] = Field(default=None, description="Custom API endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_k: Optional[int] = Field(default=None, gt=0) + max_output_tokens: Optional[int] = Field(default=None, gt=0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + + class Gemini(BaseLLM): + def __init__(self, 
**kwargs: Any): + self.options = GeminiOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # Named fields take precedence over anything in the generic params dict. + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + if self.options.top_k is not None: + params["top_k"] = self.options.top_k + if self.options.max_output_tokens is not None: + params["max_output_tokens"] = self.options.max_output_tokens + + config: Dict[str, Any] = { + "url": self.options.url or "https://generativelanguage.googleapis.com/v1beta/models", + "api_key": self.options.api_key, + "params": params, + "style": "gemini", + "input_modalities": self.options.input_modalities or ["text"], + } + + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + + return config + src/agora_agent/agentkit/vendors/mllm.py: 
| + import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, 
description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class XaiRealtimeOptions(XaiGrokOptions): + """Deprecated: use :class:`XaiGrokOptions` instead.""" + + def __init__(self, **data: Any): + warnings.warn( + "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**data) + + + class XaiRealtime(XaiGrok): + """Deprecated: use :class:`XaiGrok` instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "XaiRealtime is deprecated; use XaiGrok instead.", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(**kwargs) + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: 
str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. 
+ params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: 
Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + - id: patch-26706d73 + content_hash: sha256:a9551e0b774b96e7734e9faa7d770611861cf443837428272ef75710447238da + original_commit: 26706d73ae15d860d57daf926837632c01be7f10 + original_message: "feat(agentkit): add GenericAvatar and 
session-aware avatar validation" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/vendors/avatar.py + patch_content: |+ + From 26706d73ae15d860d57daf926837632c01be7f10 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 20:59:22 -0400 + Subject: [PATCH] feat(agentkit): add GenericAvatar and session-aware avatar + validation + + Adds the GenericAvatar vendor wrapper and extends avatar validation + helpers for generic and RTC-backed avatars. Session-derived fields such + as agora_appid, agora_channel, and agora_token can now be validated + after AgentSession enrichment. + --- + src/agora_agent/agentkit/avatar_types.py | 35 +++++++++++++++++- + src/agora_agent/agentkit/vendors/avatar.py | 42 ++++++++++++++++++++++ + 2 files changed, 76 insertions(+), 1 deletion(-) + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index 9e132a9..a04809c 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -17,7 +17,21 @@ def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + -def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + +def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + return config.get("vendor") == "generic" + + + + + +def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + ) + + + + + +def validate_avatar_config( + + config: typing.Dict[str, typing.Any], + + require_session_fields: bool = False, + +) -> None: + """Validates avatar configuration at runtime. 
+ + Parameters + @@ -45,6 +59,8 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + + if require_session_fields and not params.get("agora_token"): + + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + @@ -53,6 +69,23 @@ def validate_avatar_config(config: typing.Dict[str, typing.Any]) -> None: + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + + elif is_generic_avatar(config): + + params = config.get("params", {}) + + if not params.get("api_key"): + + raise ValueError("Generic avatar requires api_key") + + if not params.get("api_base_url"): + + raise ValueError("Generic avatar requires api_base_url") + + if not params.get("avatar_id"): + + raise ValueError("Generic avatar requires avatar_id") + + if not params.get("agora_uid"): + + raise ValueError("Generic avatar requires agora_uid") + + if require_session_fields: + + if not params.get("agora_token"): + + raise ValueError("Generic avatar requires agora_token after session enrichment") + + if not params.get("agora_appid"): + + raise ValueError("Generic avatar requires agora_appid after session enrichment") + + if not params.get("agora_channel"): + + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index b83a356..00cad8f 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -132,6 +132,48 @@ class LiveAvatarAvatar(BaseAvatar): + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + +class GenericAvatarOptions(BaseModel): + + 
model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Generic avatar provider API key") + + api_base_url: str = Field(..., description="Avatar provider API base URL") + + avatar_id: str = Field(..., description="Avatar ID") + + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + +class GenericAvatar(BaseAvatar): + + def __init__(self, **kwargs: Any): + + self.options = GenericAvatarOptions(**kwargs) + + + + @property + + def required_sample_rate(self) -> int: + + return 0 + + + + def to_config(self) -> Dict[str, Any]: + + params: Dict[str, Any] = { + + "api_key": self.options.api_key, + + "api_base_url": self.options.api_base_url, + + "avatar_id": self.options.avatar_id, + + "agora_uid": self.options.agora_uid, + + } + + + + if self.options.agora_appid is not None: + + params["agora_appid"] = self.options.agora_appid + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + + if self.options.agora_channel is not None: + + params["agora_channel"] = self.options.agora_channel + + if self.options.additional_params is not None: + + params = {**self.options.additional_params, **params} + + + + enable = self.options.enable if self.options.enable is not None else True + + return {"enable": enable, "vendor": "generic", "params": params} + + + + + class AnamAvatarOptions(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/avatar_types.py: | + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + ) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. + """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. 
" + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. 
+ """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + AKOOL_SAMPLE_RATE = 16000 + + + class HeyGenAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="HeyGen API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, 
description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific 
parameters") + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + pass + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = 
Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + status: unresolved + - id: patch-9f491c63 + content_hash: sha256:d9811b2c5927be74f2125444dcf36642b88ad7be422019688cb0228093dce1d0 + original_commit: 9f491c63a964c13c67ba4af3708379e1b75a92d8 + original_message: "feat(agentkit): update Agent builder and session lifecycle for v2.7" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + patch_content: |+ + From 9f491c63a964c13c67ba4af3708379e1b75a92d8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 20 May 2026 21:00:58 -0400 + Subject: [PATCH] feat(agentkit): update Agent builder and session lifecycle + for v2.7 + + Aligns Agent and AgentSession with the generated v2.7 request shape. + MLLM sessions no longer require TTS, LLM, or STT, and enabled avatars + are rejected when MLLM is configured. AgentSession now enriches generic + and RTC avatars with session context, auto-generates avatar tokens, + validates TTS sample rates from vendor-specific fields, and adds + paginated get_turns/get_all_turns helpers with fail-fast pagination + guards. 
+ --- + src/agora_agent/agentkit/agent.py | 164 +++++++++++++-- + src/agora_agent/agentkit/agent_session.py | 231 ++++++++++++++++++++-- + 2 files changed, 360 insertions(+), 35 deletions(-) + + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 70a1bdd..86a958e 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -8,6 +8,24 @@ if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + +from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + +from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + +from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + +from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + +from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + +from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + +from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + +from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + +from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + +from ..agents.types.get_agents_response import GetAgentsResponse + +from ..agents.types.list_agents_response import ListAgentsResponse + +from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + +from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + +from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + +from 
..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + +from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + +from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + @@ -46,10 +64,21 @@ from ..agents.types.start_agents_request_properties_filler_words_trigger import + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + +LlmConfig = StartAgentsRequestPropertiesLlm + +LlmStyle = StartAgentsRequestPropertiesLlmStyle + +SttConfig = StartAgentsRequestPropertiesAsr + +SttVendor = StartAgentsRequestPropertiesAsrVendor + +TtsConfig = Tts + +MllmConfig = StartAgentsRequestPropertiesMllm + +MllmVendor = 
StartAgentsRequestPropertiesMllmVendor + +AvatarConfig = StartAgentsRequestPropertiesAvatar + +AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + @@ -93,6 +122,18 @@ InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + +AgentConfig = StartAgentsRequestProperties + +AgentConfigUpdate = UpdateAgentsRequestProperties + +SessionInfo = GetAgentsResponse + +SessionListResponse = ListAgentsResponse + +SessionSummary = ListAgentsResponseDataListItem + +ConversationHistory = GetHistoryAgentsResponse + +ConversationTurn = GetHistoryAgentsResponseContentsItem + +ConversationRole = GetHistoryAgentsResponseContentsItemRole + +ConversationTurns = GetTurnsAgentsResponse + +ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + +SpeakPriority = SpeakAgentsRequestPriority + +Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + @@ -116,6 +157,7 @@ FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + +FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + @@ -183,9 +225,20 @@ class Agent: + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + + sample_rate = vendor.sample_rate + + if ( + + self._avatar_required_sample_rate not in (None, 0) + + and sample_rate is not None + 
+ and sample_rate != self._avatar_required_sample_rate + + ): + + raise ValueError( + + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + + f"but TTS is configured with {sample_rate} Hz. " + + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + - new_agent._tts_sample_rate = vendor.sample_rate + + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + @@ -194,6 +247,9 @@ class Agent: + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + + # Note: avatars are not supported with MLLM. The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` so callers can still + + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + @@ -202,7 +258,10 @@ class Agent: + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + - advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + + advanced_features_model = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_mllm": None}, + + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + @@ -214,6 +273,10 @@ class Agent: + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + + # Note: avatars are not supported with MLLM. 
The combination is rejected + + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + + # enabled) so callers may still combine the two for testing or for the + + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + @@ -282,7 +345,10 @@ class Agent: + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + - new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + + new_agent._advanced_features = self._copy_model_update( + + new_agent._advanced_features, + + {"enable_tools": enabled}, + + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + @@ -294,6 +360,23 @@ class Agent: + new_agent._parameters = parameters + return new_agent + + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + + """Returns a new Agent with the specified RTC audio scenario.""" + + new_agent = self._clone() + + if new_agent._parameters is None: + + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + + elif isinstance(new_agent._parameters, dict): + + new_agent._parameters = typing.cast( + + SessionParamsInput, + + {**new_agent._parameters, "audio_scenario": audio_scenario}, + + ) + + else: + + new_agent._parameters = self._copy_model_update( + + new_agent._parameters, + + {"audio_scenario": audio_scenario}, + + ) + + return new_agent + + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. 
+ + @@ -342,6 +425,33 @@ class Agent: + new_agent._filler_words = filler_words + return new_agent + + + @staticmethod + + def _field_value(value: typing.Any, field: str) -> typing.Any: + + if value is None: + + return None + + if isinstance(value, dict): + + return value.get(field) + + return getattr(value, field, None) + + + + @staticmethod + + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + + if hasattr(value, "model_copy"): + + return value.model_copy(update=update) + + if hasattr(value, "copy"): + + return value.copy(update=update) + + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + + data_channel = self._field_value(self._parameters, "data_channel") + + if not enable_rtm or data_channel is not None: + + return self._parameters + + if self._parameters is None: + + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + + if isinstance(self._parameters, dict): + + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + + @property + def name(self) -> typing.Optional[str]: + return self._name + @@ -354,6 +464,10 @@ class Agent: + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + + @property + + def tts_sample_rate(self) -> typing.Optional[int]: + + return self._tts_sample_rate + + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + @@ -536,6 +650,20 @@ class Agent: + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + + # Validate the MLLM + enabled-avatar combination BEFORE generating the + + # RTC token so callers 
get a clear, actionable error first (matches the + + # TypeScript and Go SDKs' fail-fast contract). + + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + + avatar_enabled = ( + + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + + ) + + if is_mllm_mode and avatar_enabled: + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + @@ -553,9 +681,6 @@ class Agent: + **token_kwargs, + ) + + - mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + - is_mllm_mode = bool(mllm_flag or self._mllm is not None) + - + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + @@ -579,11 +704,12 @@ class Agent: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + - if self._parameters is not None: + - if isinstance(self._parameters, dict): + - base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + + parameters = self._resolved_parameters() + + if parameters is not None: + + if isinstance(parameters, dict): + + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + - base_kwargs["parameters"] = self._parameters + + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + @@ -596,12 +722,10 @@ class Agent: + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + - if self._greeting: + + if self._greeting is not None: + 
mllm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + - if self._max_history is not None: + - mllm_config.setdefault("max_history", self._max_history) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + @@ -617,14 +741,14 @@ class Agent: + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + - if self._instructions: + + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + - if self._greeting: + - llm_config.setdefault("greeting_message", self._greeting) + - if self._failure_message: + - llm_config.setdefault("failure_message", self._failure_message) + + if self._greeting is not None: + + llm_config["greeting_message"] = self._greeting + + if self._failure_message is not None: + + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + - llm_config.setdefault("max_history", self._max_history) + + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index 2408659..e41a399 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -14,13 +14,16 @@ from ..agent_management.types.agent_think_agent_management_request_on_thinking_a + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + +from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent 
import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -182,17 +185,29 @@ class _AgentSessionBase: + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + + if self._is_mllm_mode(): + + raise ValueError( + + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + - sample_rate = tts_params.get("sample_rate") if isinstance(tts_params, dict) else None + + sample_rate = self._agent.tts_sample_rate + + if sample_rate is None and isinstance(tts_params, dict): + + sample_rate = ( + + tts_params.get("sample_rate") + + or tts_params.get("sample_rate_hertz") + + or tts_params.get("samplingRate") + + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + @@ -211,6 +226,50 @@ class _AgentSessionBase: + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + + avatar = properties.get("avatar") + + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + + return + + + + params = avatar.get("params") + + if not isinstance(params, dict): + + params = {} + + avatar["params"] = params + + + + if is_generic_avatar(avatar): + + if not params.get("agora_appid"): + + params["agora_appid"] = self._app_id + + if not params.get("agora_channel"): + + params["agora_channel"] = self._channel + + + + if not is_rtc_avatar(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_token"): + + if not self._app_certificate: + + raise ValueError( + + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + + ) + + token_kwargs: typing.Dict[str, typing.Any] = {} + + if self._expires_in is not None: + + token_kwargs["token_expire"] = self._expires_in + + params["agora_token"] = generate_convo_ai_token( + + app_id=self._app_id, + + app_certificate=self._app_certificate, + + channel_name=self._channel, + + account=str(params["agora_uid"]), + + **token_kwargs, + + ) + + + + if str(params.get("agora_uid")) == self._agent_uid: + + self._warn( + + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ + ) + + + + validate_avatar_config(avatar, require_session_fields=True) + + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + @@ -238,12 +297,17 @@ class _AgentSessionBase: + **token_opts, + ) + properties = self._dump_model(base_properties) + + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + - mllm = dict(self._agent.mllm) + - if self._agent.greeting: + + mllm = self._dump_model(self._agent.mllm) + + if not isinstance(mllm, dict): + + mllm = {} + + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + + if self._agent.failure_message is not None: + + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + @@ -251,20 +315,41 @@ class _AgentSessionBase: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + - if self._agent.instructions: + + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + - if self._agent.greeting: + - llm.setdefault("greeting_message", self._agent.greeting) + - if self._agent.failure_message: + - llm.setdefault("failure_message", self._agent.failure_message) + + if self._agent.greeting is not None: + + llm["greeting_message"] = self._agent.greeting + + if self._agent.failure_message is not None: + + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + - llm.setdefault("max_history", self._agent.max_history) + + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + + @staticmethod + + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + + if pagination is None: + + return None + + if 
isinstance(pagination, dict): + + return pagination.get(field) + + return getattr(pagination, field, None) + + + + @staticmethod + + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + + return list(turns or []) + + + + @classmethod + + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + + data = cls._dump_model(first_response) + + if not isinstance(data, dict): + + data = {} + + data["turns"] = turns + + return GetTurnsAgentsResponse(**data) + + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + @@ -484,7 +569,12 @@ class AgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -547,15 +637,68 @@ class AgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - def get_turns(self) -> typing.Any: + + def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." + + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + @@ -734,7 +877,12 @@ class AsyncAgentSession(_AgentSessionBase): + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + - """Inject a custom text instruction into the current session pipeline.""" + + """Inject a custom text instruction into the current session pipeline. + + + + In API v2.7, omitting ``on_listening_action`` uses the server default + + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + + preserve the pre-v2.7 behavior. 
+ + """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + @@ -797,11 +945,64 @@ class AsyncAgentSession(_AgentSessionBase): + self._app_id, self._agent_id, request_options=self._request_options() + ) + + - async def get_turns(self) -> typing.Any: + + async def get_turns( + + self, + + *, + + page_index: typing.Optional[int] = None, + + page_size: typing.Optional[int] = None, + + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + - self._app_id, self._agent_id, request_options=self._request_options() + + self._app_id, + + self._agent_id, + + page_index=page_index, + + page_size=page_size, + + request_options=self._request_options(), + ) + + + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + + """Get all turn analytics pages for this session. + + + + Raises ``RuntimeError`` if the server's pagination metadata is missing + + the fields required to advance, or if requesting the next page returns + + a page index that did not advance. + + """ + + response = await self.get_turns(page_index=1, page_size=page_size) + + all_turns = self._response_turns(response) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + current_page = self._page_value(pagination, "page_index") or 1 + + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + + total_pages = self._page_value(pagination, "total_pages") + + returned_index = self._page_value(pagination, "page_index") + + if returned_index is None and total_pages is None: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + if total_pages is not None and current_page >= total_pages: + + break + + next_page = current_page + 1 + + response = await self.get_turns(page_index=next_page, page_size=page_size) + + all_turns.extend(self._response_turns(response)) + + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + + returned_index = self._page_value(pagination, "page_index") if pagination else None + + if returned_index is not None: + + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + + raise RuntimeError( + + f"get_all_turns pagination did not advance: requested page {next_page}, " + + f"received page {returned_index}." + + ) + + current_page = returned_index + + else: + + total_pages = self._page_value(pagination, "total_pages") if pagination else None + + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + + if total_pages is None and is_last_page is not True: + + raise RuntimeError( + + "get_all_turns pagination cannot continue: response must include " + + "page_index, total_pages, or is_last_page=true." 
+ + ) + + current_page = next_page + + return self._with_all_turns(response, all_turns) + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from 
..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from 
..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from .token import generate_convo_ai_token, _validate_expires_in + from .vendors.base import BaseAvatar, BaseLLM, 
BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and 
.end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + 
silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... 
) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and 
sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. + new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. 
The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." + ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
+ """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + account=agent_uid, + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + base_kwargs["filler_words"] = 
self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + 
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. + """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + 
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_rtc_avatar(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history 
is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + status: unresolved + - id: patch-eaec58eb + content_hash: sha256:8390ced175326080fc76021a97d315e71229bbc9ad70eef35a63eb9968df7830 + original_commit: eaec58eb2edfe03b1311a32dd137a867edf5d096 + original_message: "refactor(agentkit): align deprecated vendor aliases with canonical names" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/vendors/__init__.py + - src/agora_agent/agentkit/vendors/avatar.py + - src/agora_agent/agentkit/vendors/mllm.py + patch_content: |+ + diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py + index 689eab1..8e2042e 100644 + --- a/src/agora_agent/agentkit/vendors/__init__.py + +++ b/src/agora_agent/agentkit/vendors/__init__.py + @@ -13,7 +13,7 @@ from .base import ( + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + -from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok, XaiRealtime + +from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + @@ -83,7 +83,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py + index 00cad8f..50bdd08 100644 + --- a/src/agora_agent/agentkit/vendors/avatar.py + +++ b/src/agora_agent/agentkit/vendors/avatar.py + @@ -5,19 +5,19 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + -HEYGEN_SAMPLE_RATE = 24000 + LIVEAVATAR_SAMPLE_RATE = 24000 + +HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + -class HeyGenAvatarOptions(BaseModel): + +class 
LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + - api_key: str = Field(..., description="HeyGen API key") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + - avatar_id: Optional[str] = Field(default=None, description="HeyGen avatar ID") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] = Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + @@ -31,20 +31,14 @@ class HeyGenAvatarOptions(BaseModel): + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + -class HeyGenAvatar(BaseAvatar): + - """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + +class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - warnings.warn( + - "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - self.options = HeyGenAvatarOptions(**kwargs) + + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return HEYGEN_SAMPLE_RATE + + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + @@ -65,71 +59,79 @@ class HeyGenAvatar(BaseAvatar): + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "heygen", "params": params} + + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + -class AkoolAvatarOptions(BaseModel): + - model_config = ConfigDict(extra="forbid") + +class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + - api_key: str = Field(..., description="Akool API key") + - avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + - enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + - additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + -class AkoolAvatar(BaseAvatar): + +class HeyGenAvatar(BaseAvatar): + + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + + def __init__(self, **kwargs: Any): + - self.options = AkoolAvatarOptions(**kwargs) + + warnings.warn( + + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + + DeprecationWarning, + + stacklevel=2, + + ) + + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return AKOOL_SAMPLE_RATE + + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + + "quality": self.options.quality, + + "agora_uid": self.options.agora_uid, + } + + + if self.options.agora_token is not None: + + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + + if self.options.disable_idle_timeout is not None: + + params["disable_idle_timeout"] = self.options.disable_idle_timeout + + if self.options.activity_idle_timeout is not None: + + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "akool", "params": params} + + return {"enable": enable, "vendor": "heygen", "params": params} + + + -class LiveAvatarAvatarOptions(HeyGenAvatarOptions): + - pass + +class AkoolAvatarOptions(BaseModel): + + model_config = ConfigDict(extra="forbid") + + + + api_key: str = Field(..., description="Akool API key") + + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + -class LiveAvatarAvatar(BaseAvatar): + +class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + - 
self.options = LiveAvatarAvatarOptions(**kwargs) + + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + - return LIVEAVATAR_SAMPLE_RATE + + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + - "quality": self.options.quality, + - "agora_uid": self.options.agora_uid, + } + + - if self.options.agora_token is not None: + - params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + - if self.options.disable_idle_timeout is not None: + - params["disable_idle_timeout"] = self.options.disable_idle_timeout + - if self.options.activity_idle_timeout is not None: + - params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + - return {"enable": enable, "vendor": "liveavatar", "params": params} + + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + @@ -145,6 +147,7 @@ class GenericAvatarOptions(BaseModel): + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + @@ -178,10 +181,11 @@ class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + - persona_id: Optional[str] = Field(default=None, description="Persona ID") + + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, 
description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py + index cd6cd07..b58f040 100644 + --- a/src/agora_agent/agentkit/vendors/mllm.py + +++ b/src/agora_agent/agentkit/vendors/mllm.py + @@ -1,4 +1,3 @@ + -import warnings + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + @@ -119,30 +118,6 @@ class XaiGrok(BaseMLLM): + return config + + + -class XaiRealtimeOptions(XaiGrokOptions): + - """Deprecated: use :class:`XaiGrokOptions` instead.""" + - + - def __init__(self, **data: Any): + - warnings.warn( + - "XaiRealtimeOptions is deprecated; use XaiGrokOptions instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**data) + - + - + -class XaiRealtime(XaiGrok): + - """Deprecated: use :class:`XaiGrok` instead.""" + - + - def __init__(self, **kwargs: Any): + - warnings.warn( + - "XaiRealtime is deprecated; use XaiGrok instead.", + - DeprecationWarning, + - stacklevel=2, + - ) + - super().__init__(**kwargs) + - + - + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + theirs_snapshot: + src/agora_agent/agentkit/vendors/__init__.py: | + from .base import ( + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + ElevenLabsSampleRate, + GoogleTTSSampleRate, + MicrosoftSampleRate, + OpenAISampleRate, + SampleRate, + ) + from .avatar import AkoolAvatar, AnamAvatar, GenericAvatar, HeyGenAvatar, LiveAvatarAvatar + from .llm import Anthropic, AzureOpenAI, Gemini, OpenAI + from .mllm import GeminiLive, OpenAIRealtime, VertexAI, XaiGrok + from .stt import ( + AmazonSTT, + AresSTT, + AssemblyAISTT, + DeepgramSTT, + GoogleSTT, + 
MicrosoftSTT, + OpenAISTT, + SarvamSTT, + SpeechmaticsSTT, + ) + from .tts import ( + AmazonTTS, + CartesiaTTS, + DeepgramTTS, + ElevenLabsTTS, + FishAudioTTS, + GoogleTTS, + HumeAITTS, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAITTS, + RimeTTS, + SarvamTTS, + ) + + __all__ = [ + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "GoogleTTSSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + ] + src/agora_agent/agentkit/vendors/avatar.py: | + import warnings + from typing import Any, Dict, Optional + + from pydantic import BaseModel, ConfigDict, Field, field_validator + + from .base import BaseAvatar + + LIVEAVATAR_SAMPLE_RATE = 24000 + HEYGEN_SAMPLE_RATE = LIVEAVATAR_SAMPLE_RATE + AKOOL_SAMPLE_RATE = 16000 + + + class LiveAvatarAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="LiveAvatar API key") + quality: str = Field(..., description="Avatar quality: low, medium, or high") + agora_uid: str = Field(..., description="Agora UID for the avatar stream") + agora_token: Optional[str] = Field(default=None, description="RTC token for avatar authentication") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + disable_idle_timeout: Optional[bool] 
= Field(default=None, description="Whether to disable idle timeout") + activity_idle_timeout: Optional[int] = Field(default=None, description="Idle timeout in seconds") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + @field_validator("quality") + @classmethod + def validate_quality(cls, v: str) -> str: + valid = ("low", "medium", "high") + if v not in valid: + raise ValueError(f"Invalid quality '{v}'. Must be one of: {', '.join(valid)}") + return v + + + class LiveAvatarAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = LiveAvatarAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return LIVEAVATAR_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "liveavatar", "params": params} + + + class HeyGenAvatarOptions(LiveAvatarAvatarOptions): + """Deprecated: use :class:`LiveAvatarAvatarOptions` instead.""" + + + class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. 
Use LiveAvatarAvatar instead.""" + + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) + self.options = HeyGenAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return HEYGEN_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "quality": self.options.quality, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.avatar_id is not None: + params["avatar_id"] = self.options.avatar_id + if self.options.disable_idle_timeout is not None: + params["disable_idle_timeout"] = self.options.disable_idle_timeout + if self.options.activity_idle_timeout is not None: + params["activity_idle_timeout"] = self.options.activity_idle_timeout + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "heygen", "params": params} + + + class AkoolAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Akool API key") + avatar_id: Optional[str] = Field(default=None, description="Avatar ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AkoolAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AkoolAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return AKOOL_SAMPLE_RATE + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.avatar_id is not None: + params["avatar_id"] = 
self.options.avatar_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "akool", "params": params} + + + class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = 
{**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + + class AnamAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Anam API key") + persona_id: Optional[str] = Field(default=None, description="Anam persona ID") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + + class AnamAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = AnamAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + } + + if self.options.persona_id is not None: + params["persona_id"] = self.options.persona_id + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "anam", "params": params} + src/agora_agent/agentkit/vendors/mllm.py: | + from typing import Any, Dict, List, Optional + + from pydantic import BaseModel, ConfigDict, Field + + from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, + ) + from .base import BaseMLLM + + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + + + class OpenAIRealtimeOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="OpenAI API key") + model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + url: Optional[str] = Field(default=None, description="WebSocket URL") + greeting_message: 
Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class OpenAIRealtime(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = OpenAIRealtimeOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + config: Dict[str, Any] = { + "vendor": "openai", + "api_key": self.options.api_key, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.model is not None: + params = {"model": self.options.model} + if self.options.params is not None: + params.update(self.options.params) + config["params"] = params + elif self.options.params is not None: + config["params"] = self.options.params + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + # xAI MLLM: use XaiGrok (product name, mllm.vendor "xai"). 
Do not use XaiRealtime—that name + # is deprecated and reserved naming for future XaiSTT / XaiTTS cascading vendors. + + + class XaiGrokOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="xAI API key") + url: str = Field(default="wss://api.x.ai/v1/realtime", description="xAI Realtime WebSocket URL") + voice: Optional[str] = Field(default=None, description="Voice identifier (e.g., eve or rex)") + language: Optional[str] = Field(default=None, description="Language code (e.g., en)") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional xAI parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + + class XaiGrok(BaseMLLM): + """xAI Grok MLLM vendor (`mllm.vendor`: ``xai``).""" + + def __init__(self, **kwargs: Any): + self.options = XaiGrokOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.language is not None: + params["language"] = self.options.language + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + config: Dict[str, Any] = { + "vendor": "xai", + "api_key": self.options.api_key, + "url": self.options.url, + "params": params, + } + + if 
self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class VertexAIOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + model: str = Field(..., description="Model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud location/region") + adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class 
VertexAI(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = VertexAIOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + # additional_params spread first so that explicit fields always win, + # matching the TypeScript SDK. + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["model"] = self.options.model + params["project_id"] = self.options.project_id + params["location"] = self.options.location + params["adc_credentials_string"] = self.options.adc_credentials_string + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "vertexai", + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + + + class GeminiLiveOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Google API key") + model: str = Field(..., description="Gemini Live model name") + url: Optional[str] = Field(default=None, description="WebSocket URL") + instructions: Optional[str] = Field(default=None, description="System instructions") + voice: Optional[str] = Field(default=None, description="Voice name") + greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") + input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") + output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") + messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") + failure_message: Optional[str] = Field(default=None, description="Message played on failure") + + class GeminiLive(BaseMLLM): + def __init__(self, **kwargs: Any): + self.options = GeminiLiveOptions(**kwargs) + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if self.options.additional_params is not None: + params.update(self.options.additional_params) + params["model"] = self.options.model + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.voice is not None: + params["voice"] = self.options.voice + + config: Dict[str, Any] = { + "vendor": "gemini", + "api_key": self.options.api_key, + "params": params, + } + + if self.options.url is not None: + config["url"] = self.options.url + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.input_modalities is not None: + config["input_modalities"] = self.options.input_modalities + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.messages is not None: + config["messages"] = self.options.messages + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection + + return config + status: unresolved + 
- id: patch-20245632 + content_hash: sha256:a22e4a3b114ba8105c8129ccd6222570dc1f231daf9ac6037a00bcd4e11c425b + original_commit: 20245632afd066efe5a453665b29c5ba0e13e4f8 + original_message: "feat(agentkit): export type aliases and avatar token helpers" + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - src/agora_agent/agentkit/__init__.py + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - src/agora_agent/agentkit/avatar_types.py + - src/agora_agent/agentkit/constants.py + patch_content: |+ + From 20245632afd066efe5a453665b29c5ba0e13e4f8 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 15:17:27 -0400 + Subject: [PATCH] feat(agentkit): export type aliases and avatar token helpers + + --- + src/agora_agent/agentkit/__init__.py | 49 ++++++++++++++++------- + src/agora_agent/agentkit/agent.py | 22 +++++++++- + src/agora_agent/agentkit/agent_session.py | 8 +++- + src/agora_agent/agentkit/avatar_types.py | 23 +++++++++-- + src/agora_agent/agentkit/constants.py | 10 +++++ + 5 files changed, 90 insertions(+), 22 deletions(-) + + diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py + index 5ceda66..e9ab221 100644 + --- a/src/agora_agent/agentkit/__init__.py + +++ b/src/agora_agent/agentkit/__init__.py + @@ -2,6 +2,7 @@ from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + @@ -62,23 +63,23 @@ from .agent import ( + SessionListResponse, + SessionSummary, + SpeakPriority, + + ThinkOnListeningAction, + + ThinkOnSpeakingAction, + + ThinkOnThinkingAction, + + ThinkResponse, + ) + -from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + -from ..agent_management.types.agent_think_agent_management_response import ( + - AgentThinkAgentManagementResponse as AgentThinkResponse, + -) + -from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + - AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + - AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + -) + -from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + - AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + +# Deprecated think type aliases (prefer ThinkOn* names). + +from .agent import ( + + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + + ThinkResponse as AgentThinkResponse, + ) + +from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + @@ -94,6 +95,13 @@ from .constants import ( + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + + ThinkOnListeningActionIgnore, + + ThinkOnListeningActionInject, + + ThinkOnListeningActionInterrupt, + + ThinkOnSpeakingActionIgnore, + + ThinkOnSpeakingActionInterrupt, + + ThinkOnThinkingActionIgnore, + + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + @@ -158,7 +166,6 @@ from .vendors import ( + SpeechmaticsSTT, + VertexAI, + XaiGrok, + - XaiRealtime, + LiveAvatarAvatar, + ) + + @@ -172,6 +179,7 @@ __all__ = [ + "LlmConfig", + "LlmStyle", + "SttConfig", + + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + @@ -230,6 +238,13 @@ __all__ = [ + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + + "ThinkOnListeningActionInject", + + 
"ThinkOnListeningActionInterrupt", + + "ThinkOnListeningActionIgnore", + + "ThinkOnThinkingActionInterrupt", + + "ThinkOnThinkingActionIgnore", + + "ThinkOnSpeakingActionInterrupt", + + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + @@ -246,10 +261,16 @@ __all__ = [ + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + + "ThinkResponse", + + "ThinkOnListeningAction", + + "ThinkOnThinkingAction", + + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + + "is_avatar_token_managed", + + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + @@ -303,7 +324,6 @@ __all__ = [ + "GeminiLive", + "VertexAI", + "XaiGrok", + - "XaiRealtime", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + @@ -314,7 +334,6 @@ __all__ = [ + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + - "is_rtc_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index 86a958e..14933a2 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -66,13 +66,25 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + -from .token import generate_convo_ai_token, _validate_expires_in + +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + + AgentThinkAgentManagementRequestOnListeningAction, + +) + 
+from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + + AgentThinkAgentManagementRequestOnThinkingAction, + +) + +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + + AgentThinkAgentManagementRequestOnSpeakingAction, + +) + +from ..agent_management.types.agent_think_agent_management_response import ( + + AgentThinkAgentManagementResponse, + +) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + +AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + @@ -159,6 +171,14 @@ FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + +# Think type aliases and response + +ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + +ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + +ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + +ThinkResponse = AgentThinkAgentManagementResponse + + + +from .token import generate_convo_ai_token, _validate_expires_in + + + + class Agent: + """A reusable agent definition. 
+ diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index e41a399..269619e 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -20,10 +20,10 @@ from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + - is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + @@ -242,7 +242,11 @@ class _AgentSessionBase: + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + - if not is_rtc_avatar(avatar): + + if not is_avatar_token_managed(avatar): + + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + + return + + + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + diff --git a/src/agora_agent/agentkit/avatar_types.py b/src/agora_agent/agentkit/avatar_types.py + index a04809c..aea9da1 100644 + --- a/src/agora_agent/agentkit/avatar_types.py + +++ b/src/agora_agent/agentkit/avatar_types.py + @@ -1,3 +1,4 @@ + +import warnings + import typing + + + @@ -21,11 +22,25 @@ def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + +def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + + """Return True when AgentKit manages the avatar RTC publisher identity.""" + + return ( + + is_heygen_avatar(config) + + or is_live_avatar_avatar(config) + + or is_generic_avatar(config) + + ) + + + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + - params = config.get("params", {}) + - return isinstance(params, dict) and bool(params.get("agora_uid")) and ( + - is_heygen_avatar(config) or is_live_avatar_avatar(config) or is_generic_avatar(config) + + """Deprecated: use :func:`is_avatar_token_managed` for 
vendor gating.""" + + warnings.warn( + + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + + "and keep agora_uid checks in session enrichment.", + + DeprecationWarning, + + stacklevel=2, + ) + + params = config.get("params", {}) + + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + @@ -95,7 +110,7 @@ def validate_tts_sample_rate( + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - - HeyGen: ONLY supports 24,000 Hz + + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py + index f86e4d3..c0a852e 100644 + --- a/src/agora_agent/agentkit/constants.py + +++ b/src/agora_agent/agentkit/constants.py + @@ -58,3 +58,13 @@ class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + + + +# Think action value constants (match Fern wire values) + +ThinkOnListeningActionInject = "inject" + +ThinkOnListeningActionInterrupt = "interrupt" + +ThinkOnListeningActionIgnore = "ignore" + +ThinkOnThinkingActionInterrupt = "interrupt" + +ThinkOnThinkingActionIgnore = "ignore" + +ThinkOnSpeakingActionInterrupt = "interrupt" + +ThinkOnSpeakingActionIgnore = "ignore" + -- + 2.52.0 + + theirs_snapshot: + src/agora_agent/agentkit/__init__.py: | + from .agent import ( + Agent, + AgentConfig, + AgentConfigUpdate, + AsrConfig, + ConversationHistory, + ConversationRole, + ConversationSessionTurn, + ConversationTurn, + ConversationTurns, + StartAgentsRequestProperties, + AvatarConfig, + AvatarVendor, + GeofenceConfig, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + RtcConfig, + SttConfig, + SttVendor, + TtsConfig, + FillerWordsConfig, + FillerWordsTrigger, + 
FillerWordsTriggerFixedTimeConfig, + FillerWordsContent, + FillerWordsContentStaticConfig, + FillerWordsContentSelectionRule, + TurnDetectionConfig, + TurnDetectionNestedConfig, + StartOfSpeechConfig, + StartOfSpeechMode, + StartOfSpeechVadConfig, + StartOfSpeechKeywordsConfig, + StartOfSpeechDisabledConfig, + StartOfSpeechDisabledConfigStrategy, + EndOfSpeechConfig, + EndOfSpeechMode, + EndOfSpeechVadConfig, + EndOfSpeechSemanticConfig, + TurnDetectionType, + InterruptMode, + Eagerness, + SalConfig, + SalMode, + AdvancedFeatures, + SessionParams, + SessionParamsInput, + SilenceConfig, + SilenceAction, + FarewellConfig, + ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, + Labels, + LlmGreetingConfigs, + LlmGreetingConfigsMode, + McpServersItem, + SessionInfo, + SessionListResponse, + SessionSummary, + SpeakPriority, + ThinkOnListeningAction, + ThinkOnSpeakingAction, + ThinkOnThinkingAction, + ThinkResponse, + ) + # Deprecated think type aliases (prefer ThinkOn* names). 
+ from .agent import ( + ThinkOnListeningAction as AgentThinkRequestOnListeningAction, + ThinkOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ThinkOnThinkingAction as AgentThinkRequestOnThinkingAction, + ThinkResponse as AgentThinkResponse, + ) + from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .constants import ( + DataChannel, + AudioScenario, + SilenceActionValues, + SalModeValues, + GeofenceArea, + GeofenceExcludeArea, + FillerWordsSelectionRule, + ThinkOnListeningActionIgnore, + ThinkOnListeningActionInject, + ThinkOnListeningActionInterrupt, + ThinkOnSpeakingActionIgnore, + ThinkOnSpeakingActionInterrupt, + ThinkOnThinkingActionIgnore, + ThinkOnThinkingActionInterrupt, + TurnDetectionTypeValues, + ) + from .token import ( + GenerateConvoAITokenOptions, + GenerateTokenOptions, + MAX_EXPIRY_SECONDS, + generate_convo_ai_token, + generate_rtc_token, + expires_in_hours, + expires_in_minutes, + ) + from .presets import ( + AgentPresets, + DeepgramPresetModels, + MiniMaxPresetModels, + OpenAIPresetModels, + OpenAITtsPresetModels, + normalize_preset_input, + ) + from .vendors import ( + AkoolAvatar, + AmazonSTT, + AmazonTTS, + AnamAvatar, + Anthropic, + AresSTT, + AssemblyAISTT, + AzureOpenAI, + BaseAvatar, + BaseLLM, + BaseMLLM, + BaseSTT, + BaseTTS, + CartesiaSampleRate, + CartesiaTTS, + DeepgramSTT, + DeepgramTTS, + ElevenLabsSampleRate, + ElevenLabsTTS, + FishAudioTTS, + Gemini, + GeminiLive, + GenericAvatar, + GoogleSTT, + GoogleTTS, + HeyGenAvatar, + HumeAITTS, + MicrosoftSampleRate, + MicrosoftSTT, + MicrosoftTTS, + MiniMaxTTS, + MurfTTS, + OpenAI, + OpenAIRealtime, + OpenAISampleRate, + OpenAISTT, + OpenAITTS, + RimeTTS, + SampleRate, + SarvamSTT, + SarvamTTS, + SpeechmaticsSTT, + 
VertexAI, + XaiGrok, + LiveAvatarAvatar, + ) + + __all__ = [ + "Agent", + "AgentConfig", + "AgentConfigUpdate", + # Return type of Agent.to_properties() + "StartAgentsRequestProperties", + # Top-level config types + "LlmConfig", + "LlmStyle", + "SttConfig", + "AsrConfig", + "SttVendor", + "TtsConfig", + "MllmConfig", + "MllmVendor", + "AvatarConfig", + "AvatarVendor", + "GeofenceConfig", + "RtcConfig", + "FillerWordsConfig", + "FillerWordsTrigger", + "FillerWordsTriggerFixedTimeConfig", + "FillerWordsContent", + "FillerWordsContentStaticConfig", + "FillerWordsContentSelectionRule", + # Turn detection types + "TurnDetectionConfig", + "TurnDetectionNestedConfig", + "StartOfSpeechConfig", + "StartOfSpeechMode", + "StartOfSpeechVadConfig", + "StartOfSpeechKeywordsConfig", + "StartOfSpeechDisabledConfig", + "StartOfSpeechDisabledConfigStrategy", + "EndOfSpeechConfig", + "EndOfSpeechMode", + "EndOfSpeechVadConfig", + "EndOfSpeechSemanticConfig", + # Deprecated turn detection types + "TurnDetectionType", + "InterruptMode", + "Eagerness", + # SAL types + "SalConfig", + "SalMode", + # Advanced features + "AdvancedFeatures", + # Session parameters types + "SessionParams", + "SessionParamsInput", + "SilenceConfig", + "SilenceAction", + "FarewellConfig", + "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", + "Labels", + # Type-safe constants + "DataChannel", + "AudioScenario", + "SilenceActionValues", + "SalModeValues", + "GeofenceArea", + "GeofenceExcludeArea", + "FillerWordsSelectionRule", + "TurnDetectionTypeValues", + "ThinkOnListeningActionInject", + "ThinkOnListeningActionInterrupt", + "ThinkOnListeningActionIgnore", + "ThinkOnThinkingActionInterrupt", + "ThinkOnThinkingActionIgnore", + "ThinkOnSpeakingActionInterrupt", + "ThinkOnSpeakingActionIgnore", + # LLM sub-types + "LlmGreetingConfigs", + "LlmGreetingConfigsMode", + "McpServersItem", + "AgentSession", + 
"AsyncAgentSession", + "AgentSessionOptions", + "SessionInfo", + "SessionListResponse", + "SessionSummary", + "ConversationHistory", + "ConversationTurn", + "ConversationRole", + "ConversationTurns", + "ConversationSessionTurn", + "SpeakPriority", + "ThinkResponse", + "ThinkOnListeningAction", + "ThinkOnThinkingAction", + "ThinkOnSpeakingAction", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", + "is_avatar_token_managed", + "is_rtc_avatar", + "AgentPresets", + "DeepgramPresetModels", + "OpenAIPresetModels", + "OpenAITtsPresetModels", + "MiniMaxPresetModels", + "normalize_preset_input", + "generate_rtc_token", + "GenerateTokenOptions", + "generate_convo_ai_token", + "GenerateConvoAITokenOptions", + "MAX_EXPIRY_SECONDS", + "expires_in_hours", + "expires_in_minutes", + "BaseLLM", + "BaseTTS", + "BaseSTT", + "BaseMLLM", + "BaseAvatar", + "SampleRate", + "ElevenLabsSampleRate", + "MicrosoftSampleRate", + "OpenAISampleRate", + "CartesiaSampleRate", + "OpenAI", + "AzureOpenAI", + "Anthropic", + "Gemini", + "ElevenLabsTTS", + "MicrosoftTTS", + "OpenAITTS", + "CartesiaTTS", + "DeepgramTTS", + "GoogleTTS", + "AmazonTTS", + "HumeAITTS", + "RimeTTS", + "FishAudioTTS", + "MiniMaxTTS", + "MurfTTS", + "SarvamTTS", + "SpeechmaticsSTT", + "DeepgramSTT", + "MicrosoftSTT", + "OpenAISTT", + "GoogleSTT", + "AmazonSTT", + "AssemblyAISTT", + "AresSTT", + "SarvamSTT", + "OpenAIRealtime", + "GeminiLive", + "VertexAI", + "XaiGrok", + "HeyGenAvatar", + "LiveAvatarAvatar", + "AkoolAvatar", + "AnamAvatar", + "GenericAvatar", + "is_heygen_avatar", + "is_live_avatar_avatar", + "is_akool_avatar", + "is_anam_avatar", + "is_generic_avatar", + "validate_avatar_config", + "validate_tts_sample_rate", + ] + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, 
AsyncAgentSession + + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection import StartAgentsRequestPropertiesTurnDetection + from 
..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from ..agents.types.start_agents_request_properties_turn_detection_type import 
StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from 
..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from 
..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = 
StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. + Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + 
class SessionParamsInput(typing_extensions.TypedDict, total=False):
    """Plain-dict spelling of the session ``parameters`` block.

    Every key is optional (``total=False``). ``Agent`` accepts this mapping
    wherever it accepts ``SessionParams``; ``Agent.to_properties`` expands a
    dict of this shape into ``StartAgentsRequestPropertiesParameters``.
    """

    # Nested configuration objects, spelled with the module-level aliases
    # (each alias is the generated type of the same name).
    silence_config: SilenceConfig
    farewell_config: FarewellConfig
    data_channel: ParametersDataChannel
    # Simple feature switches.
    enable_metrics: bool
    enable_error_message: bool
    # RTC audio scenario; also settable through ``Agent.with_audio_scenario``.
    audio_scenario: ParametersAudioScenario
+ + Examples + -------- + >>> from agora_agent.agentkit import Agent + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features 
+ self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + "filler_words": self._filler_words, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: 
str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. + + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. 
    def to_properties(
        self,
        channel: str,
        agent_uid: str,
        remote_uids: typing.List[str],
        idle_timeout: typing.Optional[int] = None,
        enable_string_uid: typing.Optional[bool] = None,
        token: typing.Optional[str] = None,
        app_id: typing.Optional[str] = None,
        app_certificate: typing.Optional[str] = None,
        expires_in: typing.Optional[int] = None,
        skip_vendor_validation: bool = False,
    ) -> StartAgentsRequestProperties:
        """Assemble the start-request properties for this agent.

        Parameters
        ----------
        channel, agent_uid, remote_uids
            Written to ``channel``, ``agent_rtc_uid`` and ``remote_rtc_uids``.
        idle_timeout, enable_string_uid
            Included only when not ``None``.
        token
            Token to send. When ``None`` one is generated with
            ``generate_convo_ai_token`` from *app_id* and *app_certificate*.
        expires_in
            Only consulted when a token is generated; passed through
            ``_validate_expires_in`` and forwarded as ``token_expire``.
        skip_vendor_validation
            In the non-MLLM path, return before the TTS/LLM presence checks.
            Note that ``llm``, ``tts`` and ``asr`` are then not added to the
            result at all.

        Raises
        ------
        ValueError
            If an MLLM config is present together with an enabled avatar; if
            neither *token* nor both *app_id* and *app_certificate* are
            given; or, in the non-MLLM path without
            *skip_vendor_validation*, if TTS or LLM is missing.
        """
        # Validate the MLLM + enabled-avatar combination BEFORE generating the
        # RTC token so callers get a clear, actionable error first (matches the
        # TypeScript and Go SDKs' fail-fast contract).
        mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True
        # NOTE: because of the ``or``, this is True whenever an MLLM config is
        # present, regardless of the value of its "enable" key.
        is_mllm_mode = bool(mllm_flag or self._mllm is not None)
        # An avatar dict counts as enabled unless "enable" is explicitly False.
        avatar_enabled = (
            isinstance(self._avatar, dict) and self._avatar.get("enable") is not False
        )
        if is_mllm_mode and avatar_enabled:
            raise ValueError(
                "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. "
                "Remove the avatar configuration when using MLLM, or switch to a cascading session."
            )

        if token is None:
            if app_id is None or app_certificate is None:
                raise ValueError("Either token or app_id+app_certificate must be provided")
            validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None
            # Use generate_convo_ai_token (RTC + RTM) so the token works whether or
            # not the caller enables advanced_features.enable_rtm.
            token_kwargs: typing.Dict[str, typing.Any] = {}
            if validated_expires_in is not None:
                token_kwargs["token_expire"] = validated_expires_in
            token = generate_convo_ai_token(
                app_id=app_id,
                app_certificate=app_certificate,
                channel_name=channel,
                account=agent_uid,
                **token_kwargs,
            )

        # Fields that are always present in the request.
        base_kwargs: typing.Dict[str, typing.Any] = {
            "channel": channel,
            "token": token,
            "agent_rtc_uid": agent_uid,
            "remote_rtc_uids": remote_uids,
        }

        # Optional sections are added only when configured, so unset values
        # are omitted rather than sent as None.
        if idle_timeout is not None:
            base_kwargs["idle_timeout"] = idle_timeout
        if enable_string_uid is not None:
            base_kwargs["enable_string_uid"] = enable_string_uid
        if self._mllm is not None:
            base_kwargs["mllm"] = self._mllm
        if self._turn_detection is not None:
            base_kwargs["turn_detection"] = self._turn_detection
        if self._interruption is not None:
            base_kwargs["interruption"] = self._interruption
        if self._sal is not None:
            base_kwargs["sal"] = self._sal
        if self._avatar is not None:
            base_kwargs["avatar"] = self._avatar
        if self._advanced_features is not None:
            base_kwargs["advanced_features"] = self._advanced_features
        # _resolved_parameters() may add data_channel="rtm" when RTM is enabled.
        parameters = self._resolved_parameters()
        if parameters is not None:
            if isinstance(parameters, dict):
                # Dict input is expanded into the generated parameters model.
                base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters)
            else:
                base_kwargs["parameters"] = parameters
        if self._geofence is not None:
            base_kwargs["geofence"] = self._geofence
        if self._labels is not None:
            base_kwargs["labels"] = self._labels
        if self._rtc is not None:
            base_kwargs["rtc"] = self._rtc
        if self._filler_words is not None:
            base_kwargs["filler_words"] = self._filler_words

        if is_mllm_mode:
            if self._mllm is not None:
                # Work on a copy so the stored vendor config is not mutated.
                mllm_config = dict(self._mllm)
                # setdefault: values already in the MLLM config win over the
                # agent-level greeting / failure message.
                if self._greeting is not None:
                    mllm_config.setdefault("greeting_message", self._greeting)
                if self._failure_message is not None:
                    mllm_config.setdefault("failure_message", self._failure_message)
                base_kwargs["mllm"] = mllm_config
            # MLLM path returns here; llm/tts/asr are never added.
            return StartAgentsRequestProperties(**base_kwargs)

        if skip_vendor_validation:
            return StartAgentsRequestProperties(**base_kwargs)

        if self._tts is None:
            raise ValueError("TTS configuration is required. Use with_tts() to set it.")

        if self._llm is None:
            raise ValueError("LLM configuration is required. Use with_llm() to set it.")

        llm_config = dict(self._llm)
        # Agent-level fields take priority over the vendor's defaults.
        # This matches the TS SDK where agent-level values override vendor config.
        if self._instructions is not None:
            llm_config["system_messages"] = [{"role": "system", "content": self._instructions}]
        if self._greeting is not None:
            llm_config["greeting_message"] = self._greeting
        if self._failure_message is not None:
            llm_config["failure_message"] = self._failure_message
        if self._max_history is not None:
            llm_config["max_history"] = self._max_history

        base_kwargs["llm"] = llm_config
        base_kwargs["tts"] = self._tts
        # STT is optional; the request key for it is "asr".
        if self._stt is not None:
            base_kwargs["asr"] = self._stt

        return StartAgentsRequestProperties(**base_kwargs)
new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: 
class AgentSessionOptions(_AgentSessionRequiredOptions, total=False):
    """Configuration options for creating an agent session.

    Required fields
    ---------------
    client, agent, app_id, name, channel, agent_uid, remote_uids

    Optional fields
    ---------------
    app_certificate, token, idle_timeout, enable_string_uid, preset,
    pipeline_id, expires_in, debug, warn
    """

    app_certificate: str
    token: str
    idle_timeout: int
    enable_string_uid: bool
    # Either a single preset name or a sequence of them.
    preset: typing.Union[str, typing.Sequence[str]]
    pipeline_id: str
    expires_in: int
    debug: bool
    # Callback that receives warning messages; the session constructor falls
    # back to ``warnings.warn`` when none is supplied.
    warn: typing.Callable[[str], None]
self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. + """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). 
+ """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + account=self._agent_uid, + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." + ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. 
Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." + ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." 
+ ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + account=str(params["agora_uid"]), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." + ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return 
properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. 
+ ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn 
analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." 
+ ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import AsyncAgora, Area + >>> from agora_agent.agentkit import Agent + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. + """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + ) -> 
GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + page_index=page_index, + page_size=page_size, + request_options=self._request_options(), + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. + """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = response.get("pagination") if isinstance(response, dict) else response.pagination + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + src/agora_agent/agentkit/avatar_types.py: | + import warnings + import typing + + + def is_heygen_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "heygen" + + + def is_live_avatar_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "liveavatar" + + + def is_akool_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "akool" + + + def is_anam_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "anam" + + + def is_generic_avatar(config: typing.Dict[str, typing.Any]) -> bool: + return config.get("vendor") == "generic" + + + def is_avatar_token_managed(config: typing.Dict[str, typing.Any]) -> bool: + """Return True when AgentKit manages the avatar RTC publisher identity.""" + return ( + is_heygen_avatar(config) + or is_live_avatar_avatar(config) + or is_generic_avatar(config) + ) + + + def is_rtc_avatar(config: typing.Dict[str, typing.Any]) -> bool: + """Deprecated: use :func:`is_avatar_token_managed` for vendor gating.""" + warnings.warn( + "is_rtc_avatar is deprecated; use is_avatar_token_managed for vendor gating " + "and keep agora_uid checks in session enrichment.", + DeprecationWarning, + stacklevel=2, + ) + params = config.get("params", {}) + return isinstance(params, dict) and bool(params.get("agora_uid")) and is_avatar_token_managed(config) + + + def validate_avatar_config( + config: typing.Dict[str, typing.Any], + require_session_fields: bool = False, + ) -> None: + """Validates avatar configuration at runtime. + + Parameters + ---------- + config : dict + The avatar configuration dictionary. + + Raises + ------ + ValueError + If the configuration is invalid. 
+ """ + if is_heygen_avatar(config) or is_live_avatar_avatar(config): + label = "HeyGen" if is_heygen_avatar(config) else "LiveAvatar" + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError(f"{label} avatar requires api_key") + if not params.get("quality"): + raise ValueError(f"{label} avatar requires quality (low, medium, or high)") + if not params.get("agora_uid"): + raise ValueError(f"{label} avatar requires agora_uid") + valid_qualities = ("low", "medium", "high") + if params.get("quality") not in valid_qualities: + raise ValueError( + f"Invalid quality for {label}: {params.get('quality')}. " + f"Must be one of: {', '.join(valid_qualities)}" + ) + if require_session_fields and not params.get("agora_token"): + raise ValueError(f"{label} avatar requires agora_token after session enrichment") + elif is_akool_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Akool avatar requires api_key") + elif is_anam_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Anam avatar requires api_key") + elif is_generic_avatar(config): + params = config.get("params", {}) + if not params.get("api_key"): + raise ValueError("Generic avatar requires api_key") + if not params.get("api_base_url"): + raise ValueError("Generic avatar requires api_base_url") + if not params.get("avatar_id"): + raise ValueError("Generic avatar requires avatar_id") + if not params.get("agora_uid"): + raise ValueError("Generic avatar requires agora_uid") + if require_session_fields: + if not params.get("agora_token"): + raise ValueError("Generic avatar requires agora_token after session enrichment") + if not params.get("agora_appid"): + raise ValueError("Generic avatar requires agora_appid after session enrichment") + if not params.get("agora_channel"): + raise ValueError("Generic avatar requires agora_channel after session enrichment") + + + def validate_tts_sample_rate( + 
avatar_config: typing.Dict[str, typing.Any], + tts_sample_rate: int, + ) -> None: + """Validates that TTS sample rate is compatible with the avatar vendor. + + Different avatar vendors have specific sample rate requirements: + - HeyGen/LiveAvatar: ONLY supports 24,000 Hz + - Akool: ONLY supports 16,000 Hz + + Parameters + ---------- + avatar_config : dict + The avatar configuration dictionary. + tts_sample_rate : int + The sample rate from your TTS configuration (in Hz). + + Raises + ------ + ValueError + If TTS sample rate is incompatible with the avatar vendor. + """ + if is_heygen_avatar(avatar_config) or is_live_avatar_avatar(avatar_config): + if tts_sample_rate != 24000: + label = "HeyGen" if is_heygen_avatar(avatar_config) else "LiveAvatar" + raise ValueError( + f"{label} avatars ONLY support 24,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 24kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/overview" + ) + elif is_akool_avatar(avatar_config): + if tts_sample_rate != 16000: + raise ValueError( + f"Akool avatars ONLY support 16,000 Hz sample rate. " + f"Your TTS is configured with {tts_sample_rate} Hz. " + f"Please update your TTS configuration to use 16kHz sample rate. " + f"See: https://docs.agora.io/en/conversational-ai/models/avatar/akool" + ) + src/agora_agent/agentkit/constants.py: | + """ + Type-safe constants for agent configuration values. + Use these instead of raw strings to avoid typos and get IDE autocomplete. 
+ """ + + # Data channel: "rtm" | "datastream" + class DataChannel: + RTM = "rtm" + DATASTREAM = "datastream" + + class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + + + # Silence action when timeout elapses: "speak" | "think" + # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) + class SilenceActionValues: + SPEAK = "speak" + THINK = "think" + + + # SAL mode: "locking" | "recognition" + # (Use for sal.sal_mode — avoids shadowing SalMode type) + class SalModeValues: + LOCKING = "locking" + RECOGNITION = "recognition" + + + # Geofence area: "GLOBAL" | "NORTH_AMERICA" | "EUROPE" | "ASIA" | "INDIA" | "JAPAN" + class GeofenceArea: + GLOBAL = "GLOBAL" + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Geofence exclude area (when area is GLOBAL) + class GeofenceExcludeArea: + NORTH_AMERICA = "NORTH_AMERICA" + EUROPE = "EUROPE" + ASIA = "ASIA" + INDIA = "INDIA" + JAPAN = "JAPAN" + + + # Filler word selection rule: "shuffle" | "round_robin" + class FillerWordsSelectionRule: + SHUFFLE = "shuffle" + ROUND_ROBIN = "round_robin" + + + # Turn detection type (deprecated; use TurnDetectionNestedConfig.EndOfSpeech instead) + class TurnDetectionTypeValues: + AGORA_VAD = "agora_vad" + SERVER_VAD = "server_vad" + SEMANTIC_VAD = "semantic_vad" + + + # Think action value constants (match Fern wire values) + ThinkOnListeningActionInject = "inject" + ThinkOnListeningActionInterrupt = "interrupt" + ThinkOnListeningActionIgnore = "ignore" + ThinkOnThinkingActionInterrupt = "interrupt" + ThinkOnThinkingActionIgnore = "ignore" + ThinkOnSpeakingActionInterrupt = "interrupt" + ThinkOnSpeakingActionIgnore = "ignore" + status: unresolved + - id: patch-972dd5bd + content_hash: sha256:10f86db20e0b5a3800efce4913b736ff338dee29eb18cb31e89658e0293b848e + original_commit: 972dd5bdafc09b3981ab2ce4e0d02beae165c626 + original_message: updated docs + original_author: 
digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/reference/agent.md + patch_content: |+ + From 972dd5bdafc09b3981ab2ce4e0d02beae165c626 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Thu, 21 May 2026 16:13:35 -0400 + Subject: [PATCH] updated docs + + --- + docs/reference/agent.md | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + + diff --git a/docs/reference/agent.md b/docs/reference/agent.md + index 1e88b8b..3163f9c 100644 + --- a/docs/reference/agent.md + +++ b/docs/reference/agent.md + @@ -264,3 +264,18 @@ to_properties( + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + + +## Type aliases + + + +Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + + +Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. + + + +## Cross-SDK discovery map + + + +| Concept | Python | TypeScript | Go | + +|---|---|---|---| + +| STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + +| xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + +| Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + +| Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + -- + 2.52.0 + + theirs_snapshot: + docs/reference/agent.md: | + --- + sidebar_position: 2 + title: Agent + description: Full API reference for the Python Agent builder class. 
+ --- + + # Agent Reference + + **Import:** `from agora_agent.agentkit import Agent` or `from agora_agent import Agent` + + ## Constructor + + + ```python + Agent( + name: Optional[str] = None, + instructions: Optional[str] = None, + turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, + sal: Optional[SalConfig] = None, + advanced_features: Optional[Dict[str, Any]] = None, + parameters: Optional[SessionParams] = None, + greeting: Optional[str] = None, + failure_message: Optional[str] = None, + max_history: Optional[int] = None, + geofence: Optional[GeofenceConfig] = None, + labels: Optional[Dict[str, str]] = None, + rtc: Optional[RtcConfig] = None, + filler_words: Optional[FillerWordsConfig] = None, + ) + ``` + + | Parameter | Type | Default | Description | + |---|---|---|---| + | `name` | `Optional[str]` | `None` | Agent name, used as default session name | + | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | + | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | + | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | + | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | + | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | + | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | + | `failure_message` | `Optional[str]` | `None` | Spoken on error | + | `max_history` | `Optional[int]` | `None` | Max conversation history length | + | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | + | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | + | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | + | 
`filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | + + ## Builder Methods + + All builder methods return a new `Agent` instance (immutable pattern). + + ### `with_llm(vendor: BaseLLM) -> Agent` + + Set the LLM vendor for cascading flow. + + + ```python + from agora_agent.agentkit.vendors import OpenAI + agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) + ``` + + ### `with_tts(vendor: BaseTTS) -> Agent` + + Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. + + + ```python + from agora_agent.agentkit.vendors import ElevenLabsTTS + agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + ``` + + ### `with_stt(vendor: BaseSTT) -> Agent` + + Set the STT (ASR) vendor. + + + ```python + from agora_agent.agentkit.vendors import DeepgramSTT + agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) + ``` + + ### `with_mllm(vendor: BaseMLLM) -> Agent` + + Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. MLLM sessions do not require TTS, STT, or LLM vendors. + + + ```python + from agora_agent.agentkit.vendors import OpenAIRealtime + agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) + ``` + + ### `with_avatar(vendor: BaseAvatar) -> Agent` + + Set the avatar vendor for the cascading ASR + LLM + TTS pipeline. Avatars are not supported when MLLM is enabled — combining `with_mllm()` and an enabled `with_avatar()` is rejected at `to_properties()` and `AgentSession.start()`. A disabled avatar (`enable=False`) is allowed alongside MLLM. + + Raises `ValueError` if the TTS sample rate does not match the avatar's `required_sample_rate`. 
+ + + ```python + from agora_agent.agentkit.vendors import HeyGenAvatar + agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', agora_uid='2')) + ``` + + **Raises:** `ValueError` — `"Avatar requires TTS sample rate of {required} Hz, but TTS is configured with {actual} Hz. Please update your TTS sample_rate to {required}."` + + ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` + + Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + + Pause-state detection is configured under semantic end-of-speech: + + ```python + agent = agent.with_turn_detection({ + "mode": "default", + "config": { + "end_of_speech": { + "mode": "semantic", + "semantic_config": { + "pause_state_enabled": True, + }, + }, + }, + }) + ``` + + ### `with_interruption(config: InterruptionConfig) -> Agent` + + Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. + + ### `with_instructions(instructions: str) -> Agent` + + Override the system prompt. + + ### `with_greeting(greeting: str) -> Agent` + + Override the greeting message. + + ### `with_name(name: str) -> Agent` + + Override the agent name. + + ### `with_sal(config: SalConfig) -> Agent` + + Set SAL (Selective Attention Locking) configuration. + + ### `with_advanced_features(features: AdvancedFeatures) -> Agent` + + Set advanced features (e.g. `{'enable_rtm': True}`). + + When `enable_rtm=True`, AgentKit defaults `parameters.data_channel` to `"rtm"` unless you explicitly set another data channel. + + ### `with_tools(enabled: bool = True) -> Agent` + + Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. 
+ + ### `with_parameters(parameters: SessionParams) -> Agent` + + Set session parameters (silence config, farewell config, data channel, audio scenario, etc.). + + ### `with_audio_scenario(audio_scenario: ParametersAudioScenario) -> Agent` + + Set `parameters.audio_scenario` without replacing existing session parameters. + + ### `with_failure_message(message: str) -> Agent` + + Set the message spoken via TTS when the LLM call fails. + + ### `with_max_history(max_history: int) -> Agent` + + Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. The v2.7 MLLM core type does not expose `max_history`. + + ### `with_geofence(geofence: GeofenceConfig) -> Agent` + + Set geofence configuration (restricts backend server regions). + + ### `with_labels(labels: Dict[str, str]) -> Agent` + + Set custom labels (key-value pairs returned in notification callbacks). + + ### `with_rtc(rtc: RtcConfig) -> Agent` + + Set RTC configuration. + + ### `with_filler_words(filler_words: FillerWordsConfig) -> Agent` + + Set filler words configuration (played while waiting for LLM response). + + ## `create_session()` + + + ```python + create_session( + client: Any, + channel: str, + agent_uid: str, + remote_uids: List[str], + name: Optional[str] = None, + token: Optional[str] = None, + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + expires_in: Optional[int] = None, + ) -> AgentSession + ``` + + Creates an `AgentSession` bound to the given client and channel. 
+ + | Parameter | Type | Required | Description | + |---|---|---|---| + | `client` | `Agora` or `AsyncAgora` | Yes | Authenticated client | + | `channel` | `str` | Yes | Channel name | + | `agent_uid` | `str` | Yes | UID for the agent | + | `remote_uids` | `List[str]` | Yes | UIDs of remote participants | + | `name` | `Optional[str]` | No | Session name (defaults to agent name) | + | `token` | `Optional[str]` | No | Pre-built RTC+RTM token | + | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | + | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | + | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | + + **Returns:** `AgentSession` + + ## `to_properties()` + + Converts the agent configuration into a `StartAgentsRequestProperties` object for the Agora API. Called internally by `AgentSession.start()`. + + + ```python + to_properties( + channel: str, + agent_uid: str, + remote_uids: List[str], + idle_timeout: Optional[int] = None, + enable_string_uid: Optional[bool] = None, + token: Optional[str] = None, + app_id: Optional[str] = None, + app_certificate: Optional[str] = None, + expires_in: Optional[int] = None, + ) -> StartAgentsRequestProperties + ``` + + **Raises:** `ValueError` if neither `token` nor `app_id`+`app_certificate` is provided, or if required vendors (LLM, TTS) are missing in cascading mode. 
+ + ## Properties + + | Property | Type | Description | + |---|---|---| + | `name` | `Optional[str]` | Agent name | + | `instructions` | `Optional[str]` | System prompt | + | `greeting` | `Optional[str]` | Greeting message | + | `failure_message` | `Optional[str]` | Message spoken when LLM fails | + | `max_history` | `Optional[int]` | Max conversation history length | + | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | + | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | + | `stt` | `Optional[Dict[str, Any]]` | STT config dict | + | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | + | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | + | `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | + | `sal` | `Optional[SalConfig]` | SAL configuration | + | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | + | `parameters` | `Optional[SessionParams]` | Session parameters | + | `geofence` | `Optional[GeofenceConfig]` | Geofence configuration | + | `labels` | `Optional[Dict[str, str]]` | Custom labels | + | `rtc` | `Optional[RtcConfig]` | RTC configuration | + | `filler_words` | `Optional[FillerWordsConfig]` | Filler words configuration | + | `config` | `Dict[str, Any]` | Full configuration dict | + + ## Type aliases + + Public aliases over Fern-generated types: `LlmConfig`, `SttConfig`, `AsrConfig` (= `SttConfig`), `MllmConfig`, `AvatarConfig`, session/conversation types, and think types (`ThinkOnListeningAction`, etc.). + + Think value constants: `ThinkOnListeningActionInject`, `ThinkOnListeningActionInterrupt`, `ThinkOnListeningActionIgnore`, `ThinkOnThinkingActionInterrupt`, `ThinkOnThinkingActionIgnore`, `ThinkOnSpeakingActionInterrupt`, `ThinkOnSpeakingActionIgnore`. 
+ + ## Cross-SDK discovery map + + | Concept | Python | TypeScript | Go | + |---|---|---|---| + | STT payload alias (wire: `asr`) | `SttConfig` / `AsrConfig` | `SttConfig` / `AsrConfig` | `AsrConfig` / `SttConfig` | + | xAI MLLM (primary) | `XaiGrok` | `XaiGrok` | `XaiGrok` / `NewXaiGrok` | + | Avatar token helper | `is_avatar_token_managed` | `isAvatarTokenManaged` | `IsAvatarTokenManaged` | + | Think inject constant | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | `ThinkOnListeningActionInject` | + status: unresolved + - id: patch-7465fada + content_hash: sha256:9c6ed2e5f48702293eed8b213cc31cce63a7ed5a1ad16a0b23e791c13e77746f + original_commit: 7465fadafa0f1e62051d99b42d0eeda85f31eeee + original_message: "fix(agentkit): resolve Python session typing issues" + original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - src/agora_agent/agentkit/agent_session.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action 
import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + - id: patch-d29165c4 + content_hash: sha256:be59d1d3efc435d5e0b83305b2cd39ce3dad4534a4125de18028c137e692e659 + original_commit: d29165c4ddd8296af703a4e9ed848516f563dd1b + original_message: make python compat package publishable + original_author: chenyuguo + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/pyproject.toml + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From d29165c4ddd8296af703a4e9ed848516f563dd1b Mon Sep 17 00:00:00 2001 + From: chenyuguo + Date: Wed, 27 May 2026 17:24:50 +0800 + Subject: [PATCH] make python compat package publishable + + --- + compat/agora-agent-server-sdk/README.md | 2 ++ + compat/agora-agent-server-sdk/pyproject.toml | 3 +++ + .../src/agora_agent_server_sdk_compat/__init__.py | 1 + + 3 files changed, 6 insertions(+) + create mode 100644 compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index 1388836..cff3cfe 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -9,3 +9,5 @@ pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + + +It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ diff --git a/compat/agora-agent-server-sdk/pyproject.toml b/compat/agora-agent-server-sdk/pyproject.toml + index 8efbe53..ac93128 100644 + --- a/compat/agora-agent-server-sdk/pyproject.toml + +++ b/compat/agora-agent-server-sdk/pyproject.toml + @@ -26,6 +26,9 @@ classifiers = [ + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + +packages = [ + + { include = "agora_agent_server_sdk_compat", from = "src"} + +] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + new file mode 100644 + index 0000000..55522c6 + --- /dev/null + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -0,0 +1 @@ + +"""Compatibility package for the renamed agora-agents distribution.""" + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + + It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. + compat/agora-agent-server-sdk/pyproject.toml: | + [project] + name = "agora-agent-server-sdk" + + [tool.poetry] + name = "agora-agent-server-sdk" + version = "v2.0.0" + description = "Compatibility shim for the renamed agora-agents package." 
+ readme = "README.md" + authors = [] + keywords = [] + + classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed" + ] + packages = [ + { include = "agora_agent_server_sdk_compat", from = "src"} + ] + + [tool.poetry.urls] + Repository = 'https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python' + + [tool.poetry.dependencies] + python = "^3.8" + agora-agents = ">=2.0.0,<3.0.0" + + [build-system] + requires = ["poetry-core"] + build-backend = "poetry.core.masonry.api" + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility package for the renamed agora-agents distribution.""" + status: unresolved + - id: patch-fae1249a + content_hash: sha256:01bf21f3cc4c784dfcff80a48c9c7bb3123af4327a567b7c990b528e9780e9a2 + original_commit: fae1249a20c53761a2eb5515a1bf92ca666760d1 + original_message: Re-export agora-agents API from legacy PyPI compatibility package The compat distribution delegates to agora_agent via __getattr__ and documents both import paths in its README. 
+ original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - compat/agora-agent-server-sdk/README.md + - compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + patch_content: |+ + From fae1249a20c53761a2eb5515a1bf92ca666760d1 Mon Sep 17 00:00:00 2001 + From: digitallysavvy + Date: Wed, 27 May 2026 16:58:18 -0400 + Subject: [PATCH] Re-export agora-agents API from legacy PyPI compatibility + package The compat distribution delegates to agora_agent via __getattr__ and + documents both import paths in its README. + + --- + compat/agora-agent-server-sdk/README.md | 7 +++++-- + .../src/agora_agent_server_sdk_compat/__init__.py | 14 +++++++++++++- + 2 files changed, 18 insertions(+), 3 deletions(-) + + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index cff3cfe..e43d1d8 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -8,6 +8,9 @@ New projects should install: + pip install agora-agents + ``` + + -This compatibility package is kept only to preserve the legacy distribution name during the migration window. It depends on `agora-agents`, which continues to provide the `agora_agent` Python import path. + +This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + -It intentionally contains only a minimal compatibility module so the distribution can be built and published cleanly with Poetry. 
+ +```python + +from agora_agent import Agora, Area + +from agora_agent_server_sdk_compat import Agora, Area + +``` + diff --git a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + index 55522c6..6283244 100644 + --- a/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + +++ b/compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py + @@ -1 +1,13 @@ + -"""Compatibility package for the renamed agora-agents distribution.""" + +"""Compatibility re-exports for the renamed agora-agents package.""" + + + +import agora_agent as _agora_agent + + + +__all__ = getattr(_agora_agent, "__all__", []) + + + + + +def __getattr__(name: str): + + return getattr(_agora_agent, name) + + + + + +def __dir__(): + + return dir(_agora_agent) + -- + 2.52.0 + + theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. 
The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + compat/agora-agent-server-sdk/src/agora_agent_server_sdk_compat/__init__.py: | + """Compatibility re-exports for the renamed agora-agents package.""" + + import agora_agent as _agora_agent + + __all__ = getattr(_agora_agent, "__all__", []) + + + def __getattr__(name: str): + return getattr(_agora_agent, name) + + + def __dir__(): + return dir(_agora_agent) + user_owned: true + - id: patch-fc9d93c3 + content_hash: sha256:93877741bdad745fda5dd549d7c3dd6bc315f4574aabd2defb52c0c795bff011 + original_commit: fc9d93c3026a6109d8a5e8b386418592f8d121c5 + original_message: Document agora-agents PyPI install name and migration notes + original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/installation.md + patch_content: | + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index c14bdb2..f6f1750 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -13,13 +13,13 @@ description: Install the Agora Conversational AI Python SDK. + ## Install with pip + + ```sh + -pip install agora-agent-sdk + +pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + -poetry add agora-agent-sdk + +poetry add agora-agents + ``` + + ## Dependencies + theirs_snapshot: + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. 
+ --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Dependencies + + The following packages are installed automatically: + + | Package | Purpose | + |---|---| + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Sync vs. Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + Both clients share the same constructor parameters and capabilities. See [Authentication](./authentication.md) for setup details. + status: unresolved + - id: patch-44c21c14 + content_hash: sha256:34f08060a06ca824943ab02e75c3c83ad43a1b6e7d09ec6f8fa244ef82de6fcd + original_commit: 44c21c14a14aa7ad469a18ce86024ff14ca2bf9b + original_message: Re-export AgentKit symbols from agora_agent package root Extend __getattr__ and __all__ so vendor classes, presets, and helpers are importable via `from agora_agent import ...`. Add tests and update class docstring examples to use the root import path. 
+ original_author: digitallysavvy + base_generation: f652c69edbd1815c832fc9354c193090ac8dde8e + files: + - src/agora_agent/agentkit/agent.py + - src/agora_agent/agentkit/agent_session.py + - tests/custom/test_root_exports.py + patch_content: | + diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py + index f84862c..0d7a4aa 100644 + --- a/src/agora_agent/agentkit/agent.py + +++ b/src/agora_agent/agentkit/agent.py + @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content import + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + +from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py + index a749d1e..ddcd930 100644 + --- a/src/agora_agent/agentkit/agent_session.py + +++ b/src/agora_agent/agentkit/agent_session.py + @@ -24,6 +24,7 @@ from .avatar_types import ( + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + theirs_snapshot: + src/agora_agent/agentkit/agent.py: | + from __future__ import annotations + + import time + import typing + import typing_extensions + + if typing.TYPE_CHECKING: + from .agent_session import AgentSession, AsyncAgentSession + + from 
..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr + from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor + from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar + from ..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor + from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm + from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle + from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm + from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor + from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties + from ..agents.types.get_agents_response import GetAgentsResponse + from ..agents.types.list_agents_response import ListAgentsResponse + from ..agents.types.list_agents_response_data_list_item import ListAgentsResponseDataListItem + from ..agents.types.list_agents_response_data_list_item_status import ListAgentsResponseDataListItemStatus + from ..agents.types.get_history_agents_response import GetHistoryAgentsResponse + from ..agents.types.get_history_agents_response_contents_item import GetHistoryAgentsResponseContentsItem + from ..agents.types.get_history_agents_response_contents_item_role import GetHistoryAgentsResponseContentsItemRole + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.get_turns_agents_response_turns_item import GetTurnsAgentsResponseTurnsItem + from ..agents.types.speak_agents_request_priority import SpeakAgentsRequestPriority + from ..agents.types.start_agents_request_properties_turn_detection 
import StartAgentsRequestPropertiesTurnDetection + from ..agents.types.start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_keywords_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_start_of_speech_disabled_config_strategy import StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_mode import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_vad_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + from ..agents.types.start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config import StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + from 
..agents.types.start_agents_request_properties_turn_detection_type import StartAgentsRequestPropertiesTurnDetectionType + from ..agents.types.start_agents_request_properties_turn_detection_interrupt_mode import StartAgentsRequestPropertiesTurnDetectionInterruptMode + from ..agents.types.start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness + from ..agents.types.start_agents_request_properties_sal import StartAgentsRequestPropertiesSal + from ..agents.types.start_agents_request_properties_sal_sal_mode import StartAgentsRequestPropertiesSalSalMode + from ..agents.types.start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters + from ..agents.types.start_agents_request_properties_parameters_silence_config import StartAgentsRequestPropertiesParametersSilenceConfig + from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction + from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig + from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel + from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario + from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption + from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection + from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode + from 
..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs + from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode + from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem + from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence + from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc + from ..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures + from ..agents.types.start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords + from ..agents.types.start_agents_request_properties_filler_words_trigger import StartAgentsRequestPropertiesFillerWordsTrigger + from ..agents.types.start_agents_request_properties_filler_words_trigger_fixed_time_config import StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + from ..agents.types.start_agents_request_properties_filler_words_content import StartAgentsRequestPropertiesFillerWordsContent + from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + from ..types.tts import Tts + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, + ) + from 
..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse, + ) + from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS + + # Top-level aliases + LlmConfig = StartAgentsRequestPropertiesLlm + LlmStyle = StartAgentsRequestPropertiesLlmStyle + SttConfig = StartAgentsRequestPropertiesAsr + AsrConfig = SttConfig + SttVendor = StartAgentsRequestPropertiesAsrVendor + TtsConfig = Tts + MllmConfig = StartAgentsRequestPropertiesMllm + MllmVendor = StartAgentsRequestPropertiesMllmVendor + AvatarConfig = StartAgentsRequestPropertiesAvatar + AvatarVendor = StartAgentsRequestPropertiesAvatarVendor + TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection + SalConfig = StartAgentsRequestPropertiesSal + SalMode = StartAgentsRequestPropertiesSalSalMode + AdvancedFeatures = StartAgentsRequestPropertiesAdvancedFeatures + SessionParams = StartAgentsRequestPropertiesParameters + + # SOS/EOS turn detection aliases (preferred) + TurnDetectionNestedConfig = StartAgentsRequestPropertiesTurnDetectionConfig + StartOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech + StartOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechMode + StartOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig + StartOfSpeechKeywordsConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechKeywordsConfig + StartOfSpeechDisabledConfig = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfig + StartOfSpeechDisabledConfigStrategy = StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechDisabledConfigStrategy + 
EndOfSpeechConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech + EndOfSpeechMode = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechMode + EndOfSpeechVadConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechVadConfig + EndOfSpeechSemanticConfig = StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig + + # Deprecated turn detection aliases + # Deprecated: Use TurnDetectionConfig with TurnDetectionNestedConfig.start_of_speech + # and .end_of_speech instead. The `type` field and agora_vad/server_vad/semantic_vad + # values will be removed in a future release. + TurnDetectionType = StartAgentsRequestPropertiesTurnDetectionType + + # Deprecated: Use StartOfSpeechConfig with mode="vad"|"keywords"|"disabled" and the + # corresponding vad_config, keywords_config, or disabled_config instead. + InterruptMode = StartAgentsRequestPropertiesTurnDetectionInterruptMode + + # Deprecated: Only applies to server_vad/semantic_vad modes with OpenAI Realtime + # (MLLM). Has no equivalent in the ASR + LLM + TTS pipeline. 
+ Eagerness = StartAgentsRequestPropertiesTurnDetectionEagerness + + # Parameters (SessionParams) sub-type aliases + SilenceConfig = StartAgentsRequestPropertiesParametersSilenceConfig + SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction + FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig + ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel + ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario + InterruptionConfig = StartAgentsRequestPropertiesInterruption + InterruptionMode = StartAgentsRequestPropertiesInterruptionMode + MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + AgentConfig = StartAgentsRequestProperties + AgentConfigUpdate = UpdateAgentsRequestProperties + SessionInfo = GetAgentsResponse + SessionListResponse = ListAgentsResponse + SessionSummary = ListAgentsResponseDataListItem + SessionStatus = ListAgentsResponseDataListItemStatus + ConversationHistory = GetHistoryAgentsResponse + ConversationTurn = GetHistoryAgentsResponseContentsItem + ConversationRole = GetHistoryAgentsResponseContentsItemRole + ConversationTurns = GetTurnsAgentsResponse + ConversationSessionTurn = GetTurnsAgentsResponseTurnsItem + SpeakPriority = SpeakAgentsRequestPriority + Labels = typing.Dict[str, str] + + + class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario + + + class ThinkOptions(typing_extensions.TypedDict, total=False): + on_listening_action: AgentThinkAgentManagementRequestOnListeningAction + on_thinking_action: AgentThinkAgentManagementRequestOnThinkingAction + 
on_speaking_action: AgentThinkAgentManagementRequestOnSpeakingAction + interruptable: bool + metadata: typing.Dict[str, str] + + + class GetTurnsOptions(typing_extensions.TypedDict, total=False): + page_index: int + page_size: int + + + class SayOptions(typing_extensions.TypedDict, total=False): + priority: SpeakAgentsRequestPriority + interruptable: bool + + + class SessionOptions(typing_extensions.TypedDict, total=False): + name: str + channel: str + token: str + agent_uid: str + remote_uids: typing.List[str] + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + # LLM sub-type aliases + LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs + LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode + McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem + + # Additional top-level config aliases + GeofenceConfig = StartAgentsRequestPropertiesGeofence + RtcConfig = StartAgentsRequestPropertiesRtc + FillerWordsConfig = StartAgentsRequestPropertiesFillerWords + FillerWordsTrigger = StartAgentsRequestPropertiesFillerWordsTrigger + FillerWordsTriggerFixedTimeConfig = StartAgentsRequestPropertiesFillerWordsTriggerFixedTimeConfig + FillerWordsContent = StartAgentsRequestPropertiesFillerWordsContent + FillerWordsContentStaticConfig = StartAgentsRequestPropertiesFillerWordsContentStaticConfig + FillerWordsContentSelectionRule = StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule + + # Think type aliases and response + ThinkOnListeningAction = AgentThinkAgentManagementRequestOnListeningAction + ThinkOnThinkingAction = AgentThinkAgentManagementRequestOnThinkingAction + ThinkOnSpeakingAction = AgentThinkAgentManagementRequestOnSpeakingAction + ThinkResponse = AgentThinkAgentManagementResponse + + from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in + + 
+ def _dump_optional_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + + class Agent: + """A reusable agent definition. + + Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) + to configure vendor settings after construction. + + Examples + -------- + >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT + >>> + >>> agent = Agent(instructions="You are a helpful voice assistant.") + >>> agent = ( + ... agent + ... .with_llm(OpenAI(api_key="...", model="gpt-4")) + ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) + ... ) + """ + + def __init__( + self, + name: typing.Optional[str] = None, + instructions: typing.Optional[str] = None, + turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, + sal: typing.Optional[SalConfig] = None, + advanced_features: typing.Optional[AdvancedFeatures] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, + greeting: typing.Optional[str] = None, + failure_message: typing.Optional[str] = None, + max_history: typing.Optional[int] = None, + geofence: typing.Optional[GeofenceConfig] = None, + labels: typing.Optional[typing.Dict[str, str]] = None, + rtc: typing.Optional[RtcConfig] = None, + filler_words: typing.Optional[FillerWordsConfig] = None, + greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + ): + self._name = name + self._instructions = instructions + self._greeting = greeting + self._failure_message = failure_message + self._max_history = max_history + self._llm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts: typing.Optional[typing.Dict[str, typing.Any]] = None + self._stt: 
typing.Optional[typing.Dict[str, typing.Any]] = None + self._mllm: typing.Optional[typing.Dict[str, typing.Any]] = None + self._tts_sample_rate: typing.Optional[int] = None + self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None + self._avatar_required_sample_rate: typing.Optional[int] = None + self._turn_detection = turn_detection + self._interruption = interruption + self._sal = sal + self._advanced_features = advanced_features + self._parameters = parameters + self._geofence = geofence + self._labels = labels + self._rtc = rtc + self._filler_words = filler_words + self._greeting_configs = greeting_configs + + def with_llm(self, vendor: BaseLLM) -> "Agent": + new_agent = self._clone() + new_agent._llm = vendor.to_config() + return new_agent + + def with_tts(self, vendor: BaseTTS) -> "Agent": + sample_rate = vendor.sample_rate + if ( + self._avatar_required_sample_rate not in (None, 0) + and sample_rate is not None + and sample_rate != self._avatar_required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {self._avatar_required_sample_rate} Hz, " + f"but TTS is configured with {sample_rate} Hz. " + f"Please update your TTS sample_rate to {self._avatar_required_sample_rate}." + ) + new_agent = self._clone() + new_agent._tts = vendor.to_config() + new_agent._tts_sample_rate = sample_rate + return new_agent + + def with_stt(self, vendor: BaseSTT) -> "Agent": + new_agent = self._clone() + new_agent._stt = vendor.to_config() + return new_agent + + def with_mllm(self, vendor: BaseMLLM) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` so callers can still + # configure both for tests, debugging, or disabled-avatar use cases. 
+ new_agent = self._clone() + new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = self._copy_model_update( + new_agent._advanced_features, + {"enable_mllm": None}, + ) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model + return new_agent + + def with_avatar(self, vendor: BaseAvatar) -> "Agent": + # Note: avatars are not supported with MLLM. The combination is rejected + # at ``to_properties`` / ``AgentSession.start`` (only when the avatar is + # enabled) so callers may still combine the two for testing or for the + # disabled-avatar pattern. + required_sample_rate = vendor.required_sample_rate + if ( + required_sample_rate not in (None, 0) + and self._tts_sample_rate is not None + and self._tts_sample_rate != required_sample_rate + ): + raise ValueError( + f"Avatar requires TTS sample rate of {required_sample_rate} Hz, " + f"but TTS is configured with {self._tts_sample_rate} Hz. " + f"Please update your TTS sample_rate to {required_sample_rate}." 
+ ) + new_agent = self._clone() + new_agent._avatar = vendor.to_config() + new_agent._avatar_required_sample_rate = required_sample_rate + return new_agent + + def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": + new_agent = self._clone() + new_agent._turn_detection = config + return new_agent + + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + + def with_instructions(self, instructions: str) -> "Agent": + new_agent = self._clone() + new_agent._instructions = instructions + return new_agent + + def with_greeting(self, greeting: str) -> "Agent": + new_agent = self._clone() + new_agent._greeting = greeting + return new_agent + + def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": + """Returns a new Agent with greeting playback configuration.""" + new_agent = self._clone() + new_agent._greeting_configs = configs + return new_agent + + def with_name(self, name: str) -> "Agent": + new_agent = self._clone() + new_agent._name = name + return new_agent + + def with_sal(self, config: SalConfig) -> "Agent": + """Returns a new Agent with the specified SAL (Selective Attention Locking) configuration.""" + new_agent = self._clone() + new_agent._sal = config + return new_agent + + def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": + """Returns a new Agent with the specified advanced features configuration. + + Use this to enable RTM and other advanced features. 
+ """ + new_agent = self._clone() + new_agent._advanced_features = features + return new_agent + + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = self._copy_model_update( + new_agent._advanced_features, + {"enable_tools": enabled}, + ) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": + """Returns a new Agent with the specified session parameters. + + Use this to configure silence behaviour, graceful hang-up, data channel, and more. + """ + new_agent = self._clone() + new_agent._parameters = parameters + return new_agent + + def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent": + """Returns a new Agent with the specified RTC audio scenario.""" + new_agent = self._clone() + if new_agent._parameters is None: + new_agent._parameters = StartAgentsRequestPropertiesParameters(audio_scenario=audio_scenario) + elif isinstance(new_agent._parameters, dict): + new_agent._parameters = typing.cast( + SessionParamsInput, + {**new_agent._parameters, "audio_scenario": audio_scenario}, + ) + else: + new_agent._parameters = self._copy_model_update( + new_agent._parameters, + {"audio_scenario": audio_scenario}, + ) + return new_agent + + def with_failure_message(self, message: str) -> "Agent": + """Returns a new Agent with the specified failure message. + + The failure message is played via TTS when the LLM call fails. 
+ """ + new_agent = self._clone() + new_agent._failure_message = message + return new_agent + + def with_max_history(self, max_history: int) -> "Agent": + """Returns a new Agent with the specified maximum conversation history length.""" + new_agent = self._clone() + new_agent._max_history = max_history + return new_agent + + def with_geofence(self, geofence: GeofenceConfig) -> "Agent": + """Returns a new Agent with the specified geofence configuration. + + Restricts which geographic regions the agent's backend servers may run in. + """ + new_agent = self._clone() + new_agent._geofence = geofence + return new_agent + + def with_labels(self, labels: typing.Dict[str, str]) -> "Agent": + """Returns a new Agent with the specified custom labels. + + Labels are key-value pairs attached to the agent and returned in notification callbacks. + """ + new_agent = self._clone() + new_agent._labels = dict(labels) + return new_agent + + def with_rtc(self, rtc: RtcConfig) -> "Agent": + """Returns a new Agent with the specified RTC configuration.""" + new_agent = self._clone() + new_agent._rtc = rtc + return new_agent + + def with_filler_words(self, filler_words: FillerWordsConfig) -> "Agent": + """Returns a new Agent with the specified filler words configuration. + + Filler words are played while the agent waits for the LLM to respond. 
+ """ + new_agent = self._clone() + new_agent._filler_words = filler_words + return new_agent + + @staticmethod + def _field_value(value: typing.Any, field: str) -> typing.Any: + if value is None: + return None + if isinstance(value, dict): + return value.get(field) + return getattr(value, field, None) + + @staticmethod + def _copy_model_update(value: typing.Any, update: typing.Dict[str, typing.Any]) -> typing.Any: + if hasattr(value, "model_copy"): + return value.model_copy(update=update) + if hasattr(value, "copy"): + return value.copy(update=update) + raise TypeError(f"Object of type {type(value).__name__} does not support model copying") + + def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + enable_rtm = self._field_value(self._advanced_features, "enable_rtm") is True + data_channel = self._field_value(self._parameters, "data_channel") + if not enable_rtm or data_channel is not None: + return self._parameters + if self._parameters is None: + return StartAgentsRequestPropertiesParameters(data_channel="rtm") + if isinstance(self._parameters, dict): + return typing.cast(SessionParamsInput, {**self._parameters, "data_channel": "rtm"}) + return self._copy_model_update(self._parameters, {"data_channel": "rtm"}) + + @property + def name(self) -> typing.Optional[str]: + return self._name + + @property + def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._llm + + @property + def tts(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._tts + + @property + def tts_sample_rate(self) -> typing.Optional[int]: + return self._tts_sample_rate + + @property + def stt(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._stt + + @property + def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._mllm + + @property + def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: + return self._turn_detection + + @property + def 
interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + + @property + def instructions(self) -> typing.Optional[str]: + return self._instructions + + @property + def greeting(self) -> typing.Optional[str]: + return self._greeting + + @property + def greeting_configs(self) -> typing.Optional[LlmGreetingConfigs]: + return self._greeting_configs + + @property + def failure_message(self) -> typing.Optional[str]: + return self._failure_message + + @property + def max_history(self) -> typing.Optional[int]: + return self._max_history + + @property + def avatar(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + return self._avatar + + @property + def sal(self) -> typing.Optional[SalConfig]: + return self._sal + + @property + def advanced_features(self) -> typing.Optional[AdvancedFeatures]: + return self._advanced_features + + @property + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: + return self._parameters + + @property + def geofence(self) -> typing.Optional[GeofenceConfig]: + return self._geofence + + @property + def labels(self) -> typing.Optional[typing.Dict[str, str]]: + return self._labels + + @property + def rtc(self) -> typing.Optional[RtcConfig]: + return self._rtc + + @property + def filler_words(self) -> typing.Optional[FillerWordsConfig]: + return self._filler_words + + @property + def config(self) -> typing.Dict[str, typing.Any]: + return { + "name": self._name, + "instructions": self._instructions, + "greeting": self._greeting, + "failure_message": self._failure_message, + "max_history": self._max_history, + "llm": self._llm, + "tts": self._tts, + "stt": self._stt, + "mllm": self._mllm, + "turn_detection": self._turn_detection, + "interruption": self._interruption, + "sal": self._sal, + "avatar": self._avatar, + "advanced_features": self._advanced_features, + "parameters": self._parameters, + "geofence": self._geofence, + "labels": self._labels, + "rtc": self._rtc, + 
"filler_words": self._filler_words, + "greeting_configs": self._greeting_configs, + } + + def create_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AgentSession": + from .agent_session import AgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def create_async_session( + self, + client: typing.Any, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + name: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ) -> "AsyncAgentSession": + """Create an async session for use with :class:`~agora_agent.AsyncAgora`. 
+ + Equivalent to :meth:`create_session` but returns an + :class:`~agora_agent.agentkit.AsyncAgentSession`. + """ + from .agent_session import AsyncAgentSession + + session_name = name or self._name or f"agent-{int(time.time())}" + return AsyncAgentSession( + client=client, + agent=self, + app_id=client.app_id if hasattr(client, "app_id") else "", + app_certificate=client.app_certificate if hasattr(client, "app_certificate") else None, + name=session_name, + channel=channel, + token=token, + agent_uid=agent_uid, + remote_uids=remote_uids, + idle_timeout=idle_timeout, + enable_string_uid=enable_string_uid, + preset=preset, + pipeline_id=pipeline_id, + expires_in=expires_in, + debug=debug, + warn=warn, + ) + + def to_properties( + self, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + token: typing.Optional[str] = None, + app_id: typing.Optional[str] = None, + app_certificate: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + skip_vendor_validation: bool = False, + ) -> StartAgentsRequestProperties: + # Validate the MLLM + enabled-avatar combination BEFORE generating the + # RTC token so callers get a clear, actionable error first (matches the + # TypeScript and Go SDKs' fail-fast contract). + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) + avatar_enabled = ( + isinstance(self._avatar, dict) and self._avatar.get("enable") is not False + ) + if is_mllm_mode and avatar_enabled: + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if token is None: + if app_id is None or app_certificate is None: + raise ValueError("Either token or app_id+app_certificate must be provided") + validated_expires_in = _validate_expires_in(expires_in) if expires_in is not None else None + # Use generate_convo_ai_token (RTC + RTM) so the token works whether or + # not the caller enables advanced_features.enable_rtm. + token_kwargs: typing.Dict[str, typing.Any] = {} + if validated_expires_in is not None: + token_kwargs["token_expire"] = validated_expires_in + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=channel, + uid=_parse_numeric_uid(agent_uid, "agent_uid"), + **token_kwargs, + ) + + base_kwargs: typing.Dict[str, typing.Any] = { + "channel": channel, + "token": token, + "agent_rtc_uid": agent_uid, + "remote_rtc_uids": remote_uids, + } + + if idle_timeout is not None: + base_kwargs["idle_timeout"] = idle_timeout + if enable_string_uid is not None: + base_kwargs["enable_string_uid"] = enable_string_uid + if self._mllm is not None: + base_kwargs["mllm"] = self._mllm + if self._turn_detection is not None: + base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption + if self._sal is not None: + base_kwargs["sal"] = self._sal + if self._avatar is not None: + base_kwargs["avatar"] = self._avatar + if self._advanced_features is not None: + base_kwargs["advanced_features"] = self._advanced_features + parameters = self._resolved_parameters() + if parameters is not None: + if isinstance(parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**parameters) + else: + base_kwargs["parameters"] = parameters + if self._geofence is not None: + base_kwargs["geofence"] = self._geofence + if self._labels is not None: + base_kwargs["labels"] = self._labels + if self._rtc is not None: + base_kwargs["rtc"] = self._rtc + if self._filler_words is not None: + 
base_kwargs["filler_words"] = self._filler_words + + if is_mllm_mode: + if self._mllm is not None: + mllm_config = dict(self._mllm) + if self._greeting is not None: + mllm_config.setdefault("greeting_message", self._greeting) + if self._failure_message is not None: + mllm_config.setdefault("failure_message", self._failure_message) + base_kwargs["mllm"] = mllm_config + return StartAgentsRequestProperties(**base_kwargs) + + if skip_vendor_validation: + return StartAgentsRequestProperties(**base_kwargs) + + if self._tts is None: + raise ValueError("TTS configuration is required. Use with_tts() to set it.") + + if self._llm is None: + raise ValueError("LLM configuration is required. Use with_llm() to set it.") + + llm_config = dict(self._llm) + # Agent-level fields take priority over the vendor's defaults. + # This matches the TS SDK where agent-level values override vendor config. + if self._instructions is not None: + llm_config["system_messages"] = [{"role": "system", "content": self._instructions}] + if self._greeting is not None: + llm_config["greeting_message"] = self._greeting + if self._greeting_configs is not None: + llm_config["greeting_configs"] = _dump_optional_model(self._greeting_configs) + if self._failure_message is not None: + llm_config["failure_message"] = self._failure_message + if self._max_history is not None: + llm_config["max_history"] = self._max_history + + base_kwargs["llm"] = llm_config + base_kwargs["tts"] = self._tts + if self._stt is not None: + base_kwargs["asr"] = self._stt + + return StartAgentsRequestProperties(**base_kwargs) + + def _clone(self) -> "Agent": + new_agent = Agent.__new__(Agent) + new_agent._name = self._name + new_agent._llm = self._llm + new_agent._tts = self._tts + new_agent._stt = self._stt + new_agent._mllm = self._mllm + new_agent._tts_sample_rate = self._tts_sample_rate + new_agent._avatar = self._avatar + new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate + new_agent._turn_detection = 
self._turn_detection + new_agent._interruption = self._interruption + new_agent._sal = self._sal + new_agent._advanced_features = self._advanced_features + new_agent._parameters = self._parameters + new_agent._instructions = self._instructions + new_agent._greeting = self._greeting + new_agent._failure_message = self._failure_message + new_agent._max_history = self._max_history + new_agent._geofence = self._geofence + new_agent._labels = self._labels + new_agent._rtc = self._rtc + new_agent._filler_words = self._filler_words + new_agent._greeting_configs = self._greeting_configs + return new_agent + src/agora_agent/agentkit/agent_session.py: | + import typing + import warnings + + from ..core.api_error import ApiError + from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, + ) + from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, + ) + from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, + ) + from ..agents.types.get_turns_agents_response import GetTurnsAgentsResponse + from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties + from .agent import Agent, GetTurnsOptions, SayOptions, ThinkOptions + from .avatar_types import ( + is_akool_avatar, + is_anam_avatar, + is_avatar_token_managed, + is_generic_avatar, + is_heygen_avatar, + is_live_avatar_avatar, + is_rtc_avatar, + validate_avatar_config, + validate_tts_sample_rate, + ) + from .presets import resolve_session_presets + from .token import 
generate_convo_ai_token, _parse_numeric_uid + + + class _AgentSessionRequiredOptions(typing.TypedDict, total=True): + """Required fields shared by both sync and async session constructors.""" + + client: typing.Any + agent: Agent + app_id: str + name: str + channel: str + agent_uid: str + remote_uids: typing.List[str] + + + class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): + """Configuration options for creating an agent session. + + Required fields + --------------- + client, agent, app_id, name, channel, agent_uid, remote_uids + + Optional fields + --------------- + app_certificate, token, idle_timeout, enable_string_uid, expires_in + """ + + app_certificate: str + token: str + idle_timeout: int + enable_string_uid: bool + preset: typing.Union[str, typing.Sequence[str]] + pipeline_id: str + expires_in: int + debug: bool + warn: typing.Callable[[str], None] + + + class _AgentSessionBase: + """Shared state and helpers for :class:`AgentSession` and :class:`AsyncAgentSession`. + + Not intended for direct use — instantiate one of the concrete subclasses or + call :meth:`Agent.create_session` / :meth:`Agent.create_async_session`. 
+ """ + + def __init__( + self, + client: typing.Any, + agent: Agent, + app_id: str, + name: str, + channel: str, + agent_uid: str, + remote_uids: typing.List[str], + app_certificate: typing.Optional[str] = None, + token: typing.Optional[str] = None, + idle_timeout: typing.Optional[int] = None, + enable_string_uid: typing.Optional[bool] = None, + preset: typing.Optional[typing.Union[str, typing.Sequence[str]]] = None, + pipeline_id: typing.Optional[str] = None, + expires_in: typing.Optional[int] = None, + debug: typing.Optional[bool] = None, + warn: typing.Optional[typing.Callable[[str], None]] = None, + ): + self._client = client + self._agent = agent + self._app_id = app_id + self._app_certificate = app_certificate + self._name = name + self._channel = channel + self._token = token + self._agent_uid = agent_uid + self._remote_uids = remote_uids + self._idle_timeout = idle_timeout + self._enable_string_uid = enable_string_uid + self._preset = preset + self._pipeline_id = pipeline_id + self._expires_in = expires_in + self._debug = debug + self._warn = warn or warnings.warn + self._agent_id: typing.Optional[str] = None + self._status: str = "idle" + self._event_handlers: typing.Dict[str, typing.List[typing.Callable[..., None]]] = {} + + # ------------------------------------------------------------------ + # Public read-only properties + # ------------------------------------------------------------------ + + @property + def id(self) -> typing.Optional[str]: + return self._agent_id + + @property + def status(self) -> str: + return self._status + + @property + def agent(self) -> Agent: + return self._agent + + @property + def app_id(self) -> str: + return self._app_id + + @property + def raw(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentsClient. + + Use this to access any new endpoints that Fern generates without + waiting for agentkit method updates. 
+ """ + return self._client.agents + + @property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _convo_ai_headers(self) -> typing.Optional[typing.Dict[str, str]]: + """Return per-request auth headers when client is in app-credentials mode. + + In app-credentials mode a fresh ConvoAI token (RTC + RTM) is generated + for every request and returned as ``Authorization: agora token=``. + In basic-auth mode this returns ``None`` (the client-level header is used). + """ + if getattr(self._client, "auth_mode", None) != "app-credentials": + return None + app_id: str = getattr(self._client, "app_id", self._app_id) + app_certificate: typing.Optional[str] = getattr( + self._client, "app_certificate", self._app_certificate + ) + if not app_certificate: + raise RuntimeError("app_certificate is required for app-credentials auth mode") + token = generate_convo_ai_token( + app_id=app_id, + app_certificate=app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(self._agent_uid, "agent_uid"), + ) + return {"Authorization": f"agora token={token}"} + + def _request_options(self) -> typing.Optional[typing.Dict[str, typing.Any]]: + """Build request_options dict with per-request auth headers if needed.""" + headers = self._convo_ai_headers() + if headers is None: + return None + return {"additional_headers": headers} + + def _validate_avatar_config(self) -> None: + avatar = self._agent.avatar + tts = self._agent.tts + if not avatar or avatar.get("enable", True) is False: + return + if self._is_mllm_mode(): + raise ValueError( + "Avatars are only supported with the cascading ASR + LLM + TTS pipeline. " + "Remove the avatar configuration when using MLLM, or switch to a cascading session." 
+ ) + + if ( + is_heygen_avatar(avatar) + or is_live_avatar_avatar(avatar) + or is_akool_avatar(avatar) + or is_anam_avatar(avatar) + or is_generic_avatar(avatar) + ): + validate_avatar_config(avatar) + + tts_params = tts.get("params") if isinstance(tts, dict) else None + sample_rate = self._agent.tts_sample_rate + if sample_rate is None and isinstance(tts_params, dict): + sample_rate = ( + tts_params.get("sample_rate") + or tts_params.get("sample_rate_hertz") + or tts_params.get("samplingRate") + ) + if isinstance(sample_rate, int): + validate_tts_sample_rate(avatar, sample_rate) + elif is_heygen_avatar(avatar): + self._warn( + "Warning: HeyGen avatar detected but TTS sample_rate is not explicitly set. " + "HeyGen requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_live_avatar_avatar(avatar): + self._warn( + "Warning: LiveAvatar avatar detected but TTS sample_rate is not explicitly set. " + "LiveAvatar requires 24,000 Hz. Please ensure your TTS provider is configured for 24kHz." + ) + elif is_akool_avatar(avatar): + self._warn( + "Warning: Akool avatar detected but TTS sample_rate is not explicitly set. " + "Akool requires 16,000 Hz. Please ensure your TTS provider is configured for 16kHz." 
+ ) + + def _enrich_avatar_for_session(self, properties: typing.Dict[str, typing.Any]) -> None: + avatar = properties.get("avatar") + if not isinstance(avatar, dict) or avatar.get("enable", True) is False: + return + + params = avatar.get("params") + if not isinstance(params, dict): + params = {} + avatar["params"] = params + + if is_generic_avatar(avatar): + if not params.get("agora_appid"): + params["agora_appid"] = self._app_id + if not params.get("agora_channel"): + params["agora_channel"] = self._channel + + if not is_avatar_token_managed(avatar): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_uid"): + validate_avatar_config(avatar, require_session_fields=is_generic_avatar(avatar)) + return + + if not params.get("agora_token"): + if not self._app_certificate: + raise ValueError( + "Cannot auto-generate avatar RTC token: app_certificate is required when agora_token is omitted. " + "Pass app_certificate on the Agora client or supply agora_token explicitly on the avatar vendor." + ) + token_kwargs: typing.Dict[str, typing.Any] = {} + if self._expires_in is not None: + token_kwargs["token_expire"] = self._expires_in + params["agora_token"] = generate_convo_ai_token( + app_id=self._app_id, + app_certificate=self._app_certificate, + channel_name=self._channel, + uid=_parse_numeric_uid(str(params["agora_uid"]), "avatar agora_uid"), + **token_kwargs, + ) + + if str(params.get("agora_uid")) == self._agent_uid: + self._warn( + "Warning: avatar agora_uid matches agent_rtc_uid. Use a unique UID for the avatar video publisher." 
+ ) + + validate_avatar_config(avatar, require_session_fields=True) + + @staticmethod + def _dump_model(value: typing.Any) -> typing.Any: + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if isinstance(value, dict): + return {k: _AgentSessionBase._dump_model(v) for k, v in value.items() if v is not None} + if isinstance(value, list): + return [_AgentSessionBase._dump_model(item) for item in value] + return value + + def _is_mllm_mode(self) -> bool: + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None + + def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + base_properties = self._agent.to_properties( + channel=self._channel, + agent_uid=self._agent_uid, + remote_uids=self._remote_uids, + idle_timeout=self._idle_timeout, + enable_string_uid=self._enable_string_uid, + skip_vendor_validation=True, + **token_opts, + ) + properties = self._dump_model(base_properties) + self._enrich_avatar_for_session(properties) + + if self._is_mllm_mode(): + if self._agent.mllm is not None: + mllm = self._dump_model(self._agent.mllm) + if not isinstance(mllm, dict): + mllm = {} + if self._agent.greeting is not None: + mllm.setdefault("greeting_message", self._agent.greeting) + if self._agent.failure_message is not None: + mllm.setdefault("failure_message", self._agent.failure_message) + properties["mllm"] = mllm + return properties + + if self._agent.tts is not None: + properties["tts"] = self._dump_model(self._agent.tts) + if self._agent.llm is not None: + llm = dict(self._agent.llm) + if self._agent.instructions is not None: + llm["system_messages"] = [{"role": "system", "content": self._agent.instructions}] + if self._agent.greeting is not None: + llm["greeting_message"] = self._agent.greeting + if self._agent.greeting_configs is not None: + llm["greeting_configs"] = self._dump_model(self._agent.greeting_configs) + if 
self._agent.failure_message is not None: + llm["failure_message"] = self._agent.failure_message + if self._agent.max_history is not None: + llm["max_history"] = self._agent.max_history + properties["llm"] = llm + if self._agent.stt is not None: + properties["asr"] = self._dump_model(self._agent.stt) + + return properties + + @staticmethod + def _page_value(pagination: typing.Any, field: str) -> typing.Any: + if pagination is None: + return None + if isinstance(pagination, dict): + return pagination.get(field) + return getattr(pagination, field, None) + + @staticmethod + def _response_turns(response: typing.Any) -> typing.List[typing.Any]: + turns = response.get("turns") if isinstance(response, dict) else getattr(response, "turns", None) + return list(turns or []) + + @staticmethod + def _response_pagination(response: typing.Any) -> typing.Any: + if isinstance(response, dict): + return response.get("pagination") + return getattr(response, "pagination", None) + + @classmethod + def _with_all_turns(cls, first_response: typing.Any, turns: typing.List[typing.Any]) -> GetTurnsAgentsResponse: + data = cls._dump_model(first_response) + if not isinstance(data, dict): + data = {} + data["turns"] = turns + return GetTurnsAgentsResponse(**data) + + # ------------------------------------------------------------------ + # Event handling + # ------------------------------------------------------------------ + + def on(self, event: str, handler: typing.Callable[..., None]) -> None: + """Register an event handler. + + Parameters + ---------- + event : str + The event type (``started``, ``stopped``, ``error``). + handler : callable + The event handler to invoke when the event fires. 
+ """ + if event not in self._event_handlers: + self._event_handlers[event] = [] + self._event_handlers[event].append(handler) + + def off(self, event: str, handler: typing.Callable[..., None]) -> None: + """Unregister a previously registered event handler.""" + handlers = self._event_handlers.get(event) + if handlers and handler in handlers: + handlers.remove(handler) + + def _emit(self, event: str, data: typing.Any) -> None: + handlers = self._event_handlers.get(event) + if handlers: + for handler in handlers: + try: + handler(data) + except Exception as exc: + # Prevent a misbehaving handler from blocking other handlers or + # the session lifecycle. Warn so the error is not silently lost. + warnings.warn( + f"Event handler for '{event}' raised an exception: {exc}", + stacklevel=2, + ) + + + class AgentSession(_AgentSessionBase): + """Manages the lifecycle of an agent session (synchronous). + + This class provides a high-level interface for managing agent sessions, + including starting, stopping, and interacting with the agent. + + Use :meth:`Agent.create_session` to create a session — this is the + recommended entry point. + + Examples + -------- + >>> from agora_agent import Agora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = session.start() + >>> session.say("Hello!") + >>> session.stop() + """ + + def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. 
+ """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + current_page = next_page + return self._with_all_turns(response, all_turns) + + + class AsyncAgentSession(_AgentSessionBase): + """Async version of :class:`AgentSession` for use with :class:`AsyncAgora`. + + Use :meth:`Agent.create_async_session` to create a session — this is the + recommended entry point. 
+ + Examples + -------- + >>> from agora_agent import AsyncAgora, Area, Agent, OpenAI, ElevenLabsTTS + >>> + >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") + >>> agent = Agent(name="assistant", instructions="You are helpful.") + >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) + >>> agent_id = await session.start() + >>> await session.say("Hello!") + >>> await session.stop() + """ + + async def start(self) -> str: + """Start the agent session. + + Returns + ------- + str + The agent ID. + + Raises + ------ + RuntimeError + If the session is not in a startable state. + ValueError + If avatar/TTS configuration is invalid. + """ + if self._status not in ("idle", "stopped", "error"): + raise RuntimeError(f"Cannot start session in {self._status} state") + + self._validate_avatar_config() + self._status = "starting" + + try: + if self._token: + token_opts: typing.Dict[str, typing.Any] = {"token": self._token} + else: + token_opts = { + "app_id": self._app_id, + "app_certificate": self._app_certificate, + "expires_in": self._expires_in, + } + + properties = self._build_start_properties(token_opts) + resolved_preset, resolved_properties = resolve_session_presets( + self._preset, + properties, + ) + + if self._debug: + print("[Agora Debug] Starting agent session...") + print("[Agora Debug] Request:", { + "appid": self._app_id, + "name": self._name, + "preset": resolved_preset, + "pipeline_id": self._pipeline_id, + "properties": resolved_properties, + }) + + try: + request_properties: typing.Any = StartAgentsRequestProperties(**resolved_properties) + except Exception: + request_properties = resolved_properties + + response = await self._client.agents.start( + self._app_id, + name=self._name, + properties=request_properties, + preset=resolved_preset, + 
pipeline_id=self._pipeline_id, + request_options=self._request_options(), + ) + + self._agent_id = response.agent_id if hasattr(response, "agent_id") else None + self._status = "running" + self._emit("started", {"agent_id": self._agent_id}) + return self._agent_id or "" + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def stop(self) -> None: + """Stop the agent session. + + If the agent has already stopped (e.g., crashed or timed out), the + server returns 404, which this method treats as a successful stop + rather than raising an error. + """ + if self._status != "running": + raise RuntimeError(f"Cannot stop session in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + self._status = "stopping" + + try: + await self._client.agents.stop( + self._app_id, self._agent_id, request_options=self._request_options() + ) + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + except ApiError as e: + if e.status_code == 404: + self._status = "stopped" + self._emit("stopped", {"agent_id": self._agent_id}) + return + self._status = "error" + self._emit("error", e) + raise + except Exception as e: + self._status = "error" + self._emit("error", e) + raise + + async def say( + self, + text: str, + priority: typing.Optional[str] = None, + interruptable: typing.Optional[bool] = None, + *, + options: typing.Optional["SayOptions"] = None, + ) -> None: + """Send a message to be spoken by the agent. + + Parameters + ---------- + text : str + The text to speak. + priority : str, optional + Priority of the message (``INTERRUPT``, ``APPEND``, ``IGNORE``). + interruptable : bool, optional + Whether the message can be interrupted by the user. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot say in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if priority is not None: + kwargs["priority"] = priority + if interruptable is not None: + kwargs["interruptable"] = interruptable + + await self._client.agents.speak( + self._app_id, self._agent_id, request_options=self._request_options(), **kwargs + ) + + async def interrupt(self) -> None: + """Interrupt the agent while it is speaking or thinking.""" + if self._status != "running": + raise RuntimeError(f"Cannot interrupt in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.interrupt( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + options: typing.Optional["ThinkOptions"] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline. + + In API v2.7, omitting ``on_listening_action`` uses the server default + ``"interrupt"``. Pass ``on_listening_action="inject"`` explicitly to + preserve the pre-v2.7 behavior. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if options is not None: + kwargs.update(options) + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def update(self, properties: typing.Any) -> None: + """Update the agent configuration at runtime. + + Parameters + ---------- + properties : UpdateAgentsRequestProperties + Partial configuration to update. 
+ """ + if self._status != "running": + raise RuntimeError(f"Cannot update in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + await self._client.agents.update( + self._app_id, + self._agent_id, + properties=properties, + request_options=self._request_options(), + ) + + async def get_history(self) -> typing.Any: + """Get the conversation history.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get_history( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_info(self) -> typing.Any: + """Get the current session info.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + return await self._client.agents.get( + self._app_id, self._agent_id, request_options=self._request_options() + ) + + async def get_turns( + self, + *, + page_index: typing.Optional[int] = None, + page_size: typing.Optional[int] = None, + options: typing.Optional["GetTurnsOptions"] = None, + ) -> GetTurnsAgentsResponse: + """Get turn-by-turn analytics and timing details for this session.""" + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {} + if options is not None: + kwargs.update(options) + if page_index is not None: + kwargs["page_index"] = page_index + if page_size is not None: + kwargs["page_size"] = page_size + + return await self._client.agents.get_turns( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + + async def get_all_turns(self, *, page_size: typing.Optional[int] = None) -> GetTurnsAgentsResponse: + """Get all turn analytics pages for this session. + + Raises ``RuntimeError`` if the server's pagination metadata is missing + the fields required to advance, or if requesting the next page returns + a page index that did not advance. 
+ """ + response = await self.get_turns(page_index=1, page_size=page_size) + all_turns = self._response_turns(response) + pagination = self._response_pagination(response) + current_page = self._page_value(pagination, "page_index") or 1 + while pagination is not None and self._page_value(pagination, "is_last_page") is False: + total_pages = self._page_value(pagination, "total_pages") + returned_index = self._page_value(pagination, "page_index") + if returned_index is None and total_pages is None: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." + ) + if total_pages is not None and current_page >= total_pages: + break + next_page = current_page + 1 + response = await self.get_turns(page_index=next_page, page_size=page_size) + all_turns.extend(self._response_turns(response)) + pagination = self._response_pagination(response) + returned_index = self._page_value(pagination, "page_index") if pagination else None + if returned_index is not None: + if returned_index <= current_page and self._page_value(pagination, "is_last_page") is not True: + raise RuntimeError( + f"get_all_turns pagination did not advance: requested page {next_page}, " + f"received page {returned_index}." + ) + current_page = returned_index + else: + total_pages = self._page_value(pagination, "total_pages") if pagination else None + is_last_page = self._page_value(pagination, "is_last_page") if pagination else None + if total_pages is None and is_last_page is not True: + raise RuntimeError( + "get_all_turns pagination cannot continue: response must include " + "page_index, total_pages, or is_last_page=true." 
+ ) + current_page = next_page + return self._with_all_turns(response, all_turns) + tests/custom/test_root_exports.py: | + import pytest + + import agora_agent + import agora_agent.agentkit as agentkit + + + def test_root_exports_match_agentkit_for_common_symbols() -> None: + for name in ("Agent", "DeepgramSTT", "OpenAI", "AgentPresets", "generate_rtc_token", "DataChannel"): + assert getattr(agora_agent, name) is getattr(agentkit, name) + + + def test_root_exports_fern_client_symbols() -> None: + assert agora_agent.Agora is not None + assert agora_agent.Area is not None + assert agora_agent.AsyncAgora is not None + + + def test_unknown_root_export_raises_attribute_error() -> None: + with pytest.raises(AttributeError): + _ = agora_agent.NotARealExportName + + + def test_dir_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in dir(agora_agent) + + + def test_all_includes_agentkit_vendor_exports() -> None: + assert "DeepgramSTT" in agora_agent.__all__ + assert "OpenAI" in agora_agent.__all__ + - id: patch-d475306b + content_hash: sha256:407af5e7564d6e8d0b91f1e117cb433aec931f083225af53c6df2abfff281b22 + original_commit: d475306bd42279984bcf4934b900003e8e02c4eb + original_message: Move package rename guidance to installation docs and protect manual paths in Fern ignore. Consolidate migration notes into the installation guide with next-step links, add a brief README pointer, and exclude README, compat, and workflow files from Fern generation. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - compat/agora-agent-server-sdk/README.md + - docs/getting-started/installation.md + patch_content: | + diff --git a/compat/agora-agent-server-sdk/README.md b/compat/agora-agent-server-sdk/README.md + index e43d1d8..1da36aa 100644 + --- a/compat/agora-agent-server-sdk/README.md + +++ b/compat/agora-agent-server-sdk/README.md + @@ -14,3 +14,5 @@ This compatibility package re-exports the public API from `agora-agents` to supp + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + + +Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md + index 04b48da..8fca9ab 100644 + --- a/docs/getting-started/installation.md + +++ b/docs/getting-started/installation.md + @@ -53,4 +53,15 @@ from agora_agent import AsyncAgora, AsyncAgentSession, Area + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + -See [Authentication](./authentication.md) for setup details. + +## Next steps + + + +- [Authentication](./authentication.md) — configure your credentials + +- [Quick Start](./quick-start.md) — build your first conversational agent + + + +## Migrating from a previous package name + + + +The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + + +The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + + +For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). 
+ theirs_snapshot: + compat/agora-agent-server-sdk/README.md: | + # agora-agent-server-sdk + + This package has been renamed to `agora-agents`. + + New projects should install: + + ```sh + pip install agora-agents + ``` + + This compatibility package re-exports the public API from `agora-agents` to support existing installs during the migration window. The primary import path remains `agora_agent`; you can also import from `agora_agent_server_sdk_compat`: + + ```python + from agora_agent import Agora, Area + from agora_agent_server_sdk_compat import Agora, Area + ``` + + Maintainers: dual-publish steps live in the repository release workflow, not in the root README. + docs/getting-started/installation.md: | + --- + sidebar_position: 1 + title: Installation + description: Install the Agora Conversational AI Python SDK. + --- + + # Installation + + ## Prerequisites + + - Python >= 3.8 + + ## Install with pip + + ```sh + pip install agora-agents + ``` + + ## Install with Poetry + + ```sh + poetry add agora-agents + ``` + + ## Imports + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI + ``` + + The package installs as `agora-agents` and imports as `agora_agent`. + + ## Sync vs. 
Async + + The SDK supports both synchronous and asynchronous usage: + + - **Synchronous** — import `Agora` from `agora_agent` and use blocking method calls + - **Asynchronous** — import `AsyncAgora` and `AsyncAgentSession` from `agora_agent` and use `await` with all API calls + + ```python + # Sync + from agora_agent import Agora, Area + + # Async + from agora_agent import AsyncAgora, AsyncAgentSession, Area + ``` + + ## Dependencies + + | Package | Purpose | + | ------------------------------ | ------------------------------------------------------ | + | `httpx` (>= 0.21.2) | HTTP client for sync and async requests | + | `pydantic` (>= 1.9.2) | Data validation for vendor configuration and API types | + | `typing_extensions` (>= 4.0.0) | Backported type hints for Python 3.8+ | + + ## Next steps + + - [Authentication](./authentication.md) — configure your credentials + - [Quick Start](./quick-start.md) — build your first conversational agent + + ## Migrating from a previous package name + + The PyPI distribution was renamed from `agora-agent-server-sdk` to `agora-agents` in v2.0.0. Install `agora-agents`; the import path remains `agora_agent`. + + The legacy PyPI name remains available as a compatibility shim that re-exports `agora-agents`. See [compat/agora-agent-server-sdk](../../compat/agora-agent-server-sdk/README.md). + + For release and version details, see [changelog — Migration notes](../../changelog.md#migration-notes). + status: unresolved + - id: patch-c9355576 + content_hash: sha256:83b3b6148b21f2b4d53ee67321777522f5f4e871b61ea3b23f3a6b88ca052769 + original_commit: c93555763ffd63267a737b3e430217a890f203db + original_message: Streamline Python docs and README for app-credentials-first onboarding. Remove duplicated low-level client examples from the README, de-emphasize legacy auth modes, refocus the low-level API guide on AgentKit with telephony escape hatches, and update Agora-managed model terminology. 
+ original_author: digitallysavvy + base_generation: a217c8ecfd919345831eebaca8295e292d65ebcf + files: + - docs/getting-started/authentication.md + - docs/guides/low-level-api.md + patch_content: | + diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md + index 31dcc56..74c62cd 100644 + --- a/docs/getting-started/authentication.md + +++ b/docs/getting-started/authentication.md + @@ -46,41 +46,6 @@ session = agent.create_session( + print(client.auth_mode) # "app-credentials" + ``` + + -## Other auth modes + +## Legacy auth modes + + -The SDK also supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. These are not recommended for new applications. + - + -### Token auth (`auth_token`) + - + -Pass a pre-minted Agora REST token on the client. You must also supply the RTC join token on `create_session(..., token=...)`. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - auth_token="your-rest-auth-token", + -) + - + -session = agent.create_session( + - client, + - channel="room-123", + - agent_uid="1", + - remote_uids=["100"], + - token="your-rtc-join-token", + -) + -``` + - + -### Basic Auth (`customer_id` + `customer_secret`) + - + -Uses HTTP Basic Auth with Customer ID and Secret from Agora Console. Avoid for new integrations — the same credentials are sent on every request instead of minting fresh tokens. + - + -```python + -client = Agora( + - area=Area.US, + - app_id="your-app-id", + - app_certificate="your-app-certificate", + - customer_id="your-customer-id", + - customer_secret="your-customer-secret", + -) + -``` + +The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. 
+ diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md + index 6677b45..47397b7 100644 + --- a/docs/guides/low-level-api.md + +++ b/docs/guides/low-level-api.md + @@ -1,187 +1,55 @@ + --- + sidebar_position: 10 + title: Low-Level API + -description: Direct client.agents.start() usage without the builder pattern. + +description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + -For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. + +Use the `Agent` builder and `AgentSession` for conversational agent starts. That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + -## Raw telephony and phone-number APIs + - + -AgentKit focuses on realtime agent session helpers. Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + - + -- `client.telephony` for call status and hangup operations + -- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + +Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. 
+ + -## Cascading flow (ASR → LLM → TTS) + +## Client setup + + ```python + from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import Tts_Elevenlabs + + client = Agora( + area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + -client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + + app_id="your-app-id", + + app_certificate="your-app-certificate", + ) + ``` + + -## Async (low-level) + +## Raw telephony and phone-number APIs + + -```python + -import asyncio + -from agora_agent import Area, AsyncAgora + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesAsr, + - StartAgentsRequestPropertiesLlm, + -) + -from agora_agent.types.eleven_labs_tts_params import ElevenLabsTtsParams + -from agora_agent.types.tts import 
Tts_Elevenlabs + +AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + -client = AsyncAgora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + +- `client.telephony` for call status and hangup operations + +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + + +```python + +calls = client.telephony.list( + + appid=client.app_id, + + type="sip", + ) + + -async def main() -> None: + - await client.agents.start( + - client.app_id, + - name="unique_name", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - asr=StartAgentsRequestPropertiesAsr( + - language="en-US", + - vendor="deepgram", + - params={"api_key": "YOUR_DEEPGRAM_API_KEY"}, + - ), + - tts=Tts_Elevenlabs( + - params=ElevenLabsTtsParams( + - key="YOUR_ELEVENLABS_API_KEY", + - model_id="eleven_flash_v2_5", + - voice_id="pNInz6obpgDQGcFmaJgB", + - sample_rate=24000, + - ), + - ), + - llm=StartAgentsRequestPropertiesLlm( + - url="https://api.openai.com/v1/chat/completions", + - api_key="", + - system_messages=[ + - {"role": "system", "content": "You are a helpful chatbot."} + - ], + - params={"model": "gpt-4o-mini"}, + - max_history=32, + - greeting_message="Hello, how can I assist you today?", + - failure_message="Please hold on a second.", + - ), + - ), + - ) + - + -asyncio.run(main()) + +for call in calls: + + print(call.id, call.state) + ``` + + -## MLLM flow (multimodal) + +## Direct agent APIs + + -For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flow instead of the cascading ASR → LLM → TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview). + +`client.agents` exposes the generated REST surface for advanced integrations. 
Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + -```python + -from agora_agent import Agora, Area + -from agora_agent.agents import ( + - StartAgentsRequestProperties, + - StartAgentsRequestPropertiesMllm, + - StartAgentsRequestPropertiesMllmVendor, + - StartAgentsRequestPropertiesTts, + - StartAgentsRequestPropertiesTtsVendor, + - StartAgentsRequestPropertiesLlm, + -) + +If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + -client = Agora( + - area=Area.US, + - app_id="YOUR_APP_ID", + - app_certificate="YOUR_APP_CERTIFICATE", + - auth_token="your-rest-auth-token", + -) + - + -client.agents.start( + - client.app_id, + - name="mllm_agent", + - properties=StartAgentsRequestProperties( + - channel="channel_name", + - token="your_token", + - agent_rtc_uid="1001", + - remote_rtc_uids=["1002"], + - idle_timeout=120, + - mllm=StartAgentsRequestPropertiesMllm( + - enable=True, + - url="wss://api.openai.com/v1/realtime", + - api_key="", + - vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + - params={ + - "model": "gpt-4o-realtime-preview", + - "voice": "alloy", + - }, + - input_modalities=["audio"], + - output_modalities=["text", "audio"], + - greeting_message="Hello! I'm ready to chat in real-time.", + - turn_detection={ + - "mode": "server_vad", + - "server_vad_config": { + - "idle_timeout_ms": 5000, + - }, + - }, + - ), + - ), + +```python + +info = session.raw.get( + + appid=session.app_id, + + agent_id=session.id, + ) + ``` + + -For more on the agentkit-based MLLM flow, see [MLLM Flow](./mllm-flow.md). + +You must pass `appid` and `agent_id` manually when using generated raw methods. 
+ theirs_snapshot: + docs/getting-started/authentication.md: | + --- + sidebar_position: 2 + title: Authentication + description: Configure the Python SDK with app credentials and understand other supported auth modes. + --- + + # Authentication + + Create `Agora` or `AsyncAgora` with `app_id` and `app_certificate` only. The SDK mints a fresh ConvoAI REST token for each API call and generates the RTC join token when the session starts. + + ## App credentials + + ```python + from agora_agent import Agent, Agora, Area, DeepgramSTT, OpenAI, MiniMaxTTS + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + + agent = ( + Agent(instructions="Be concise.") + .with_stt(DeepgramSTT(model="nova-3")) + .with_llm(OpenAI(model="gpt-4o-mini")) + .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) + ) + + session = agent.create_session( + client, + channel="room-123", + agent_uid="1", + remote_uids=["100"], + ) + ``` + + ## Why app credentials + + - Fresh short-lived tokens per API call instead of reusing long-lived credentials + - No Customer ID / Customer Secret in request headers + - No manual REST or RTC token provisioning in application code + + ## Inspecting auth mode + + ```python + print(client.auth_mode) # "app-credentials" + ``` + + ## Legacy auth modes + + The generated client still supports pre-minted REST tokens and HTTP Basic Auth for legacy integrations. Do not use those modes for new session integrations. Use app credentials so AgentKit can mint short-lived ConvoAI REST auth and RTC join tokens for each session. + docs/guides/low-level-api.md: | + --- + sidebar_position: 10 + title: Low-Level API + description: Use generated clients for escape-hatch APIs while keeping agent sessions on AgentKit. + --- + + # Low-Level API + + Use the `Agent` builder and `AgentSession` for conversational agent starts. 
That path generates ConvoAI REST auth and RTC join tokens from `app_id` and `app_certificate`, so application code does not need prebuilt REST tokens, RTC tokens, Customer ID, or Customer Secret. + + Generated clients are still available for API surface that AgentKit does not wrap yet, such as telephony and phone-number management. + + ## Client setup + + ```python + from agora_agent import Agora, Area + + client = Agora( + area=Area.US, + app_id="your-app-id", + app_certificate="your-app-certificate", + ) + ``` + + ## Raw telephony and phone-number APIs + + AgentKit focuses on realtime agent session helpers. Use generated clients for operational APIs: + + - `client.telephony` for call status and hangup operations + - `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + + ```python + calls = client.telephony.list( + appid=client.app_id, + type="sip", + ) + + for call in calls: + print(call.id, call.state) + ``` + + ## Direct agent APIs + + `client.agents` exposes the generated REST surface for advanced integrations. Prefer `agent.create_session(...).start()` for new session starts because it handles auth, token generation, vendor serialization, lifecycle state, and avatar enrichment. + + If you need an endpoint that is not wrapped by `AgentSession`, use `session.raw` after creating the session: + + ```python + info = session.raw.get( + appid=session.app_id, + agent_id=session.id, + ) + ``` + + You must pass `appid` and `agent_id` manually when using generated raw methods. + status: unresolved diff --git a/reference.md b/reference.md index 55a516e..57fc92a 100644 --- a/reference.md +++ b/reference.md @@ -27,11 +27,16 @@ Create and start a Conversational AI agent instance.
```python -from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, +) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -51,9 +56,7 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -61,13 +64,15 @@ client.agents.start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..0d7a4aa 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -67,6 +67,8 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule +from ..types.tts import Tts from 
..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..ddcd930 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -24,6 +24,7 @@ is_generic_avatar, is_heygen_avatar, is_live_avatar_avatar, + is_rtc_avatar, validate_avatar_config, validate_tts_sample_rate, ) diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 50bdd08..f48098c 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -177,6 +177,49 @@ def to_config(self) -> Dict[str, Any]: return {"enable": enable, "vendor": "generic", "params": params} +class GenericAvatarOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Generic avatar provider API key") + api_base_url: str = Field(..., description="Avatar provider API base URL") + avatar_id: str = Field(..., description="Avatar ID") + agora_uid: str = Field(..., description="Agora UID for the avatar video stream") + agora_appid: Optional[str] = Field(default=None, description="Agora App ID; filled by AgentSession when omitted") + agora_token: Optional[str] = Field(default=None, description="RTC token; generated by AgentSession when omitted") + agora_channel: Optional[str] = Field(default=None, description="Agora channel; filled by AgentSession when omitted") + enable: Optional[bool] = Field(default=None, description="Enable avatar (default: true)") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional vendor-specific parameters") + + +class GenericAvatar(BaseAvatar): + def __init__(self, **kwargs: Any): + self.options = GenericAvatarOptions(**kwargs) + + @property + def required_sample_rate(self) -> int: + return 0 
+ + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "api_base_url": self.options.api_base_url, + "avatar_id": self.options.avatar_id, + "agora_uid": self.options.agora_uid, + } + + if self.options.agora_appid is not None: + params["agora_appid"] = self.options.agora_appid + if self.options.agora_token is not None: + params["agora_token"] = self.options.agora_token + if self.options.agora_channel is not None: + params["agora_channel"] = self.options.agora_channel + if self.options.additional_params is not None: + params = {**self.options.additional_params, **params} + + enable = self.options.enable if self.options.enable is not None else True + return {"enable": enable, "vendor": "generic", "params": params} + + class AnamAvatarOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..62cb3f2 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, List, Optional from pydantic import BaseModel, ConfigDict, Field diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 3f6af4c..e923c9a 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -84,11 +84,16 @@ def start( Examples -------- - from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -108,9 +113,7 @@ def start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], 
idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -118,13 +121,15 @@ def start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", @@ -641,11 +646,16 @@ async def start( -------- import asyncio - from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Asr_Ares, + AsyncAgora, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -668,9 +678,7 @@ async def main() -> None: agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -678,13 +686,15 @@ async def main() -> None: voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py 
b/src/agora_agent/agents/types/start_agents_request_properties.py index 06c3482..3cddb7e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -5,15 +5,15 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr import Asr +from ...types.llm import Llm +from ...types.mllm import Mllm from ...types.tts import Tts from .start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures -from .start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr from .start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption -from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters from .start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from .start_agents_request_properties_sal import StartAgentsRequestPropertiesSal @@ -67,7 +67,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Advanced features configuration. """ - asr: typing.Optional[StartAgentsRequestPropertiesAsr] = pydantic.Field(default=None) + asr: typing.Optional[Asr] = pydantic.Field(default=None) """ Automatic Speech Recognition (ASR) configuration. """ @@ -77,12 +77,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Text-to-speech (TTS) module configuration. 
""" - llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) + llm: typing.Optional[Llm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ - mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) + mllm: typing.Optional[Mllm] = pydantic.Field(default=None) """ Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr.py b/src/agora_agent/agents/types/start_agents_request_properties_asr.py deleted file mode 100644 index 7385e17..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor - - -class StartAgentsRequestPropertiesAsr(UncheckedBaseModel): - """ - Automatic Speech Recognition (ASR) configuration. - """ - - language: typing.Optional[str] = pydantic.Field(default=None) - """ - The BCP-47 language tag identifying the primary language used for agent interaction. If `params` contains a vendor-specific language code, it takes precedence over this setting. 
- """ - - vendor: typing.Optional[StartAgentsRequestPropertiesAsrVendor] = pydantic.Field(default=None) - """ - ASR provider: - - `ares`: Adaptive Recognition Engine for Speech - - `microsoft`: Microsoft Azure - - `deepgram`: Deepgram - - `openai`: OpenAI (Beta) - - `speechmatics`: Speechmatics - - `assemblyai`: AssemblyAI (Beta) - - `amazon`: Amazon Transcribe (Beta) - - `google`: Google (Beta) - - `sarvam`: Sarvam (Beta) - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - The configuration parameters for the ASR vendor. See [ASR Overview](https://docs.agora.io/en/conversational-ai/models/asr/overview) for details. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py deleted file mode 100644 index 973d62c..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesAsrVendor = typing.Union[ - typing.Literal[ - "ares", "microsoft", "deepgram", "openai", "google", "amazon", "assemblyai", "speechmatics", "sarvam" - ], - typing.Any, -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py deleted file mode 100644 index 9ab0f62..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ /dev/null @@ -1,115 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from .start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem -from .start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle - - -class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): - """ - Large language model (LLM) configuration. - """ - - url: str = pydantic.Field() - """ - The LLM callback address. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The LLM verification API key. The default value is an empty string. Ensure that you enable the API key in a production environment. - """ - - system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - A set of predefined information used as input to the LLM, including prompt words and examples. - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional LLM configuration parameters, such as the `model` used, and the maximum token limit. For details about each supported LLM, refer to [Supported LLMs](https://docs.agora.io/en/conversational-ai/models/llm/overview#supported-llms). - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - The number of conversation history messages cached in the custom LLM. History includes user and agent dialog messages, tool call information, and timestamps. Agent and user messages are recorded separately. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM input modalities: - - `["text"]`: Text only - - `["text", "image"]`: Text plus image. 
Recommended configuration, requires the selected LLM to support visual input - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM output modalities: - - `["text"]`: The output text is converted to speech by the TTS module and then published to the RTC channel. - - `["audio"]`: Voice only. Voice is published directly to the RTC channel. - - `["text", "audio"]`: Text plus voice. Write your own logic to process the output of LLM as needed. - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting. If provided, the first user in the channel is automatically greeted with the message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Prompt for agent activation failure. If provided, it is returned through TTS when the custom LLM call fails. - """ - - vendor: typing.Optional[str] = pydantic.Field(default=None) - """ - LLM provider, supports the following settings: - - `custom`: Custom LLM. When you set this option, the agent includes the following fields, in addition to `role` and `content` when making requests to the custom LLM: - - `turn_id`: A unique identifier for each conversation turn. It starts from `0` and increments with each turn. One user-agent interaction corresponds to one `turn_id`. - - `timestamp`: The request timestamp, in milliseconds. - - `azure`: Use this value for Azure OpenAI - """ - - style: typing.Optional[StartAgentsRequestPropertiesLlmStyle] = pydantic.Field(default=None) - """ - The request style for chat completion: - - `openai`: For OpenAI and OpenAI-compatible APIs - - `gemini`: For Google Gemini and Google Vertex API format - - `anthropic`: For Anthropic Claude API format - - `dify`: For Dify API format - """ - - greeting_configs: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigs] = pydantic.Field(default=None) - """ - Agent greeting broadcast configuration. 
- """ - - template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Template parameter configuration used to insert variables into the agent's `system_messages`, `greeting_message`, `failure_message`, and `parameters.silence_config.content` text. Uses key-value pairs, where the key is the variable name and the value is the variable's value. To insert defined variables in the prompt text, use the syntax `{{variable_name}}`. The system automatically replaces each variable with the corresponding value defined in `template_variables`. Variable values cannot reference other variables. - """ - - mcp_servers: typing.Optional[typing.List[StartAgentsRequestPropertiesLlmMcpServersItem]] = pydantic.Field( - default=None - ) - """ - MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py deleted file mode 100644 index c0d7046..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ /dev/null @@ -1,43 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs_mode import ( - StartAgentsRequestPropertiesLlmGreetingConfigsMode, -) - - -class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - """ - Agent greeting broadcast configuration. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigsMode] = pydantic.Field(default=None) - """ - Determines when the agent sends greeting messages to users joining the channel. - - `single_every`: Broadcasts a greeting every time a user joins the channel. - - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. - """ - - delay_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The delay in milliseconds before the agent plays the greeting message after a user joins the channel. - """ - - interruptable: typing.Optional[bool] = pydantic.Field(default=None) - """ - - `true`: Follows the global `interruption` configuration. - - `false`: Uninterruptible. The greeting plays in its entirety. 
If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py deleted file mode 100644 index 44e4a55..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesLlmGreetingConfigsMode = typing.Union[ - typing.Literal["single_every", "single_first"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py deleted file mode 100644 index 0474072..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesLlmMcpServersItem(UncheckedBaseModel): - name: str = pydantic.Field() - """ - A unique identifier for the MCP server. Maximum 48 characters. Accepts only English letters and numbers. - """ - - endpoint: str = pydantic.Field() - """ - The endpoint address of the MCP server. The agent uses this to communicate with the MCP server. 
- """ - - transport: typing.Optional[typing.Literal["streamable_http"]] = pydantic.Field(default=None) - """ - Transport protocol type. - - `streamable_http`: Streaming HTTP protocol - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - HTTP header information to include when requesting the MCP server, such as authentication information. - """ - - allowed_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - A list of tools that the agent is allowed to invoke. The agent can only use tools on this list. - - Empty or omitted: All tools are enabled. - - Empty array `[]`: No tools are enabled. - - `["*"]`: All tools are enabled. - - Specific tools `["aa", "bb"]`: Only listed tools are enabled. - - Mix with wildcard `["aa", "*"]`: All tools are enabled (wildcard takes precedence). - """ - - timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The MCP server request timeout in milliseconds. After timeout, the agent stops waiting for the MCP server's response and continues executing subsequent logic. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py deleted file mode 100644 index eaa9a0d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesLlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py deleted file mode 100644 index 0993ebc..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor - - -class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. - """ - - enable: typing.Optional[bool] = pydantic.Field(default=None) - """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. - """ - - url: typing.Optional[str] = pydantic.Field(default=None) - """ - The MLLM WebSocket URL for real-time communication. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The API key used for MLLM authentication. - """ - - messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - Array of conversation items used for short-term memory management. Uses the same structure as `item.content` from the OpenAI Realtime API. 
- """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM input modalities: - - `["audio"]`: Audio only - - `["audio", "text"]`: Audio plus text - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM output modalities: - - `["text", "audio"]`: Text plus audio - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent failure message. If provided, the agent speaks this message when an MLLM request fails. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) - """ - MLLM provider. Currently supports: - - `openai`: OpenAI Realtime API - - `gemini`: Google Gemini Live - - `vertexai`: Google Gemini Live (Vertex AI) - - `xai`: xAI Grok Realtime API - """ - - turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py deleted file mode 100644 index 032979d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, -) - - -class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) - """ - Turn detection mode for MLLM: - - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - - `semantic_vad`: Semantic-based detection. 
Supported by OpenAI Realtime API only. - """ - - agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. - """ - - server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( - pydantic.Field(default=None) - ) - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py deleted file mode 100644 index ec30215..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py +++ /dev/null @@ -1,42 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. 
- """ - - interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Minimum duration of speech in milliseconds required to trigger an interruption. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. A higher value reduces false positives. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py deleted file mode 100644 index 0d004e8..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ - typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py deleted file mode 100644 index 1e310f0..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( - pydantic.Field(default=None) - ) - """ - Controls how eagerly the model ends its turn. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py deleted file mode 100644 index 8b67b1d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ - typing.Literal["auto", "low", "medium", "high"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py deleted file mode 100644 index c74d8d7..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. 
- """ - - idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. - """ - - start_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for start of speech detection. Applicable to Gemini Live only. - """ - - end_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for end of speech detection. Applicable to Gemini Live only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py deleted file mode 100644 index 0233696..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c44e886..acd9073 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.0.0", + "User-Agent": "agora-agents/v2.1.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.0.0", + "X-Fern-SDK-Version": "v2.1.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/amazon_asr.py b/src/agora_agent/types/amazon_asr.py new file mode 100644 index 0000000..4054518 --- /dev/null +++ b/src/agora_agent/types/amazon_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_asr_params import AmazonAsrParams +from .asr_language import AsrLanguage + + +class AmazonAsr(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_asr_params.py b/src/agora_agent/types/amazon_asr_params.py new file mode 100644 index 0000000..1d30688 --- /dev/null +++ b/src/agora_agent/types/amazon_asr_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AmazonAsrParams(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration parameters. + """ + + region: str = pydantic.Field() + """ + AWS region + """ + + access_key_id: str = pydantic.Field() + """ + AWS access key ID + """ + + secret_access_key: str = pydantic.Field() + """ + AWS secret access key + """ + + language_code: str = pydantic.Field() + """ + Language code for speech recognition + """ + + media_sample_rate_hz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hertz for the audio input + """ + + media_encoding: typing.Optional[str] = pydantic.Field(default=None) + """ + Encoding format of the audio input + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_tts_params.py b/src/agora_agent/types/amazon_tts_params.py index baaa6fa..7995911 100644 --- a/src/agora_agent/types/amazon_tts_params.py +++ b/src/agora_agent/types/amazon_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import 
IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_tts_params_engine import AmazonTtsParamsEngine class AmazonTtsParams(UncheckedBaseModel): @@ -12,26 +13,31 @@ class AmazonTtsParams(UncheckedBaseModel): Amazon Polly TTS configuration parameters. """ - access_key: str = pydantic.Field() + aws_access_key_id: str = pydantic.Field() """ - AWS access key + AWS access key ID """ - secret_key: str = pydantic.Field() + aws_secret_access_key: str = pydantic.Field() """ AWS secret key """ - region: str = pydantic.Field() + region_name: str = pydantic.Field() """ AWS region (e.g., "us-east-1") """ - voice_id: str = pydantic.Field() + voice: str = pydantic.Field() """ Amazon Polly voice ID """ + engine: typing.Optional[AmazonTtsParamsEngine] = pydantic.Field(default=None) + """ + Amazon Polly engine type + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/amazon_tts_params_engine.py b/src/agora_agent/types/amazon_tts_params_engine.py new file mode 100644 index 0000000..d9e3cfe --- /dev/null +++ b/src/agora_agent/types/amazon_tts_params_engine.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AmazonTtsParamsEngine = typing.Union[typing.Literal["standard", "neural", "long-form", "generative"], typing.Any] diff --git a/src/agora_agent/types/ares_asr.py b/src/agora_agent/types/ares_asr.py new file mode 100644 index 0000000..cf42216 --- /dev/null +++ b/src/agora_agent/types/ares_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage + + +class AresAsr(UncheckedBaseModel): + """ + Adaptive Recognition Engine for Speech ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/ares_asr_params.py b/src/agora_agent/types/ares_asr_params.py new file mode 100644 index 0000000..afa1d76 --- /dev/null +++ b/src/agora_agent/types/ares_asr_params.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AresAsrParams = typing.Dict[str, typing.Any] diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py new file mode 100644 index 0000000..f08086f --- /dev/null +++ b/src/agora_agent/types/asr.py @@ -0,0 +1,172 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata +from .amazon_asr_params import AmazonAsrParams +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams +from .deepgram_asr_params import DeepgramAsrParams +from .google_asr_params import GoogleAsrParams +from .microsoft_asr_params import MicrosoftAsrParams +from .open_ai_asr_params import OpenAiAsrParams +from .sarvam_asr_params import SarvamAsrParams +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class Asr_Ares(UncheckedBaseModel): + vendor: typing.Literal["ares"] = "ares" + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Microsoft(UncheckedBaseModel): + vendor: typing.Literal["microsoft"] = "microsoft" + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + 
smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Openai(UncheckedBaseModel): + vendor: typing.Literal["openai"] = "openai" + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Google(UncheckedBaseModel): + vendor: typing.Literal["google"] = "google" + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Amazon(UncheckedBaseModel): + vendor: typing.Literal["amazon"] = "amazon" + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Assemblyai(UncheckedBaseModel): + vendor: typing.Literal["assemblyai"] = "assemblyai" + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Speechmatics(UncheckedBaseModel): + vendor: typing.Literal["speechmatics"] = "speechmatics" + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = 
pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Sarvam(UncheckedBaseModel): + vendor: typing.Literal["sarvam"] = "sarvam" + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +Asr = typing_extensions.Annotated[ + typing.Union[ + Asr_Ares, + Asr_Microsoft, + Asr_Deepgram, + Asr_Openai, + Asr_Google, + Asr_Amazon, + Asr_Assemblyai, + Asr_Speechmatics, + Asr_Sarvam, + ], + UnionMetadata(discriminant="vendor"), +] diff --git a/src/agora_agent/types/asr_language.py b/src/agora_agent/types/asr_language.py new file mode 100644 index 0000000..4ff3c88 --- /dev/null +++ b/src/agora_agent/types/asr_language.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AsrLanguage = typing.Union[ + typing.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ], + typing.Any, +] diff --git a/src/agora_agent/types/assembly_ai_asr.py b/src/agora_agent/types/assembly_ai_asr.py new file mode 100644 index 0000000..ea2ebf4 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams + + +class AssemblyAiAsr(UncheckedBaseModel): + """ + AssemblyAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/assembly_ai_asr_params.py b/src/agora_agent/types/assembly_ai_asr_params.py new file mode 100644 index 0000000..f3a5818 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AssemblyAiAsrParams(UncheckedBaseModel): + """ + AssemblyAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + AssemblyAI API key + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for AssemblyAI's streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_output_format.py b/src/agora_agent/types/cartesia_tts_output_format.py new file mode 100644 index 0000000..ab7e122 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_output_format.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsOutputFormat(UncheckedBaseModel): + """ + Cartesia audio output format configuration. 
+ """ + + container: typing.Optional[str] = pydantic.Field(default=None) + """ + Audio container format for the output stream + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index 2aaf069..1478570 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_output_format import CartesiaTtsOutputFormat from .cartesia_tts_voice import CartesiaTtsVoice @@ -18,15 +19,21 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia API key """ - voice: CartesiaTtsVoice - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: str = pydantic.Field() """ - Model ID (optional) + Model ID (for example, sonic-2) """ - sample_rate: typing.Optional[int] = pydantic.Field(default=None) + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Cartesia streaming API + """ + + voice: CartesiaTtsVoice + output_format: typing.Optional[CartesiaTtsOutputFormat] = None + language: typing.Optional[str] = pydantic.Field(default=None) """ - Audio sampling rate in Hz + Target language for speech synthesis """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py new file mode 100644 index 0000000..1c79c7b --- /dev/null +++ b/src/agora_agent/types/deepgram_asr.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .deepgram_asr_params import DeepgramAsrParams + + +class DeepgramAsr(UncheckedBaseModel): + """ + Deepgram ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands for preset-backed Deepgram usage. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py new file mode 100644 index 0000000..259958e --- /dev/null +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramAsrParams(UncheckedBaseModel): + """ + Deepgram ASR configuration parameters. 
+ """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for Deepgram's streaming API + """ + + key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Speech recognition model + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for speech recognition + """ + + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/eleven_labs_tts_params.py b/src/agora_agent/types/eleven_labs_tts_params.py index c6127fd..b61e3de 100644 --- a/src/agora_agent/types/eleven_labs_tts_params.py +++ b/src/agora_agent/types/eleven_labs_tts_params.py @@ -37,6 +37,31 @@ class ElevenLabsTtsParams(UncheckedBaseModel): Audio sample rate in Hz (16kHz for Akool, 24kHz for HeyGen) """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech speed multiplier. + """ + + stability: typing.Optional[float] = pydantic.Field(default=None) + """ + Voice stability. Higher values produce more consistent speech. + """ + + similarity_boost: typing.Optional[float] = pydantic.Field(default=None) + """ + Similarity boost for the selected voice. + """ + + style: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking style and expressiveness control. + """ + + use_speaker_boost: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to improve voice quality and similarity. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/fish_audio_tts_params.py b/src/agora_agent/types/fish_audio_tts_params.py index 0ad77aa..60bcff4 100644 --- a/src/agora_agent/types/fish_audio_tts_params.py +++ b/src/agora_agent/types/fish_audio_tts_params.py @@ -12,7 +12,7 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Fish Audio API key """ @@ -22,6 +22,11 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio reference ID """ + backend: typing.Optional[str] = pydantic.Field(default=None) + """ + Backend model version to use + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/google_asr.py b/src/agora_agent/types/google_asr.py new file mode 100644 index 0000000..8473a04 --- /dev/null +++ b/src/agora_agent/types/google_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .google_asr_params import GoogleAsrParams + + +class GoogleAsr(UncheckedBaseModel): + """ + Google ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_asr_params.py b/src/agora_agent/types/google_asr_params.py new file mode 100644 index 0000000..9d17db6 --- /dev/null +++ b/src/agora_agent/types/google_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleAsrParams(UncheckedBaseModel): + """ + Google ASR configuration parameters. + """ + + project_id: str = pydantic.Field() + """ + Google Cloud project ID + """ + + location: str = pydantic.Field() + """ + Google Cloud region for the speech service + """ + + adc_credentials_string: str = pydantic.Field() + """ + Google Cloud service account credentials JSON string + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Recognition model to use + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_audio_config.py b/src/agora_agent/types/google_tts_audio_config.py new file mode 100644 index 0000000..9c2a405 --- /dev/null +++ b/src/agora_agent/types/google_tts_audio_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsAudioConfig(UncheckedBaseModel): + """ + Google audio output configuration. + """ + + speaking_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_params.py b/src/agora_agent/types/google_tts_params.py index dc00322..4a9ee38 100644 --- a/src/agora_agent/types/google_tts_params.py +++ b/src/agora_agent/types/google_tts_params.py @@ -3,8 +3,12 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel +from .google_tts_audio_config import GoogleTtsAudioConfig +from .google_tts_voice_selection_params import GoogleTtsVoiceSelectionParams class GoogleTtsParams(UncheckedBaseModel): @@ -12,25 +16,17 @@ class GoogleTtsParams(UncheckedBaseModel): Google TTS configuration parameters. 
""" - key: str = pydantic.Field() + credentials: str = pydantic.Field() """ - Google Cloud API key + Google Cloud service account credentials JSON string """ - voice_name: str = pydantic.Field() - """ - Google voice name - """ - - language_code: typing.Optional[str] = pydantic.Field(default=None) - """ - Language code (e.g., "en-US") - """ - - sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) - """ - Sample rate in Hz (default depends on selected voice) - """ + voice_selection_params: typing_extensions.Annotated[ + GoogleTtsVoiceSelectionParams, FieldMetadata(alias="VoiceSelectionParams") + ] + audio_config: typing_extensions.Annotated[ + typing.Optional[GoogleTtsAudioConfig], FieldMetadata(alias="AudioConfig") + ] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/google_tts_voice_selection_params.py b/src/agora_agent/types/google_tts_voice_selection_params.py new file mode 100644 index 0000000..ee75953 --- /dev/null +++ b/src/agora_agent/types/google_tts_voice_selection_params.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsVoiceSelectionParams(UncheckedBaseModel): + """ + Google voice selection parameters. 
+ """ + + name: str = pydantic.Field() + """ + Google voice name + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/hume_ai_tts_params.py b/src/agora_agent/types/hume_ai_tts_params.py index 08cb12b..1480fd4 100644 --- a/src/agora_agent/types/hume_ai_tts_params.py +++ b/src/agora_agent/types/hume_ai_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .hume_ai_tts_params_provider import HumeAiTtsParamsProvider class HumeAiTtsParams(UncheckedBaseModel): @@ -17,9 +18,34 @@ class HumeAiTtsParams(UncheckedBaseModel): Hume AI API key """ + voice_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Hume AI voice ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Base URL for the Hume AI API + """ + + provider: typing.Optional[HumeAiTtsParamsProvider] = pydantic.Field(default=None) + """ + Voice provider type + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Playback speed of the generated speech + """ + + trailing_silence: typing.Optional[float] = pydantic.Field(default=None) + """ + Duration of silence in seconds to add at the end of each utterance + """ + config_id: typing.Optional[str] = pydantic.Field(default=None) """ - Hume AI configuration ID + Hume AI configuration ID. Deprecated; use voice_id for the documented TTS shape. 
""" if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/hume_ai_tts_params_provider.py b/src/agora_agent/types/hume_ai_tts_params_provider.py new file mode 100644 index 0000000..cf07e73 --- /dev/null +++ b/src/agora_agent/types/hume_ai_tts_params_provider.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +HumeAiTtsParamsProvider = typing.Union[typing.Literal["HUME_AI", "CUSTOM_VOICE"], typing.Any] diff --git a/src/agora_agent/types/llm.py b/src/agora_agent/types/llm.py new file mode 100644 index 0000000..2b0283d --- /dev/null +++ b/src/agora_agent/types/llm.py @@ -0,0 +1,120 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .llm_params import LlmParams +from .llm_style import LlmStyle + + +class Llm(UncheckedBaseModel): + """ + Large language model (LLM) configuration. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM callback address. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM verification API key. + """ + + access_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS access key ID. Used by Amazon Bedrock when api_key is not provided. + """ + + secret_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS secret access key. Used by Amazon Bedrock when api_key is not provided. + """ + + region: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS region. Used by Amazon Bedrock. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Top-level model identifier. Used by Amazon Bedrock. + """ + + system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + A set of predefined information used as input to the LLM. 
+ """ + + params: typing.Optional[LlmParams] = None + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + The number of conversation history messages cached in the custom LLM. + """ + + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Prompt for agent activation failure. + """ + + vendor: typing.Optional[str] = pydantic.Field(default=None) + """ + LLM provider identifier. + """ + + style: typing.Optional[LlmStyle] = pydantic.Field(default=None) + """ + The request style for chat completion. + """ + + ignore_empty: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to handle empty Gemini responses. + """ + + greeting_configs: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Agent greeting broadcast configuration. + """ + + template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Template parameter configuration. + """ + + mcp_servers: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + MCP server configuration. + """ + + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_params.py b/src/agora_agent/types/llm_params.py new file mode 100644 index 0000000..f6df01f --- /dev/null +++ b/src/agora_agent/types/llm_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class LlmParams(UncheckedBaseModel): + """ + Additional LLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM model identifier. + """ + + max_tokens: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum tokens in the response. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_style.py b/src/agora_agent/types/llm_style.py new file mode 100644 index 0000000..8319ca1 --- /dev/null +++ b/src/agora_agent/types/llm_style.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify", "bedrock"], typing.Any] diff --git a/src/agora_agent/types/microsoft_asr.py b/src/agora_agent/types/microsoft_asr.py new file mode 100644 index 0000000..f602e09 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .microsoft_asr_params import MicrosoftAsrParams + + +class MicrosoftAsr(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_asr_params.py b/src/agora_agent/types/microsoft_asr_params.py new file mode 100644 index 0000000..bea79e4 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr_params.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MicrosoftAsrParams(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration parameters. 
+ """ + + key: str = pydantic.Field() + """ + Microsoft Azure API key + """ + + region: str = pydantic.Field() + """ + Azure region + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + phrase_list: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Words or phrases to improve recognition accuracy + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_tts_params.py b/src/agora_agent/types/microsoft_tts_params.py index 3c9e80c..12f441e 100644 --- a/src/agora_agent/types/microsoft_tts_params.py +++ b/src/agora_agent/types/microsoft_tts_params.py @@ -32,6 +32,16 @@ class MicrosoftTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. Values between 0.5 and 2.0. + """ + + volume: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio volume. Values between 0.0 and 100.0. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/mllm.py b/src/agora_agent/types/mllm.py new file mode 100644 index 0000000..3bcdb95 --- /dev/null +++ b/src/agora_agent/types/mllm.py @@ -0,0 +1,88 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_params import MllmParams +from .mllm_turn_detection import MllmTurnDetection +from .mllm_vendor import MllmVendor + + +class Mllm(UncheckedBaseModel): + """ + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM WebSocket URL for real-time communication. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The API key used for MLLM authentication. + """ + + adc_credentials_string: typing.Optional[str] = pydantic.Field(default=None) + """ + Base64-encoded Google Cloud Application Default Credentials. Used by Vertex AI. + """ + + project_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud project ID. Used by Vertex AI. + """ + + location: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud location or region. Used by Vertex AI. + """ + + messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + Array of conversation items used for short-term memory management. + """ + + params: typing.Optional[MllmParams] = None + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting message. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent failure message. 
+ """ + + vendor: typing.Optional[MllmVendor] = pydantic.Field(default=None) + """ + MLLM provider. + """ + + turn_detection: typing.Optional[MllmTurnDetection] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_http_options.py b/src/agora_agent/types/mllm_http_options.py new file mode 100644 index 0000000..19baebb --- /dev/null +++ b/src/agora_agent/types/mllm_http_options.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmHttpOptions(UncheckedBaseModel): + """ + HTTP request options for the MLLM provider. + """ + + api_version: typing.Optional[str] = pydantic.Field(default=None) + """ + API version to use. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_input_audio_transcription.py b/src/agora_agent/types/mllm_input_audio_transcription.py new file mode 100644 index 0000000..6bb3d9d --- /dev/null +++ b/src/agora_agent/types/mllm_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmInputAudioTranscription(UncheckedBaseModel): + """ + Configuration for audio input transcription. 
+ """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language of the input audio. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Model to use for transcription. + """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Text to guide the transcription model. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_params.py b/src/agora_agent/types/mllm_params.py new file mode 100644 index 0000000..5437b69 --- /dev/null +++ b/src/agora_agent/types/mllm_params.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_http_options import MllmHttpOptions +from .mllm_input_audio_transcription import MllmInputAudioTranscription + + +class MllmParams(UncheckedBaseModel): + """ + Additional MLLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM model identifier. + """ + + voice: typing.Optional[str] = pydantic.Field(default=None) + """ + Voice identifier for audio output. + """ + + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + System instructions that define the agent behavior or tone. + """ + + input_audio_transcription: typing.Optional[MllmInputAudioTranscription] = None + affective_dialog: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable Gemini affective dialog. + """ + + proactive_audio: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether Gemini may choose not to respond when no reply is needed. 
+ """ + + transcribe_agent: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the agent speech in real time. + """ + + transcribe_user: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the user speech in real time. + """ + + http_options: typing.Optional[MllmHttpOptions] = None + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for xAI Grok speech recognition and synthesis. + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sample rate in Hz. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection.py b/src/agora_agent/types/mllm_turn_detection.py new file mode 100644 index 0000000..2cd3503 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection.py @@ -0,0 +1,35 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_agora_vad_config import MllmTurnDetectionAgoraVadConfig +from .mllm_turn_detection_mode import MllmTurnDetectionMode +from .mllm_turn_detection_semantic_vad_config import MllmTurnDetectionSemanticVadConfig +from .mllm_turn_detection_server_vad_config import MllmTurnDetectionServerVadConfig + + +class MllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. + """ + + mode: typing.Optional[MllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM. 
+ """ + + agora_vad_config: typing.Optional[MllmTurnDetectionAgoraVadConfig] = None + server_vad_config: typing.Optional[MllmTurnDetectionServerVadConfig] = None + semantic_vad_config: typing.Optional[MllmTurnDetectionSemanticVadConfig] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..4168ef3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,23 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + interrupt_duration_ms: typing.Optional[int] = None + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_mode.py b/src/agora_agent/types/mllm_turn_detection_mode.py new file mode 100644 index 0000000..f6cd693 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionMode = typing.Union[typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..aeaf440 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,21 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_semantic_vad_config_eagerness import MllmTurnDetectionSemanticVadConfigEagerness + + +class MllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + eagerness: typing.Optional[MllmTurnDetectionSemanticVadConfigEagerness] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..dbf9b4d --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionSemanticVadConfigEagerness = typing.Union[typing.Literal["auto", "low", "medium", "high"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_server_vad_config.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..b2976b3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class MllmTurnDetectionServerVadConfig(UncheckedBaseModel): + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + idle_timeout_ms: typing.Optional[int] = None + start_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity] = None + end_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py similarity index 61% rename from 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py index e92d3f1..b9b3377 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py similarity index 61% rename from src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py index 25860c1..90ccf51 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/types/mllm_vendor.py b/src/agora_agent/types/mllm_vendor.py new file mode 100644 index 0000000..61c4d1a --- /dev/null +++ 
b/src/agora_agent/types/mllm_vendor.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +MllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/types/murf_tts_params.py b/src/agora_agent/types/murf_tts_params.py index 5107f62..94d68db 100644 --- a/src/agora_agent/types/murf_tts_params.py +++ b/src/agora_agent/types/murf_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,19 +14,44 @@ class MurfTtsParams(UncheckedBaseModel): Murf TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Murf API key """ - voice_id: str = pydantic.Field() + base_url: str = pydantic.Field() """ - Voice ID (e.g., Ariana, Natalie, Ken) + WebSocket endpoint for streaming TTS output """ - style: typing.Optional[str] = pydantic.Field(default=None) + voice_id: typing_extensions.Annotated[str, FieldMetadata(alias="voiceId")] = pydantic.Field() """ - Voice style (e.g., Angry, Sad, Conversational, Newscast) + Voice ID (e.g., Matthew) + """ + + locale: typing.Optional[str] = pydantic.Field(default=None) + """ + Locale for the selected voice + """ + + rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech rate adjustment + """ + + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + TTS model to use + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/open_ai_asr.py b/src/agora_agent/types/open_ai_asr.py new file mode 100644 index 0000000..eec2aab --- /dev/null +++ 
b/src/agora_agent/types/open_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .open_ai_asr_params import OpenAiAsrParams + + +class OpenAiAsr(UncheckedBaseModel): + """ + OpenAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_asr_params.py b/src/agora_agent/types/open_ai_asr_params.py new file mode 100644 index 0000000..a5fadc8 --- /dev/null +++ b/src/agora_agent/types/open_ai_asr_params.py @@ -0,0 +1,30 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .open_ai_input_audio_transcription import OpenAiInputAudioTranscription + + +class OpenAiAsrParams(UncheckedBaseModel): + """ + OpenAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + OpenAI API key + """ + + input_audio_transcription: OpenAiInputAudioTranscription + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_input_audio_transcription.py b/src/agora_agent/types/open_ai_input_audio_transcription.py new file mode 100644 index 0000000..9db45b1 --- /dev/null +++ b/src/agora_agent/types/open_ai_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class OpenAiInputAudioTranscription(UncheckedBaseModel): + """ + OpenAI audio transcription configuration. + """ + + model: str = pydantic.Field() + """ + OpenAI ASR model to use for transcription + """ + + prompt: str = pydantic.Field() + """ + Prompt that guides the transcription process + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 3839646..c8f6e51 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,12 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. Optional for Agora-managed OpenAI TTS usage. + OpenAI API key. Optional for preset-backed OpenAI TTS usage. 
+ """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Endpoint URL for the OpenAI TTS service. """ voice: str = pydantic.Field() @@ -27,6 +32,16 @@ class OpenAiTtsParams(UncheckedBaseModel): Model name (e.g., "tts-1", "tts-1-hd") """ + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + Custom instructions for voice style, accent, pace, and tone. + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/rime_tts_params.py b/src/agora_agent/types/rime_tts_params.py index 6d18375..aae3ef2 100644 --- a/src/agora_agent/types/rime_tts_params.py +++ b/src/agora_agent/types/rime_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,7 +14,7 @@ class RimeTtsParams(UncheckedBaseModel): Rime TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Rime API key """ @@ -22,9 +24,16 @@ class RimeTtsParams(UncheckedBaseModel): Rime speaker ID """ - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="modelId")] = pydantic.Field( + default=None + ) """ - Model ID (optional) + Rime TTS model ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Rime streaming API """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/sarvam_asr.py b/src/agora_agent/types/sarvam_asr.py new file mode 100644 index 0000000..ec95847 --- /dev/null +++ b/src/agora_agent/types/sarvam_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .sarvam_asr_params import SarvamAsrParams + + +class SarvamAsr(UncheckedBaseModel): + """ + Sarvam ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_asr_params.py b/src/agora_agent/types/sarvam_asr_params.py new file mode 100644 index 0000000..f29769d --- /dev/null +++ b/src/agora_agent/types/sarvam_asr_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SarvamAsrParams(UncheckedBaseModel): + """ + Sarvam ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Sarvam API key + """ + + language: str = pydantic.Field() + """ + Language code for transcription. Set to unknown for automatic language detection. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_tts_params.py b/src/agora_agent/types/sarvam_tts_params.py index 93457a4..855299f 100644 --- a/src/agora_agent/types/sarvam_tts_params.py +++ b/src/agora_agent/types/sarvam_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .sarvam_tts_params_target_language_code import SarvamTtsParamsTargetLanguageCode class SarvamTtsParams(UncheckedBaseModel): @@ -12,7 +13,7 @@ class SarvamTtsParams(UncheckedBaseModel): Sarvam TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_subscription_key: str = pydantic.Field() """ Sarvam API subscription key """ @@ -22,11 +23,31 @@ class SarvamTtsParams(UncheckedBaseModel): Voice ID (e.g., anushka, abhilash, karun, hitesh, manisha, vidya, arya) """ - target_language_code: str = pydantic.Field() + target_language_code: SarvamTtsParamsTargetLanguageCode = pydantic.Field() """ Target language code (e.g., en-IN) """ + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment for the voice + """ + + pace: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + loudness: typing.Optional[float] = pydantic.Field(default=None) + """ + Volume level of the speech + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/sarvam_tts_params_target_language_code.py b/src/agora_agent/types/sarvam_tts_params_target_language_code.py new file mode 100644 index 0000000..b1722ec --- /dev/null +++ b/src/agora_agent/types/sarvam_tts_params_target_language_code.py @@ -0,0 +1,8 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SarvamTtsParamsTargetLanguageCode = typing.Union[ + typing.Literal["en-IN", "hi-IN", "bn-IN", "ta-IN", "te-IN", "kn-IN", "ml-IN", "mr-IN", "gu-IN", "pa-IN", "or-IN"], + typing.Any, +] diff --git a/src/agora_agent/types/speechmatics_asr.py b/src/agora_agent/types/speechmatics_asr.py new file mode 100644 index 0000000..644db25 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class SpeechmaticsAsr(UncheckedBaseModel): + """ + Speechmatics ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/speechmatics_asr_params.py b/src/agora_agent/types/speechmatics_asr_params.py new file mode 100644 index 0000000..4709d22 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SpeechmaticsAsrParams(UncheckedBaseModel): + """ + Speechmatics ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Speechmatics API key + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Speechmatics streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_agentkit_agent.py b/tests/custom/test_agentkit_agent.py new file mode 100644 index 0000000..9719b04 --- /dev/null +++ b/tests/custom/test_agentkit_agent.py @@ -0,0 +1,298 @@ +from agora_agent.agentkit import ( + Agent, + AvatarConfig, + AvatarVendor, + LlmConfig, + LlmStyle, + MllmConfig, + MllmVendor, + SttConfig, + SttVendor, + TtsConfig, +) +import pytest + +from agora_agent.agentkit.vendors import ( + AkoolAvatar, + ElevenLabsTTS, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, +) + + +def _parameter(config, key): + parameters = config["parameters"] + if isinstance(parameters, dict): + return parameters[key] + return getattr(parameters, key) + + +class _CopyOnlyModel: + def __init__(self, **values): + self.values = values + + def copy(self, update=None): + return _CopyOnlyModel(**{**self.values, **(update or {})}) + + +def test_generated_core_aliases_are_public(): + assert LlmConfig is not None + assert LlmStyle is not None + assert SttConfig is not None + assert SttVendor is not None + assert TtsConfig is not None + assert MllmConfig is not None + assert MllmVendor is not None + assert AvatarConfig is not None + assert AvatarVendor is not None + + +def test_model_copy_helper_supports_pydantic_v1_copy_api(): + copied = Agent._copy_model_update(_CopyOnlyModel(enable_rtm=True), {"data_channel": "rtm"}) # noqa: SLF001 + + assert copied.values == {"enable_rtm": True, "data_channel": "rtm"} + + +def 
test_with_audio_scenario_sets_session_parameter(): + agent = Agent(name="test").with_audio_scenario("chorus") + + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_with_audio_scenario_preserves_existing_parameters(): + agent = Agent(name="test", parameters={"enable_metrics": True}).with_audio_scenario( + "chorus" + ) + + assert _parameter(agent.config, "enable_metrics") is True + assert _parameter(agent.config, "audio_scenario") == "chorus" + + +def test_enable_rtm_defaults_data_channel_to_rtm(): + properties = Agent(name="test", advanced_features={"enable_rtm": True}).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "rtm" + + +def test_enable_rtm_preserves_explicit_data_channel(): + properties = Agent( + name="test", + advanced_features={"enable_rtm": True}, + parameters={"data_channel": "datastream"}, + ).to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + skip_vendor_validation=True, + ) + + assert properties.parameters.data_channel == "datastream" + + +def test_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + + properties = agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="token", + ) + + assert properties.llm.greeting_message == "agent greeting" + assert properties.llm.failure_message == "agent failure" + assert properties.llm.max_history == 2 + + +def test_avatar_sample_rate_validation_works_when_tts_added_after_avatar(): + agent = Agent(name="test").with_avatar( + 
LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2") + ) + + with pytest.raises(ValueError, match="24000"): + agent.with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + + +def test_avatar_sample_rate_validation_uses_wrapper_sample_rate(): + agent = ( + Agent(name="test") + .with_avatar(AkoolAvatar(api_key="avatar-key")) + .with_tts( + ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice", sample_rate=16000) + ) + ) + + assert agent.tts_sample_rate == 16000 + + +def test_with_mllm_removes_deprecated_advanced_features_enable_mllm(): + properties = ( + Agent( + name="test", + advanced_features={"enable_mllm": True, "enable_rtm": True}, + greeting="hello from agent", + failure_message="try again", + max_history=5, + ) + .with_mllm(OpenAIRealtime(api_key="openai-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None + assert properties.mllm.enable is True + assert properties.mllm.greeting_message == "hello from agent" + assert properties.mllm.failure_message == "try again" + mllm_dump = properties.mllm.model_dump(exclude_none=True) + assert "max_history" not in mllm_dump + assert properties.advanced_features is not None + af_dump = properties.advanced_features.model_dump(exclude_none=True) + assert "enable_mllm" not in af_dump + assert af_dump.get("enable_rtm") is True + + +def test_to_properties_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_mllm_with_avatar_fires_before_token_generation(): + """The guard 
must fire before the token-generation step so callers get a + clear, actionable error even when app_id/app_certificate are empty. + """ + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + app_id="", + app_certificate="", + ) + + +def test_to_properties_rejects_mllm_with_default_enabled_avatar(): + """Avatar with no `enable` field should be treated as enabled.""" + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + agent._avatar = { # noqa: SLF001 + "vendor": "liveavatar", + "params": { + "api_key": "avatar-key", + "quality": "high", + "agora_uid": "200", + "agora_token": "avatar-token", + }, + } + + with pytest.raises(ValueError, match="cascading"): + agent.to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + + +def test_to_properties_allows_mllm_with_disabled_avatar_and_no_tts(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is not None and properties.avatar.enable is False + + +def test_to_properties_mllm_without_tts_or_llm_succeeds(): + properties = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .to_properties( + channel="room", + agent_uid="1", + remote_uids=["100"], + 
token="rtc-token", + ) + ) + + assert properties.mllm is not None and properties.mllm.enable is True + assert properties.tts is None + assert properties.llm is None + assert properties.asr is None + assert properties.avatar is None diff --git a/tests/custom/test_agentkit_session.py b/tests/custom/test_agentkit_session.py new file mode 100644 index 0000000..198fcd0 --- /dev/null +++ b/tests/custom/test_agentkit_session.py @@ -0,0 +1,383 @@ +from types import SimpleNamespace + +import pytest + +from agora_agent.agentkit import Agent, AgentSession +from agora_agent.agentkit.vendors import ( + ElevenLabsTTS, + GenericAvatar, + LiveAvatarAvatar, + OpenAI, + OpenAIRealtime, + RimeTTS, +) +from agora_agent.agents.types.get_turns_agents_response import GetTurnsAgentsResponse + + +APP_ID = "0" * 32 +APP_CERTIFICATE = "1" * 32 + + +class _Agents: + def __init__(self): + self.calls = [] + self.start_calls = [] + + def start(self, app_id, name, properties, preset=None, pipeline_id=None, request_options=None): + self.start_calls.append((app_id, name, properties, preset, pipeline_id, request_options)) + return SimpleNamespace(agent_id="agent-1") + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls.append((app_id, agent_id, page_index, page_size, request_options)) + is_last_page = page_index != 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={ + "page_index": page_index or 1, + "total_pages": 2, + "is_last_page": is_last_page, + }, + turns=[{"turn_id": float(page_index or 1)}], + ) + + +class _Client: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _Agents() + self.agent_management = object() + + +def _session(agent, warn=None): + return AgentSession( + client=_Client(), + agent=agent, + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + 
remote_uids=["100"], + warn=warn, + ) + + +def test_generic_avatar_enrichment_adds_session_context_and_token(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + assert params["agora_token"] != properties["token"] + + +def test_generic_avatar_empty_session_fields_are_filled(): + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + agora_appid="", + agora_channel="", + agora_token="", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + params = properties["avatar"]["params"] + assert params["agora_appid"] == APP_ID + assert params["agora_channel"] == "room" + assert params["agora_token"] + + +def test_avatar_uid_matching_agent_uid_warns(): + warnings = [] + agent = Agent(name="test").with_avatar( + GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="1", + ) + ) + session = _session(agent, warn=warnings.append) + + session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert any("matches agent_rtc_uid" in warning for warning in warnings) + + +def test_session_start_properties_agent_level_llm_fields_override_vendor_defaults(): + agent = ( + Agent(name="test") + .with_llm( + OpenAI( + api_key="llm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", 
+ max_history=1, + ) + ) + .with_tts(ElevenLabsTTS(key="tts-key", model_id="model", voice_id="voice")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["llm"]["greeting_message"] == "agent greeting" + assert properties["llm"]["failure_message"] == "agent failure" + assert properties["llm"]["max_history"] == 2 + + +def test_session_start_properties_applies_mllm_agent_level_defaults(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "agent greeting" + assert properties["mllm"]["failure_message"] == "agent failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_properties_preserves_mllm_vendor_defaults(): + agent = ( + Agent(name="test") + .with_mllm( + OpenAIRealtime( + api_key="mllm-key", + greeting_message="vendor greeting", + failure_message="vendor failure", + ) + ) + .with_greeting("agent greeting") + .with_failure_message("agent failure") + .with_max_history(2) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["mllm"]["greeting_message"] == "vendor greeting" + assert properties["mllm"]["failure_message"] == "vendor failure" + assert "max_history" not in properties["mllm"] + + +def test_session_start_allows_mllm_without_tts(): + agent = Agent(name="test").with_mllm(OpenAIRealtime(api_key="mllm-key")) + session = _session(agent) + + 
assert session.start() == "agent-1" + + +def test_session_start_rejects_mllm_with_enabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + ) + ) + ) + session = _session(agent) + + with pytest.raises(ValueError, match="cascading"): + session.start() + assert session._client.agents.start_calls == [] # noqa: SLF001 + + +def test_session_start_allows_mllm_with_disabled_avatar(): + agent = ( + Agent(name="test") + .with_mllm(OpenAIRealtime(api_key="mllm-key")) + .with_avatar( + LiveAvatarAvatar( + api_key="avatar-key", + quality="medium", + agora_uid="2", + agora_token="avatar-token", + enable=False, + ) + ) + ) + session = _session(agent) + + assert session.start() == "agent-1" + + +def test_avatar_sample_rate_validation_uses_serialized_vendor_keys(): + warnings = [] + agent = ( + Agent(name="test") + .with_avatar(LiveAvatarAvatar(api_key="avatar-key", quality="medium", agora_uid="2")) + .with_tts(RimeTTS(key="tts-key", speaker="speaker", sampling_rate=24000)) + ) + session = _session(agent, warn=warnings.append) + + session._validate_avatar_config() # noqa: SLF001 + + assert warnings == [] + + +def test_avatar_user_token_is_not_overwritten(): + agent = Agent(name="test").with_avatar( + LiveAvatarAvatar( + api_key="live-key", + quality="medium", + agora_uid="2", + agora_token="user-token", + ) + ) + session = _session(agent) + + properties = session._build_start_properties({ # noqa: SLF001 + "app_id": APP_ID, + "app_certificate": APP_CERTIFICATE, + }) + + assert properties["avatar"]["params"]["agora_token"] == "user-token" + + +def test_get_turns_forwards_pagination_args(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + session.get_turns(page_index=3, page_size=25) + + assert session._client.agents.calls[-1][:4] == (APP_ID, "agent-id", 3, 25) # noqa: 
SLF001 + + +def test_get_all_turns_aggregates_pages(): + session = _session(Agent(name="test")) + session._agent_id = "agent-id" # noqa: SLF001 + + response = session.get_all_turns(page_size=1) + + assert [turn.turn_id for turn in response.turns] == [1.0, 2.0] + assert response.pagination.page_index == 2 + + +def test_get_all_turns_raises_when_pagination_does_not_advance(): + class _StuckAgents: + def __init__(self): + self.calls = 0 + + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + self.calls += 1 + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=2, + pagination={"page_index": 1, "is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _StuckClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _StuckAgents() + self.agent_management = object() + + session = AgentSession( + client=_StuckClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="did not advance"): + session.get_all_turns(page_size=1) + + +def test_get_all_turns_raises_when_pagination_metadata_missing(): + class _NoMetaAgents: + def get_turns(self, app_id, agent_id, page_index=None, page_size=None, request_options=None): + return GetTurnsAgentsResponse( + agent_id=agent_id, + channel="room", + total_turn_count=1, + pagination={"is_last_page": False}, + turns=[{"turn_id": 1.0}], + ) + + class _NoMetaClient: + auth_mode = "basic" + app_id = APP_ID + app_certificate = APP_CERTIFICATE + + def __init__(self): + self.agents = _NoMetaAgents() + self.agent_management = object() + + session = AgentSession( + client=_NoMetaClient(), + agent=Agent(name="test"), + app_id=APP_ID, + app_certificate=APP_CERTIFICATE, + name="test", + channel="room", + 
agent_uid="1", + remote_uids=["100"], + ) + session._agent_id = "agent-id" # noqa: SLF001 + + with pytest.raises(RuntimeError, match="cannot continue"): + session.get_all_turns(page_size=1) diff --git a/tests/custom/test_agentkit_vendors.py b/tests/custom/test_agentkit_vendors.py new file mode 100644 index 0000000..8473821 --- /dev/null +++ b/tests/custom/test_agentkit_vendors.py @@ -0,0 +1,122 @@ +import pytest +from pydantic import ValidationError + +from agora_agent.agentkit import LlmGreetingConfigs +import warnings + +from agora_agent.agentkit.vendors import GenericAvatar, OpenAI, OpenAIRealtime, XaiGrok, XaiRealtime + + +def test_xai_grok_serializes_v27_shape_without_style(): + config = XaiGrok( + api_key="xai-key", + voice="eve", + language="en", + sample_rate=24000, + output_modalities=["audio", "text"], + params={"temperature": 0.2}, + ).to_config() + + assert config["vendor"] == "xai" + assert config["url"] == "wss://api.x.ai/v1/realtime" + assert config["api_key"] == "xai-key" + assert config["params"] == { + "temperature": 0.2, + "voice": "eve", + "language": "en", + "sample_rate": 24000, + } + assert config["output_modalities"] == ["audio", "text"] + assert "style" not in config + + +def test_xai_grok_emits_params_even_when_empty(): + assert XaiGrok(api_key="xai-key").to_config()["params"] == {} + + +def test_xai_realtime_deprecated_alias_emits_same_vendor(): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always", DeprecationWarning) + config = XaiRealtime(api_key="xai-key").to_config() + assert len(caught) == 1 + assert issubclass(caught[0].category, DeprecationWarning) + assert config["vendor"] == "xai" + + +def test_mllm_rejects_fields_not_in_core_contract(): + with pytest.raises(ValidationError): + OpenAIRealtime(api_key="openai-key", predefined_tools=["_publish_message"]) + + with pytest.raises(ValidationError): + XaiGrok(api_key="xai-key", max_history=10) + + +def 
test_generic_avatar_omits_session_enriched_fields_when_unset(): + config = GenericAvatar( + api_key="avatar-key", + api_base_url="https://avatar.example.com", + avatar_id="avatar-1", + agora_uid="2", + ).to_config() + + assert config == { + "enable": True, + "vendor": "generic", + "params": { + "api_key": "avatar-key", + "api_base_url": "https://avatar.example.com", + "avatar_id": "avatar-1", + "agora_uid": "2", + }, + } + + +def test_vertex_ai_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import VertexAI + + config = VertexAI( + model="explicit-model", + project_id="explicit-project", + location="explicit-region", + adc_credentials_string="{}", + additional_params={ + "model": "should-be-overridden", + "project_id": "should-be-overridden", + "location": "should-be-overridden", + "adc_credentials_string": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["vendor"] == "vertexai" + assert config["params"]["model"] == "explicit-model" + assert config["params"]["project_id"] == "explicit-project" + assert config["params"]["location"] == "explicit-region" + assert config["params"]["adc_credentials_string"] == "{}" + assert config["params"]["extra_key"] == "kept" + + +def test_gemini_live_explicit_fields_override_additional_params(): + from agora_agent.agentkit.vendors import GeminiLive + + config = GeminiLive( + api_key="key", + model="explicit-model", + additional_params={ + "model": "should-be-overridden", + "extra_key": "kept", + }, + ).to_config() + + assert config["params"]["model"] == "explicit-model" + assert config["params"]["extra_key"] == "kept" + + +def test_llm_greeting_configs_interruptable_serializes(): + config = OpenAI( + api_key="openai-key", + greeting_configs=LlmGreetingConfigs(mode="single_first", interruptable=False), + ).to_config() + + assert config["greeting_configs"]["mode"] == "single_first" + assert config["greeting_configs"]["interruptable"] is False