diff --git a/.fernignore b/.fernignore index b1e6d75..16d4a5a 100644 --- a/.fernignore +++ b/.fernignore @@ -10,6 +10,10 @@ src/agora_agent/agentkit/ # Documentation - managed manually, not generated by Fern docs/ README.md +reference.md + +# Tests - managed manually, not generated by Fern +tests/ # Compatibility shim and CI/release workflows are managed manually compat/ diff --git a/README.md b/README.md index 983932b..c8cbabf 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ pip install agora-agents ## Quick Start Start with the `Agent` builder: create a client with app credentials, choose your ASR, LLM, and TTS providers, then start a session. Omit vendor API keys for supported Agora-managed models, or provide keys when you want BYOK. +Set Agora interaction language with `turn_detection.language`; provider-specific STT language values remain under `asr.params`. ```python import os @@ -29,12 +30,9 @@ from agora_agent import ( Agent, Agora, Area, - DataChannel, DeepgramSTT, - GenericAvatar, MiniMaxTTS, OpenAI, - XaiGrok, expires_in_hours, ) @@ -56,39 +54,7 @@ def start_conversation() -> str: app_certificate=app_certificate, ) - agent = Agent( - name=f"conversation-{int(time.time())}", - instructions=AGENT_PROMPT, - greeting=GREETING, - failure_message="Please wait a moment.", - max_history=50, - turn_detection={ - "config": { - "speech_threshold": 0.5, - "start_of_speech": { - "mode": "vad", - "vad_config": { - "interrupt_duration_ms": 160, - "prefix_padding_ms": 300, - }, - }, - "end_of_speech": { - "mode": "vad", - "vad_config": { - "silence_duration_ms": 480, - }, - }, - }, - }, - advanced_features={ - "enable_rtm": True, - "enable_tools": True, - }, - parameters={ - "data_channel": DataChannel.RTM, - "enable_error_message": True, - }, - ).with_stt( + agent = Agent(name=f"conversation-{int(time.time())}", turn_detection={"language": "en-US"}).with_stt( DeepgramSTT( model="nova-3", language="en", @@ -96,9 +62,10 @@ def start_conversation() -> str: 
).with_llm( OpenAI( model="gpt-4o-mini", + system_messages=[{"role": "system", "content": AGENT_PROMPT}], greeting_message=GREETING, failure_message="Please wait a moment.", - max_history=15, + max_history=50, params={ "max_tokens": 1024, "temperature": 0.7, @@ -129,15 +96,44 @@ def start_conversation() -> str: `Agora` generates the required ConvoAI REST auth and RTC join tokens automatically when you provide `app_id` and `app_certificate`. For supported Agora-managed models, leave vendor API keys unset; provide keys when you want BYOK. +### AI Studio pipeline IDs + +Use `pipeline_id` when you want a published AI Studio pipeline to provide the base agent configuration: + +```python +agent = Agent( + name="support", + pipeline_id="studio-pipeline-id", +) + +session = agent.create_session( + client, + channel="support-room", + agent_uid="1", + remote_uids=["100"], +) +``` + +You can override it per session: + +```python +session = agent.create_session( + client, + channel="support-room", + agent_uid="1", + remote_uids=["100"], + pipeline_id="session-pipeline-id", +) +``` + +AgentKit sends the resolved value as the top-level `/join` field `pipeline_id`, not inside `properties`. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, and `advanced_features` may send `properties` fields that override the saved pipeline settings. + ### BYOK version Use the same `Agent` builder shape, but provide credentials explicitly when you want vendor-managed billing and routing instead of Agora-managed models. 
```python -agent = Agent( - instructions=AGENT_PROMPT, - greeting=GREETING, -).with_stt( +agent = Agent(turn_detection={"language": "en-US"}).with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], model="nova-3", @@ -146,7 +142,10 @@ agent = Agent( ).with_llm( OpenAI( api_key=os.environ["OPENAI_API_KEY"], + base_url="https://api.openai.com/v1/chat/completions", model="gpt-4o-mini", + system_messages=[{"role": "system", "content": AGENT_PROMPT}], + greeting_message=GREETING, max_tokens=1024, temperature=0.7, top_p=0.95, diff --git a/changelog.md b/changelog.md index 1174850..dc8dcc6 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v2.1.0] — 2026-06-02 + +### Added + +- **Turn detection language** — AgentKit now manages Agora interaction language through `turn_detection.language`, validates it against the supported BCP-47 language list, and sends the default `en-US` when no language is provided. +- **Provider parameter parity** — ASR, LLM, MLLM, TTS, and avatar wrappers expose typed provider parameters plus passthrough fields where the generated core supports additional properties. + +### Changed + +- **Generated core refresh** — Regenerated core types from the v2.1 API schema. +- **Deepgram TTS passthrough** — `DeepgramTTS` now uses `additional_params` for passthrough fields and flattens them into `tts.params`; the removed nested `params.params` shape is no longer documented or emitted. +- **OpenAI TTS** — Docs and tests now reflect the generated core shape, including `instructions` and `speed` under `tts.params`. +- **TTS provider docs** — Updated TTS provider reference tables to match implemented wrapper fields and generated core params. 
+ +### Fixed + +- **Managed-provider validation** — AgentKit validation now distinguishes preset-backed providers from BYOK providers so required provider fields are only required when credentials are caller-supplied. +- **Language placement** — Provider-specific STT language values remain under `asr.params`, while Agora interaction language is emitted separately as `turn_detection.language`. + ## [v2.0.0] — 2026-05-21 ### Added @@ -52,7 +71,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **`DeepgramTTS`** — New TTS vendor wrapper for Deepgram (Beta). Accepts `api_key`, `model`, `base_url`, `sample_rate`, `params`, and `skip_patterns`. +- **`DeepgramTTS`** — New TTS vendor wrapper for Deepgram (Beta). Accepts `api_key`, `model`, `base_url`, `sample_rate`, `additional_params`, and `skip_patterns`. - **`Agent.with_tools(enabled=True)`** — Dedicated builder method to enable MCP tool invocation (`advanced_features.enable_tools`). Replaces the raw `with_advanced_features(AdvancedFeatures(enable_tools=True))` call. - **LLM vendors: `headers` field** — All four LLM vendors (`OpenAI`, `AzureOpenAI`, `Anthropic`, `Gemini`) now accept an optional `headers: Dict[str, str]` parameter. Use this to pass custom HTTP headers to the LLM provider (e.g., tenant identifiers, routing headers). - **`AgentSession.think()` / `AsyncAgentSession.think()`** — Send a custom instruction to a running agent through the `agent_management` API. @@ -107,7 +126,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ### Added -- **`OpenAITTS`** — New optional parameters: `response_format` (str, e.g. `"pcm"`) and `speed` (float). +- **`OpenAITTS`** — New optional parameters: `instructions` (str) and `speed` (float). - **`CartesiaTTS`** — `voice_id` user-facing field is preserved; voice is serialized to the required nested object format automatically. 
- **`RimeTTS`** — New optional parameters: `lang` (str), `sampling_rate` (int, serialized as `samplingRate`), `speed_alpha` (float, serialized as `speedAlpha`). - **`OpenAIRealtime`** — New optional parameter: `failure_message` (str). diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index dd9d3ed..8a75762 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -12,24 +12,28 @@ The `Agent` class is a fluent builder for configuring AI agent properties. It co ```python -from agora_agent import Agent - -agent = Agent( - name='support-assistant', - instructions='You are a helpful voice assistant.', - greeting='Hello! How can I help you?', - failure_message='Sorry, something went wrong.', - max_history=20, +from agora_agent import Agent, OpenAI + +agent = Agent(name='support-assistant').with_llm( + OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], + greeting_message='Hello! How can I help you?', + failure_message='Sorry, something went wrong.', + max_history=20, + ) ) ``` | Parameter | Type | Required | Description | |---|---|---|---| | `name` | `str` | No | Agent display name (used as session name if not overridden) | -| `instructions` | `str` | No | System prompt for the LLM | -| `greeting` | `str` | No | Message spoken when the agent joins | -| `failure_message` | `str` | No | Message spoken on error | -| `max_history` | `int` | No | Maximum conversation history length | +| `instructions` | `str` | No | Deprecated. Use LLM vendor `system_messages` instead. | +| `greeting` | `str` | No | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | +| `failure_message` | `str` | No | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | +| `max_history` | `int` | No | Deprecated. Use LLM vendor `max_history` instead. 
| | `turn_detection` | `TurnDetectionConfig` | No | Turn detection settings | | `sal` | `SalConfig` | No | SAL (Speech Activity Level) configuration | | `advanced_features` | `Dict[str, Any]` | No | Advanced features (e.g., `{'enable_rtm': True}`) | @@ -57,15 +61,15 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch | Method | Accepts | Purpose | |---|---|---| -| `with_instructions(text)` | `str` | Override the system prompt | -| `with_greeting(text)` | `str` | Override the greeting message | +| `with_instructions(text)` | `str` | Deprecated. Use LLM vendor `system_messages` instead. | +| `with_greeting(text)` | `str` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | | `with_name(name)` | `str` | Override the agent name | -| `with_turn_detection(config)` | `TurnDetectionConfig` | Override cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | +| `with_turn_detection(config)` | `TurnDetectionConfig` | Configure `turn_detection.language` and cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | | `with_sal(config)` | `SalConfig` | Set SAL configuration | | `with_advanced_features(features)` | `Dict[str, Any]` | Set advanced features | | `with_parameters(parameters)` | `SessionParams` | Set session parameters | -| `with_failure_message(message)` | `str` | Set failure message | -| `with_max_history(max_history)` | `int` | Set max history length | +| `with_failure_message(message)` | `str` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | +| `with_max_history(max_history)` | `int` | Deprecated. Use LLM vendor `max_history` instead. 
| | `with_geofence(geofence)` | `GeofenceConfig` | Set geofence configuration | | `with_labels(labels)` | `Dict[str, str]` | Set custom labels | | `with_rtc(rtc)` | `RtcConfig` | Set RTC configuration | @@ -79,9 +83,14 @@ from agora_agent import Agent from agora_agent import OpenAI, ElevenLabsTTS, DeepgramSTT agent = ( - Agent(name='my-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + Agent(name='my-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) ``` @@ -97,9 +106,14 @@ from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') base = ( - Agent(instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + Agent() + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/concepts/session.md b/docs/concepts/session.md index 
e4883f2..8d70add 100644 --- a/docs/concepts/session.md +++ b/docs/concepts/session.md @@ -40,9 +40,14 @@ from agora_agent import Agent, Agora, Area, OpenAI, ElevenLabsTTS, DeepgramSTT client = Agora(area=Area.US, app_id='your-app-id', app_certificate='your-app-certificate') agent = ( - Agent(name='my-agent', instructions='You are helpful.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) + Agent(name='my-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are helpful.'}], + )) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) ) diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 8d58cd1..c59ae7c 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -21,21 +21,21 @@ Used with `agent.with_llm()` for the cascading flow (ASR → LLM → TTS). 
| Class | Provider | Required Parameters | |---|---|---| -| `OpenAI` | OpenAI | `api_key` | -| `AzureOpenAI` | Azure OpenAI | `api_key`, `endpoint`, `deployment_name` | -| `Anthropic` | Anthropic | `api_key` | -| `Gemini` | Google Gemini | `api_key` | -| `Groq` | Groq | `api_key` | -| `VertexAILLM` | Google Vertex AI | `api_key`, `project_id`, `location` | -| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | -| `Dify` | Dify | `api_key`, `url` | +| `OpenAI` | OpenAI | `model` for Agora-managed models; `api_key`, `base_url`, `model` for BYOK | +| `AzureOpenAI` | Azure OpenAI | `api_key`, `model`, `endpoint`, `deployment_name` | +| `Anthropic` | Anthropic | `api_key`, `model`, `url`, `headers`, `max_tokens` | +| `Gemini` | Google Gemini | `api_key`, `model` | +| `Groq` | Groq | `api_key`, `model`, `base_url` | +| `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location` | +| `AmazonBedrock` | Amazon Bedrock | `access_key`, `secret_key`, `region`, `model` | +| `Dify` | Dify | `api_key`, `url`, `model` | | `CustomLLM` | OpenAI-compatible LLM | `api_key`, `base_url`, `model` | ```python from agora_agent import OpenAI -llm = OpenAI(api_key='your-openai-key', model='gpt-4o-mini') +llm = OpenAI(api_key='your-openai-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini') ``` ## TTS Vendors @@ -44,17 +44,17 @@ Used with `agent.with_tts()`. 
Each TTS vendor produces audio at a specific sampl | Class | Provider | Required Parameters | Sample Rate | |---|---|---|---| -| `ElevenLabsTTS` | ElevenLabs | `key`, `model_id`, `voice_id` | 16000, 22050, 24000, or 44100 Hz | +| `ElevenLabsTTS` | ElevenLabs | `key`, `model_id`, `voice_id`, `base_url` | 16000, 22050, 24000, or 44100 Hz | | `MicrosoftTTS` | Microsoft Azure | `key`, `region`, `voice_name` | 8000, 16000, 24000, or 48000 Hz | -| `OpenAITTS` | OpenAI | `key`, `voice` | 24000 Hz (fixed) | -| `CartesiaTTS` | Cartesia | `key`, `voice_id` | 8000–48000 Hz | +| `OpenAITTS` | OpenAI | `voice` for Agora-managed `tts-1`; `api_key`, `model`, `base_url`, `voice` for BYOK | 24000 Hz (fixed) | +| `CartesiaTTS` | Cartesia | `api_key`, `voice_id`, `model_id` | 8000–48000 Hz | | `GoogleTTS` | Google Cloud | `key`, `voice_name` | — | -| `AmazonTTS` | Amazon Polly | `access_key`, `secret_key`, `region`, `voice_id` | — | -| `HumeAITTS` | Hume AI | `key` | — | -| `RimeTTS` | Rime | `key`, `speaker` | — | -| `FishAudioTTS` | Fish Audio | `key`, `reference_id` | — | +| `AmazonTTS` | Amazon Polly | `access_key`, `secret_key`, `region`, `voice_id`, `engine` | — | +| `HumeAITTS` | Hume AI | `key`, `voice_id`, `provider` | — | +| `RimeTTS` | Rime | `key`, `speaker`, `model_id` | — | +| `FishAudioTTS` | Fish Audio | `key`, `reference_id`, `backend` | — | | `GroqTTS` | Groq | `key` | — | -| `MiniMaxTTS` | MiniMax | `key` | — | +| `MiniMaxTTS` | MiniMax | `model` for supported Agora-managed models; `key`, `group_id`, `model`, `voice_id`, `url` for BYOK | — | | `DeepgramTTS` | Deepgram | `api_key`, `model` | Configurable | | `SarvamTTS` | Sarvam | `api_key` | — | @@ -66,6 +66,7 @@ tts = ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=24000, ) ``` @@ -74,15 +75,17 @@ tts = ElevenLabsTTS( Used with `agent.with_stt()`. 
+Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. STT vendor `language` options are serialized under `asr.params` using each provider's own format. + | Class | Provider | Required Parameters | |---|---|---| | `SpeechmaticsSTT` | Speechmatics | `api_key`, `language` | -| `DeepgramSTT` | Deepgram | — (all optional) | -| `MicrosoftSTT` | Microsoft Azure | `key`, `region` | +| `DeepgramSTT` | Deepgram | `model` for Agora-managed `nova-2`/`nova-3`; `api_key` for BYOK | +| `MicrosoftSTT` | Microsoft Azure | `key`, `region`, `language` | | `OpenAISTT` | OpenAI | `api_key` | -| `GoogleSTT` | Google Cloud | `api_key` | -| `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region` | -| `AssemblyAISTT` | AssemblyAI | `api_key` | +| `GoogleSTT` | Google Cloud | `project_id`, `location`, `adc_credentials_string`, `language` | +| `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region`, `language` | +| `AssemblyAISTT` | AssemblyAI | `api_key`, `language` | | `AresSTT` | Ares | — (all optional) | | `SarvamSTT` | Sarvam | `api_key`, `language` | diff --git a/docs/getting-started/authentication.md b/docs/getting-started/authentication.md index 74c62cd..a1a87e3 100644 --- a/docs/getting-started/authentication.md +++ b/docs/getting-started/authentication.md @@ -20,9 +20,12 @@ client = Agora( ) agent = ( - Agent(instructions="Be concise.") + Agent() .with_stt(DeepgramSTT(model="nova-3")) - .with_llm(OpenAI(model="gpt-4o-mini")) + .with_llm(OpenAI( + model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "Be concise."}], + )) .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index 472ac57..e477920 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -27,14 +27,14 @@ def main() -> None: ) agent = ( - Agent( - name="support-assistant", - 
instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, - ) + Agent(name="support-assistant") .with_stt(DeepgramSTT(model="nova-3", language="en")) - .with_llm(OpenAI(model="gpt-4o-mini")) + .with_llm(OpenAI( + model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "You are a concise support voice assistant."}], + greeting_message="Hello! How can I help you today?", + max_history=10, + )) .with_tts(MiniMaxTTS(model="speech_2_6_turbo", voice_id="English_captivating_female1")) ) diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index 3b55b49..731cb6e 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -18,8 +18,8 @@ For string values with a finite set of options (e.g. `data_channel`, `sal_mode`, | `advanced_features` | `with_advanced_features(features)` | Enable MLLM, RTM, SAL, tools | | `tools` | `with_tools(enabled=True)` | Enable MCP tool invocation | | `parameters` | `with_parameters(params)` | Silence config, farewell config, data channel | -| `failure_message` | `with_failure_message(msg)` | Message spoken when LLM fails | -| `max_history` | `with_max_history(n)` | Max conversation turns in LLM context | +| `failure_message` | LLM/MLLM vendor option | Message spoken when LLM fails | +| `max_history` | LLM vendor option | Max conversation turns in LLM context | | `geofence` | `with_geofence(config)` | Restrict backend server regions | | `labels` | `with_labels(labels)` | Custom key-value labels (returned in callbacks) | | `rtc` | `with_rtc(config)` | RTC media encryption | @@ -45,15 +45,19 @@ from agora_agent import ( agent = ( Agent( name='sal-assistant', - instructions='You are a helpful assistant.', advanced_features=AdvancedFeatures(enable_sal=True), ) .with_sal(SalConfig( sal_mode=SalModeValues.LOCKING, sample_urls={'primary-speaker': 'https://example.com/voiceprint.pcm'}, )) - 
.with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + .with_llm(OpenAI( + api_key='your-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) + .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) ) ``` @@ -105,8 +109,8 @@ agent = ( ), data_channel=DataChannel.RTM, # or DataChannel.DATASTREAM )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -114,24 +118,16 @@ agent = ( ## Failure Message and Max History ```python -agent = ( - Agent( - name='assistant', - failure_message='Sorry, I encountered an error. 
Please try again.', - max_history=20, - ) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) - .with_stt(DeepgramSTT(api_key='...', model='nova-2')) -) - -# Or via builder methods agent = ( Agent() - .with_failure_message('Something went wrong.') - .with_max_history(15) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI( + api_key='...', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + failure_message='Something went wrong.', + max_history=15, + )) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -146,8 +142,8 @@ from agora_agent import Agent, GeofenceConfig, GeofenceArea, GeofenceExcludeArea agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.NORTH_AMERICA)) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) @@ -155,8 +151,8 @@ agent = ( agent = ( Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.GLOBAL, exclude_area=GeofenceExcludeArea.EUROPE)) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', 
base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -175,8 +171,8 @@ agent = ( 'team': 'support', 'version': '1.2.0', }) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -194,8 +190,8 @@ agent = ( encryption_key='your-32-byte-key', encryption_mode=5, # AES_128_GCM )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -231,8 +227,8 @@ agent = ( ), ), )) - .with_llm(OpenAI(api_key='...', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', sample_rate=24000)) + .with_llm(OpenAI(api_key='...', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) + .with_tts(ElevenLabsTTS(key='...', model_id='...', voice_id='...', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='...', model='nova-2')) ) ``` @@ -245,13 +241,12 @@ Read back configuration via properties: from agora_agent import Agent, GeofenceConfig, GeofenceArea agent = ( - Agent(max_history=20) + Agent() .with_geofence(GeofenceConfig(area=GeofenceArea.EUROPE)) .with_labels({'env': 'staging'}) ) agent.name # str | None -agent.max_history # 20 agent.geofence # 
GeofenceConfig(area='EUROPE') agent.labels # {'env': 'staging'} agent.sal # SalConfig | None @@ -293,15 +288,17 @@ client = Agora( ) agent = ( - Agent( - name='full-featured-assistant', - instructions='You are a helpful voice assistant.', - greeting='Hello! How can I help?', + Agent(name='full-featured-assistant') + .with_llm(OpenAI( + api_key='your-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful voice assistant.'}], + greeting_message='Hello! How can I help?', failure_message='Sorry, I had trouble processing that.', max_history=20, - ) - .with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + )) + .with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-key', model='nova-2', language='en-US')) .with_advanced_features(AdvancedFeatures(enable_rtm=True)) .with_parameters(SessionParams( diff --git a/docs/guides/avatars.md b/docs/guides/avatars.md index ca50966..c370b80 100644 --- a/docs/guides/avatars.md +++ b/docs/guides/avatars.md @@ -54,12 +54,18 @@ client = Agora( ) agent = ( - Agent(name='avatar-agent', instructions='You are a helpful assistant with a visual avatar.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='avatar-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant with a visual avatar.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=24000, # Must be 24000 for HeyGen )) 
.with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -100,12 +106,18 @@ Akool requires a TTS vendor configured at 16000 Hz: from agora_agent import ElevenLabsTTS, AkoolAvatar agent = ( - Agent(name='akool-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='akool-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=16000, # Must be 16000 for Akool )) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -124,12 +136,18 @@ This example shows what happens when the TTS sample rate does not match the avat ```python # This raises ValueError at build time agent = ( - Agent(name='broken-agent', instructions='You are a helpful assistant.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) + Agent(name='broken-agent') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + )) .with_tts(ElevenLabsTTS( key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', + base_url='wss://api.elevenlabs.io/v1', sample_rate=16000, # 16 kHz )) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US')) @@ -152,8 +170,8 @@ The `with_avatar()` call validates against the currently configured TTS. 
Always ```python # Correct order: TTS first, then avatar agent = ( - Agent(name='my-agent', instructions='You are helpful.') - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + Agent(name='my-agent') + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_avatar(HeyGenAvatar(api_key='your-heygen-key', quality='medium', agora_uid='2')) ) ``` diff --git a/docs/guides/byok.md b/docs/guides/byok.md index 3b03ebe..9a66414 100644 --- a/docs/guides/byok.md +++ b/docs/guides/byok.md @@ -32,12 +32,7 @@ def main() -> None: # In BYOK mode, each vendor carries its own credentials. agent = ( - Agent( - name="support-assistant", - instructions="You are a concise support voice assistant.", - greeting="Hello! How can I help you today?", - max_history=10, - ) + Agent(name="support-assistant") .with_stt( DeepgramSTT( api_key=os.environ["DEEPGRAM_API_KEY"], @@ -48,7 +43,11 @@ def main() -> None: .with_llm( OpenAI( api_key=os.environ["OPENAI_API_KEY"], + base_url="https://api.openai.com/v1/chat/completions", model="gpt-4o-mini", + system_messages=[{"role": "system", "content": "You are a concise support voice assistant."}], + greeting_message="Hello! 
How can I help you today?", + max_history=10, ) ) .with_tts( @@ -56,6 +55,7 @@ def main() -> None: key=os.environ["ELEVENLABS_API_KEY"], model_id="eleven_flash_v2_5", voice_id=os.environ["ELEVENLABS_VOICE_ID"], + base_url="wss://api.elevenlabs.io/v1", sample_rate=24000, ) ) diff --git a/docs/guides/cascading-flow.md b/docs/guides/cascading-flow.md index 43ff2af..45d44ce 100644 --- a/docs/guides/cascading-flow.md +++ b/docs/guides/cascading-flow.md @@ -26,9 +26,14 @@ client = Agora( ) agent = ( - Agent(name='assistant', instructions='You are a friendly customer support agent.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + Agent(name='assistant') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], + )) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) @@ -53,9 +58,14 @@ async def main(): ) agent = ( - Agent(name='assistant', instructions='You are a friendly customer support agent.') - .with_llm(OpenAI(api_key='your-openai-key', model='gpt-4o-mini')) - .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', sample_rate=24000)) + Agent(name='assistant') + .with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a friendly customer support agent.'}], + )) + .with_tts(ElevenLabsTTS(key='your-elevenlabs-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', 
base_url='wss://api.elevenlabs.io/v1', sample_rate=24000)) .with_stt(DeepgramSTT(api_key='your-deepgram-key', language='en-US', model='nova-2')) ) @@ -82,11 +92,12 @@ client = Agora( ) agent = ( - Agent(name='azure-agent', instructions='You are a helpful assistant for enterprise customers.') + Agent(name='azure-agent') .with_llm(AzureOpenAI( api_key='your-azure-key', endpoint='https://your-resource.openai.azure.com', deployment_name='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant for enterprise customers.'}], )) .with_tts(MicrosoftTTS( key='your-azure-speech-key', @@ -116,6 +127,7 @@ from agora_agent import OpenAI llm = OpenAI( api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', temperature=0.7, top_p=0.9, @@ -125,14 +137,16 @@ llm = OpenAI( ## Adding a Greeting -The `greeting` parameter on `Agent` makes the agent speak automatically when the session starts: +Configure greetings on the LLM vendor so message ownership stays with the LLM configuration: ```python -agent = Agent( - name='greeter', - instructions='You are a helpful assistant.', - greeting='Hi there! What can I do for you?', -) +agent = Agent(name='greeter').with_llm(OpenAI( + api_key='your-openai-key', + base_url='https://api.openai.com/v1/chat/completions', + model='gpt-4o-mini', + system_messages=[{'role': 'system', 'content': 'You are a helpful assistant.'}], + greeting_message='Hi there! 
What can I do for you?', +)) ``` ## Next Steps diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 93770e9..5693e0b 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -27,25 +27,31 @@ Agent( labels: Optional[Dict[str, str]] = None, rtc: Optional[RtcConfig] = None, filler_words: Optional[FillerWordsConfig] = None, + pipeline_id: Optional[str] = None, ) ``` | Parameter | Type | Default | Description | |---|---|---|---| | `name` | `Optional[str]` | `None` | Agent name, used as default session name | -| `instructions` | `Optional[str]` | `None` | System prompt for the LLM | -| `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | +| `instructions` | `Optional[str]` | `None` | Deprecated. Use LLM vendor `system_messages` instead. | +| `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Interaction language and turn detection configuration | | `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | | `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | -| `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | -| `failure_message` | `Optional[str]` | `None` | Spoken on error | -| `max_history` | `Optional[int]` | `None` | Max conversation history length | +| `greeting` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `greeting_message` instead. | +| `failure_message` | `Optional[str]` | `None` | Deprecated. Use LLM/MLLM vendor `failure_message` instead. | +| `max_history` | `Optional[int]` | `None` | Deprecated. Use LLM vendor `max_history` instead. 
| | `geofence` | `Optional[GeofenceConfig]` | `None` | Regional access restriction | | `labels` | `Optional[Dict[str, str]]` | `None` | Custom key-value labels (returned in callbacks) | | `rtc` | `Optional[RtcConfig]` | `None` | RTC media encryption | | `filler_words` | `Optional[FillerWordsConfig]` | `None` | Filler words while waiting for LLM | +| `pipeline_id` | `Optional[str]` | `None` | Published AI Studio pipeline ID used as this agent's base configuration | + +`pipeline_id` identifies a published AI Studio pipeline that supplies this agent's base configuration. Explicit Agent config such as `with_llm()`, `with_tts()`, `with_stt()`, `with_mllm()`, `advanced_features`, and other builder options may send fields in `properties` that override the saved pipeline settings. Session-level `pipeline_id` overrides the agent-level value. + +The Agent-level `instructions`, `greeting`, `failure_message`, `max_history`, and `greeting_configs` fields are compatibility shims. New code should configure those values on the LLM or MLLM vendor because that matches the core request schema. ## Builder Methods @@ -58,7 +64,7 @@ Set the LLM vendor for cascading flow. ```python from agora_agent import OpenAI -agent = Agent().with_llm(OpenAI(api_key='your-key', model='gpt-4o-mini')) +agent = Agent().with_llm(OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini')) ``` ### `with_tts(vendor: BaseTTS) -> Agent` @@ -68,7 +74,7 @@ Set the TTS vendor. Records the vendor's `sample_rate` for avatar validation. 
```python from agora_agent import ElevenLabsTTS -agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id')) +agent = Agent().with_tts(ElevenLabsTTS(key='your-key', model_id='eleven_flash_v2_5', voice_id='your-voice-id', base_url='wss://api.elevenlabs.io/v1')) ``` ### `with_stt(vendor: BaseSTT) -> Agent` @@ -107,7 +113,7 @@ agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', ago ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` -Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. +Override cascading-flow turn detection settings. Use `language` for the Agora interaction language, `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection, `with_interruption()` for interruption behavior, and MLLM vendor `turn_detection` for MLLM turn detection. Pause-state detection is configured under semantic end-of-speech: @@ -131,11 +137,11 @@ Configure unified interruption behavior using the top-level `interruption` objec ### `with_instructions(instructions: str) -> Agent` -Override the system prompt. +Deprecated. Configure `system_messages` on the LLM vendor instead. ### `with_greeting(greeting: str) -> Agent` -Override the greeting message. +Deprecated. Configure `greeting_message` on the LLM or MLLM vendor instead. ### `with_name(name: str) -> Agent` @@ -165,11 +171,11 @@ Set `parameters.audio_scenario` without replacing existing session parameters. ### `with_failure_message(message: str) -> Agent` -Set the message spoken via TTS when the LLM call fails. +Deprecated. Configure `failure_message` on the LLM or MLLM vendor instead. ### `with_max_history(max_history: int) -> Agent` -Set the maximum conversation history length for the standard ASR + LLM + TTS pipeline. 
The v2.7 MLLM core type does not expose `max_history`. +Deprecated. Configure `max_history` on the LLM vendor instead. ### `with_geofence(geofence: GeofenceConfig) -> Agent` @@ -200,6 +206,8 @@ create_session( token: Optional[str] = None, idle_timeout: Optional[int] = None, enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, expires_in: Optional[int] = None, ) -> AgentSession ``` @@ -217,6 +225,10 @@ Creates an `AgentSession` bound to the given client and channel. | `expires_in` | `Optional[int]` | No | Token lifetime in seconds (default: `86400` = 24 h, Agora max). Only applies when the token is auto-generated. Use `expires_in_hours()` or `expires_in_minutes()` for clarity. Valid range: 1–86400. | | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. 
**Returns:** `AgentSession` @@ -246,16 +258,16 @@ to_properties( | Property | Type | Description | |---|---|---| | `name` | `Optional[str]` | Agent name | -| `instructions` | `Optional[str]` | System prompt | -| `greeting` | `Optional[str]` | Greeting message | -| `failure_message` | `Optional[str]` | Message spoken when LLM fails | -| `max_history` | `Optional[int]` | Max conversation history length | +| `instructions` | `Optional[str]` | Deprecated Agent-level system prompt | +| `greeting` | `Optional[str]` | Deprecated Agent-level greeting message | +| `failure_message` | `Optional[str]` | Deprecated Agent-level failure message | +| `max_history` | `Optional[int]` | Deprecated Agent-level max history | | `llm` | `Optional[Dict[str, Any]]` | LLM config dict (from `to_config()`) | | `tts` | `Optional[Dict[str, Any]]` | TTS config dict | | `stt` | `Optional[Dict[str, Any]]` | STT config dict | | `mllm` | `Optional[Dict[str, Any]]` | MLLM config dict | | `avatar` | `Optional[Dict[str, Any]]` | Avatar config dict | -| `turn_detection` | `Optional[TurnDetectionConfig]` | Turn detection settings | +| `turn_detection` | `Optional[TurnDetectionConfig]` | Interaction language and turn detection settings | | `sal` | `Optional[SalConfig]` | SAL configuration | | `advanced_features` | `Optional[Dict[str, Any]]` | Advanced features | | `parameters` | `Optional[SessionParams]` | Session parameters | diff --git a/docs/reference/session.md b/docs/reference/session.md index 63402f6..76e1367 100644 --- a/docs/reference/session.md +++ b/docs/reference/session.md @@ -33,6 +33,11 @@ AgentSession( token: Optional[str] = None, idle_timeout: Optional[int] = None, enable_string_uid: Optional[bool] = None, + preset: Optional[Union[str, Sequence[str]]] = None, + pipeline_id: Optional[str] = None, + expires_in: Optional[int] = None, + debug: Optional[bool] = None, + warn: Optional[Callable[[str], None]] = None, ) ``` @@ -51,6 +56,13 @@ AgentSession( | `token` | `Optional[str]` | No | 
Pre-built RTC token | | `idle_timeout` | `Optional[int]` | No | Idle timeout in seconds | | `enable_string_uid` | `Optional[bool]` | No | Enable string UIDs | +| `preset` | `Optional[Union[str, Sequence[str]]]` | No | Advanced preset value for project-specific routing | +| `pipeline_id` | `Optional[str]` | No | Published AI Studio pipeline ID for this session. Overrides `agent.pipeline_id`. | +| `expires_in` | `Optional[int]` | No | Auto-generated token lifetime in seconds | +| `debug` | `Optional[bool]` | No | Enable debug logging of the start request | +| `warn` | `Optional[Callable[[str], None]]` | No | Custom warning sink | + +`pipeline_id` is sent as the top-level `/join` field `pipeline_id`, not inside `properties`. If unset, `AgentSession.start()` uses the agent-level value from `Agent(..., pipeline_id=...)`. ## Methods diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index 7395eea..cfa8580 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -23,9 +23,9 @@ from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIR | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | OpenAI API key | -| `model` | `str` | No | `gpt-4o-mini` | Model name | -| `base_url` | `str` | No | `None` | Custom base URL (overrides default OpenAI endpoint) | +| `api_key` | `str` | BYOK only | `None` | OpenAI API key. Optional for supported Agora-managed OpenAI models. | +| `model` | `str` | Yes | — | Model name | +| `base_url` | `str` | BYOK only | `None` | OpenAI Chat Completions endpoint URL. Required when `api_key` is set. 
| | `temperature` | `float` | No | `None` | Sampling temperature (0.0–2.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `max_tokens` | `int` | No | `None` | Maximum tokens to generate | @@ -43,7 +43,7 @@ from agora_agent import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIR ```python from agora_agent import OpenAI -llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) +llm = OpenAI(api_key='your-key', base_url='https://api.openai.com/v1/chat/completions', model='gpt-4o-mini', temperature=0.7) ``` ### `AzureOpenAI` @@ -51,6 +51,7 @@ llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Azure OpenAI API key | +| `model` | `str` | Yes | — | Deployment's base model name. Emitted as `params.model`. | | `endpoint` | `str` | Yes | — | Azure endpoint URL | | `deployment_name` | `str` | Yes | — | Azure deployment name | | `api_version` | `str` | No | `2024-08-01-preview` | Azure API version | @@ -73,6 +74,7 @@ from agora_agent import AzureOpenAI llm = AzureOpenAI( api_key='your-azure-key', + model='gpt-4o-mini', endpoint='https://your-resource.openai.azure.com', deployment_name='gpt-4o-mini', ) @@ -83,8 +85,10 @@ llm = AzureOpenAI( | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Anthropic API key | -| `model` | `str` | No | `claude-3-5-sonnet-20241022` | Model name | -| `max_tokens` | `int` | No | `None` | Maximum tokens | +| `model` | `str` | Yes | — | Model name | +| `url` | `str` | Yes | — | Anthropic messages endpoint URL | +| `headers` | `Dict[str, str]` | Yes | — | Request headers, including Anthropic API version | +| `max_tokens` | `int` | Yes | — | Maximum tokens | | `temperature` | `float` | No | `None` | Sampling temperature (0.0–1.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `system_messages` | 
`List[Dict]` | No | `None` | System messages | @@ -93,7 +97,6 @@ llm = AzureOpenAI( | `input_modalities` | `List[str]` | No | `None` | Input modalities | | `output_modalities` | `List[str]` | No | `None` | Output modalities | | `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | -| `headers` | `Dict[str, str]` | No | `None` | Custom HTTP headers forwarded to the LLM provider | | `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | | `template_variables` | `Dict[str, str]` | No | `None` | Template variables for messages | @@ -101,7 +104,13 @@ llm = AzureOpenAI( ```python from agora_agent import Anthropic -llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022') +llm = Anthropic( + api_key='your-anthropic-key', + url='https://api.anthropic.com/v1/messages', + headers={'anthropic-version': '2023-06-01'}, + model='claude-3-5-sonnet-20241022', + max_tokens=1024, +) ``` ### `Gemini` @@ -109,7 +118,7 @@ llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022' | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Google AI API key | -| `model` | `str` | No | `gemini-2.0-flash-exp` | Model name | +| `model` | `str` | Yes | — | Model name | | `temperature` | `float` | No | `None` | Sampling temperature (0.0–2.0) | | `top_p` | `float` | No | `None` | Nucleus sampling (0.0–1.0) | | `top_k` | `int` | No | `None` | Top-k sampling | @@ -137,10 +146,10 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | Class | Provider | Key parameters | |---|---|---| -| `Groq` | Groq | `api_key`, `model`, `base_url?` | +| `Groq` | Groq | `api_key`, `model`, `base_url` | | `VertexAILLM` | Google Vertex AI | `api_key`, `model`, `project_id`, `location`, `url?` | -| `AmazonBedrock` | Amazon Bedrock | `api_key`, `url`, `model` | -| `Dify` | Dify | `api_key`, `url`, `user?`, `conversation_id?` | +| 
`AmazonBedrock` | Amazon Bedrock | `access_key`, `secret_key`, `region`, `model` | +| `Dify` | Dify | `api_key`, `url`, `model`, `user?`, `conversation_id?` | | `CustomLLM` | OpenAI-compatible LLM | `api_key`, `model`, `base_url` | --- @@ -154,7 +163,7 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `key` | `str` | Yes | — | ElevenLabs API key | | `model_id` | `str` | Yes | — | Model ID (e.g., `eleven_flash_v2_5`) | | `voice_id` | `str` | Yes | — | Voice ID | -| `base_url` | `str` | No | `None` | Custom WebSocket base URL | +| `base_url` | `str` | Yes | — | WebSocket base URL | | `sample_rate` | `int` | No | `None` | Sample rate: 16000, 22050, 24000, or 44100 Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | | `optimize_streaming_latency` | `int` | No | `None` | Latency optimization level (0–4) | @@ -171,20 +180,23 @@ The SDK also includes named helpers for the remaining Agora-supported LLM provid | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | | `voice_name` | `str` | Yes | — | Voice name (e.g., `en-US-JennyNeural`) | | `sample_rate` | `int` | No | `None` | Sample rate: 8000, 16000, 24000, or 48000 Hz | +| `speed` | `float` | No | `None` | Speaking rate multiplier | +| `volume` | `float` | No | `None` | Audio volume | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `OpenAITTS` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | OpenAI API key | +| `api_key` | `str` | BYOK only | `None` | OpenAI API key | | `voice` | `str` | Yes | — | Voice: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` | -| `model` | `str` | No | `None` | Model: `tts-1` or `tts-1-hd` | -| `response_format` | `str` | No | `None` | Audio format (e.g., `pcm`) | +| `model` | `str` | BYOK only | `None` | Model: `tts-1` or `tts-1-hd` | +| `base_url` | `str` | BYOK only | `None` | OpenAI TTS endpoint URL | +| `instructions` | `str` | No | `None` | 
Custom instructions for voice style, accent, pace, and tone | | `speed` | `float` | No | `None` | Speech speed multiplier | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | -Fixed sample rate: 24000 Hz. +`api_key`, `model`, and `base_url` are required together for BYOK. Without `api_key`, `model` must be omitted or set to the Agora-managed `tts-1` path. Fixed sample rate: 24000 Hz. ### `CartesiaTTS` @@ -192,7 +204,9 @@ Fixed sample rate: 24000 Hz. |---|---|---|---|---| | `api_key` | `str` | Yes | — | Cartesia API key | | `voice_id` | `str` | Yes | — | Voice ID (serialized as `{"mode": "id", "id": "..."}`) | -| `model_id` | `str` | No | `None` | Model ID | +| `model_id` | `str` | Yes | — | Model ID | +| `base_url` | `str` | No | `None` | WebSocket URL | +| `language` | `str` | No | `None` | Target language | | `sample_rate` | `int` | No | `None` | Sample rate: 8000–48000 Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | @@ -203,6 +217,7 @@ Fixed sample rate: 24000 Hz. | `key` | `str` | Yes | — | Google Cloud API key | | `voice_name` | `str` | Yes | — | Voice name | | `language_code` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `sample_rate_hertz` | `int` | No | `None` | Sample rate in Hz | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `AmazonTTS` @@ -213,6 +228,7 @@ Fixed sample rate: 24000 Hz. | `secret_key` | `str` | Yes | — | AWS secret key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | | `voice_id` | `str` | Yes | — | Amazon Polly voice ID | +| `engine` | `str` | Yes | — | Amazon Polly engine type | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `DeepgramTTS` @@ -223,7 +239,7 @@ Fixed sample rate: 24000 Hz. 
| `model` | `str` | Yes | — | Deepgram TTS model (e.g., `aura-2-thalia-en`) | | `base_url` | `str` | No | `None` | WebSocket endpoint; defaults server-side to `wss://api.deepgram.com/v1/speak` | | `sample_rate` | `int` | No | `None` | Sample rate in Hz (for example, `24000`) | -| `params` | `Dict[str, Any]` | No | `None` | Additional Deepgram TTS parameters | +| `additional_params` | `Dict[str, Any]` | No | `None` | Additional Deepgram TTS parameters, flattened into `params` | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `HumeAITTS` @@ -231,7 +247,12 @@ Fixed sample rate: 24000 Hz. | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `key` | `str` | Yes | — | Hume AI API key | +| `voice_id` | `str` | Yes | — | Hume AI voice ID | +| `provider` | `str` | Yes | — | Voice provider type, such as `CUSTOM_VOICE` or `HUME_AI` | | `config_id` | `str` | No | `None` | Configuration ID | +| `base_url` | `str` | No | `None` | Base URL | +| `speed` | `float` | No | `None` | Playback speed | +| `trailing_silence` | `float` | No | `None` | Trailing silence in seconds | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `RimeTTS` @@ -240,10 +261,8 @@ Fixed sample rate: 24000 Hz. |---|---|---|---|---| | `key` | `str` | Yes | — | Rime API key | | `speaker` | `str` | Yes | — | Speaker ID | -| `model_id` | `str` | No | `None` | Model ID | -| `lang` | `str` | No | `None` | Language code | -| `sampling_rate` | `int` | No | `None` | Sampling rate in Hz (serialized as `samplingRate`) | -| `speed_alpha` | `float` | No | `None` | Speed multiplier (serialized as `speedAlpha`) | +| `model_id` | `str` | Yes | — | Model ID | +| `base_url` | `str` | No | `None` | WebSocket URL | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `FishAudioTTS` @@ -252,26 +271,34 @@ Fixed sample rate: 24000 Hz. 
|---|---|---|---|---| | `key` | `str` | Yes | — | Fish Audio API key | | `reference_id` | `str` | Yes | — | Reference ID | +| `backend` | `str` | Yes | — | Backend model version | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `MiniMaxTTS` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `key` | `str` | Yes | — | MiniMax API key | -| `group_id` | `str` | Yes | — | MiniMax group ID | +| `key` | `str` | BYOK only | `None` | MiniMax API key. Optional for supported Agora-managed MiniMax models | +| `group_id` | `str` | BYOK only | `None` | MiniMax group ID | | `model` | `str` | Yes | — | Model name (e.g., `speech-02-turbo`) | -| `voice_id` | `str` | Yes | — | Voice style identifier | -| `url` | `str` | Yes | — | WebSocket endpoint | +| `voice_id` | `str` | BYOK only | `None` | Voice style identifier | +| `url` | `str` | BYOK only | `None` | WebSocket endpoint | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | +`key`, `group_id`, `voice_id`, and `url` are required together for BYOK. Without `key`, `model` must be one of the supported Agora-managed MiniMax models. + ### `MurfTTS` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `key` | `str` | Yes | — | Murf API key | -| `voice_id` | `str` | Yes | — | Voice ID (e.g., `Ariana`, `Natalie`) | -| `style` | `str` | No | `None` | Voice style (e.g., `Conversational`) | +| `voice_id` | `str` | No | `None` | Voice ID (e.g., `Ariana`, `Natalie`) | +| `base_url` | `str` | No | `None` | WebSocket endpoint | +| `locale` | `str` | No | `None` | Voice locale | +| `rate` | `float` | No | `None` | Speech rate | +| `pitch` | `float` | No | `None` | Pitch adjustment | +| `model` | `str` | No | `None` | TTS model | +| `sample_rate` | `int` | No | `None` | Audio sample rate | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | ### `SarvamTTS` @@ -281,38 +308,47 @@ Fixed sample rate: 24000 Hz. 
| `key` | `str` | Yes | — | Sarvam API key | | `speaker` | `str` | Yes | — | Speaker name | | `target_language_code` | `str` | Yes | — | Target language code | +| `pitch` | `float` | No | `None` | Pitch adjustment | +| `pace` | `float` | No | `None` | Speed of speech | +| `loudness` | `float` | No | `None` | Volume level | +| `sample_rate` | `int` | No | `None` | Audio sample rate | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | --- ## STT Vendors +Use `turn_detection.language` for Agora interaction language; it defaults to `en-US`. Provider-specific language values remain under `asr.params` and may use a different format. + ### `SpeechmaticsSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | Speechmatics API key | | `language` | `str` | Yes | — | Language code (e.g., `en`) | +| `uri` | `str` | No | `None` | Speechmatics streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `DeepgramSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | No | `None` | Deepgram API key | +| `api_key` | `str` | BYOK only | `None` | Deepgram API key. Optional only for Agora-managed `nova-2` and `nova-3`. | | `model` | `str` | No | `None` | Model (e.g., `nova-2`) | | `language` | `str` | No | `None` | Language code (e.g., `en-US`) | | `smart_format` | `bool` | No | `None` | Enable smart formatting | | `punctuation` | `bool` | No | `None` | Enable punctuation | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | +For `nova-2` and `nova-3`, omit `api_key` to use Agora-managed credentials. For all other Deepgram models, AgentKit requires `api_key`. 
+ ### `MicrosoftSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `key` | `str` | Yes | — | Azure subscription key | | `region` | `str` | Yes | — | Azure region (e.g., `eastus`) | -| `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `language` | `str` | Yes | — | Language code (e.g., `en-US`) | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `OpenAISTT` @@ -322,14 +358,19 @@ Fixed sample rate: 24000 Hz. | `api_key` | `str` | Yes | — | OpenAI API key | | `model` | `str` | No | `None` | Model (default: `whisper-1`) | | `language` | `str` | No | `None` | Language code | +| `prompt` | `str` | No | `None` | Prompt for OpenAI transcription | +| `input_audio_transcription` | `Dict[str, Any]` | No | `None` | OpenAI transcription settings | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `GoogleSTT` | Parameter | Type | Required | Default | Description | |---|---|---|---|---| -| `api_key` | `str` | Yes | — | Google Cloud API key | -| `language` | `str` | No | `None` | Language code (e.g., `en-US`) | +| `project_id` | `str` | Yes | — | Google Cloud project ID | +| `location` | `str` | Yes | — | Google Cloud region | +| `adc_credentials_string` | `str` | Yes | — | Google service account credentials JSON string | +| `language` | `str` | Yes | — | Language code (e.g., `en-US`) | +| `model` | `str` | No | `None` | Recognition model | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AmazonSTT` @@ -339,7 +380,7 @@ Fixed sample rate: 24000 Hz. 
| `access_key` | `str` | Yes | — | AWS Access Key ID | | `secret_key` | `str` | Yes | — | AWS Secret Access Key | | `region` | `str` | Yes | — | AWS region (e.g., `us-east-1`) | -| `language` | `str` | No | `None` | Language code | +| `language` | `str` | Yes | — | Amazon `language_code` | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AssemblyAISTT` @@ -347,7 +388,8 @@ Fixed sample rate: 24000 Hz. | Parameter | Type | Required | Default | Description | |---|---|---|---|---| | `api_key` | `str` | Yes | — | AssemblyAI API key | -| `language` | `str` | No | `None` | Language code | +| `language` | `str` | Yes | — | Language code | +| `uri` | `str` | No | `None` | AssemblyAI streaming WebSocket URL | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | ### `AresSTT` diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index 712d0dd..8a8fdf2 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -3,6 +3,7 @@ AgentConfig, AgentConfigUpdate, AsrConfig, + TurnDetectionLanguage, ConversationHistory, ConversationRole, ConversationSessionTurn, @@ -204,6 +205,7 @@ "LlmStyle", "SttConfig", "AsrConfig", + "TurnDetectionLanguage", "SttVendor", "TtsConfig", "MllmConfig", diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index f84862c..0a652db 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -8,14 +8,8 @@ from .agent_session import AgentSession, AsyncAgentSession from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties -from ..agents.types.start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr -from ..agents.types.start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor from ..agents.types.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from 
..agents.types.start_agents_request_properties_avatar_vendor import StartAgentsRequestPropertiesAvatarVendor -from ..agents.types.start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from ..agents.types.start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle -from ..agents.types.start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm -from ..agents.types.start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor from ..agents.types.update_agents_request_properties import UpdateAgentsRequestProperties from ..agents.types.get_agents_response import GetAgentsResponse from ..agents.types.list_agents_response import ListAgentsResponse @@ -52,11 +46,6 @@ from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption from ..agents.types.start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode -from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode -from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem from ..agents.types.start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence from ..agents.types.start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from 
..agents.types.start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures @@ -67,6 +56,13 @@ from ..agents.types.start_agents_request_properties_filler_words_content_static_config import StartAgentsRequestPropertiesFillerWordsContentStaticConfig from ..agents.types.start_agents_request_properties_filler_words_content_static_config_selection_rule import StartAgentsRequestPropertiesFillerWordsContentStaticConfigSelectionRule from ..types.tts import Tts +from ..types.asr import Asr +from ..types.llm import Llm +from ..types.llm_style import LlmStyle as GeneratedLlmStyle +from ..types.mllm import Mllm +from ..types.mllm_turn_detection import MllmTurnDetection +from ..types.mllm_turn_detection_mode import MllmTurnDetectionMode as GeneratedMllmTurnDetectionMode +from ..types.mllm_vendor import MllmVendor as GeneratedMllmVendor from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( AgentThinkAgentManagementRequestOnListeningAction, ) @@ -82,14 +78,14 @@ from .vendors.base import BaseAvatar, BaseLLM, BaseMLLM, BaseSTT, BaseTTS # Top-level aliases -LlmConfig = StartAgentsRequestPropertiesLlm -LlmStyle = StartAgentsRequestPropertiesLlmStyle -SttConfig = StartAgentsRequestPropertiesAsr +LlmConfig = Llm +LlmStyle = GeneratedLlmStyle +SttConfig = Asr AsrConfig = SttConfig -SttVendor = StartAgentsRequestPropertiesAsrVendor +SttVendor = typing.Any TtsConfig = Tts -MllmConfig = StartAgentsRequestPropertiesMllm -MllmVendor = StartAgentsRequestPropertiesMllmVendor +MllmConfig = Mllm +MllmVendor = GeneratedMllmVendor AvatarConfig = StartAgentsRequestPropertiesAvatar AvatarVendor = StartAgentsRequestPropertiesAvatarVendor TurnDetectionConfig = StartAgentsRequestPropertiesTurnDetection @@ -133,8 +129,8 @@ ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario InterruptionConfig = StartAgentsRequestPropertiesInterruption InterruptionMode = 
StartAgentsRequestPropertiesInterruptionMode -MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection -MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode +MllmTurnDetectionConfig = MllmTurnDetection +MllmTurnDetectionMode = GeneratedMllmTurnDetectionMode AgentConfig = StartAgentsRequestProperties AgentConfigUpdate = UpdateAgentsRequestProperties SessionInfo = GetAgentsResponse @@ -192,9 +188,9 @@ class SessionOptions(typing_extensions.TypedDict, total=False): warn: typing.Callable[[str], None] # LLM sub-type aliases -LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs -LlmGreetingConfigsMode = StartAgentsRequestPropertiesLlmGreetingConfigsMode -McpServersItem = StartAgentsRequestPropertiesLlmMcpServersItem +LlmGreetingConfigs = typing.Dict[str, typing.Any] +LlmGreetingConfigsMode = typing.Any +McpServersItem = typing.Dict[str, typing.Any] # Additional top-level config aliases GeofenceConfig = StartAgentsRequestPropertiesGeofence @@ -214,6 +210,78 @@ class SessionOptions(typing_extensions.TypedDict, total=False): from .token import generate_convo_ai_token, _parse_numeric_uid, _validate_expires_in +TurnDetectionLanguage = typing_extensions.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +] + +DEFAULT_TURN_DETECTION_LANGUAGE: TurnDetectionLanguage = "en-US" +TURN_DETECTION_LANGUAGE_VALUES: typing.Tuple[TurnDetectionLanguage, ...] 
= ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +) +_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) + def _dump_optional_model(value: typing.Any) -> typing.Any: if hasattr(value, "model_dump"): @@ -223,12 +291,28 @@ def _dump_optional_model(value: typing.Any) -> typing.Any: return value +def _is_turn_detection_language(value: typing.Any) -> bool: + return isinstance(value, str) and value in _TURN_DETECTION_LANGUAGES + + +def _validate_turn_detection_language(value: typing.Any) -> TurnDetectionLanguage: + if not _is_turn_detection_language(value): + raise ValueError(f"Invalid interaction language: {value}") + return value # type: ignore[return-value] + + class Agent: """A reusable agent definition. Use the fluent builder methods (.with_llm(), .with_tts(), .with_stt(), .with_mllm()) to configure vendor settings after construction. + Deprecated: + The Agent-level ``instructions``, ``greeting``, ``failure_message``, + ``max_history``, and ``greeting_configs`` convenience fields are kept + for compatibility. Configure those values on the LLM or MLLM vendor + instead. + Examples -------- >>> from agora_agent import Agent, OpenAI, ElevenLabsTTS, DeepgramSTT @@ -236,8 +320,8 @@ class Agent: >>> agent = Agent(instructions="You are a helpful voice assistant.") >>> agent = ( ... agent - ... .with_llm(OpenAI(api_key="...", model="gpt-4")) - ... .with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", sample_rate=24000)) + ... .with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")) + ... 
.with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1", sample_rate=24000)) ... .with_stt(DeepgramSTT(api_key="...", model="nova-2")) ... ) """ @@ -259,8 +343,10 @@ def __init__( rtc: typing.Optional[RtcConfig] = None, filler_words: typing.Optional[FillerWordsConfig] = None, greeting_configs: typing.Optional[LlmGreetingConfigs] = None, + pipeline_id: typing.Optional[str] = None, ): self._name = name + self._pipeline_id = pipeline_id self._instructions = instructions self._greeting = greeting self._failure_message = failure_message @@ -369,17 +455,19 @@ def with_interruption(self, config: InterruptionConfig) -> "Agent": return new_agent def with_instructions(self, instructions: str) -> "Agent": + """Deprecated. Configure system messages on the LLM vendor instead.""" new_agent = self._clone() new_agent._instructions = instructions return new_agent def with_greeting(self, greeting: str) -> "Agent": + """Deprecated. Configure the greeting on the LLM or MLLM vendor instead.""" new_agent = self._clone() new_agent._greeting = greeting return new_agent def with_greeting_configs(self, configs: LlmGreetingConfigs) -> "Agent": - """Returns a new Agent with greeting playback configuration.""" + """Deprecated. Configure greeting playback on the LLM vendor instead.""" new_agent = self._clone() new_agent._greeting_configs = configs return new_agent @@ -448,16 +536,13 @@ def with_audio_scenario(self, audio_scenario: ParametersAudioScenario) -> "Agent return new_agent def with_failure_message(self, message: str) -> "Agent": - """Returns a new Agent with the specified failure message. - - The failure message is played via TTS when the LLM call fails. - """ + """Deprecated. 
Configure the failure message on the LLM or MLLM vendor instead.""" new_agent = self._clone() new_agent._failure_message = message return new_agent def with_max_history(self, max_history: int) -> "Agent": - """Returns a new Agent with the specified maximum conversation history length.""" + """Deprecated. Configure max history on the LLM vendor instead.""" new_agent = self._clone() new_agent._max_history = max_history return new_agent @@ -526,6 +611,11 @@ def _resolved_parameters(self) -> typing.Optional[typing.Union[SessionParams, Se def name(self) -> typing.Optional[str]: return self._name + @property + def pipeline_id(self) -> typing.Optional[str]: + """Published AI Studio pipeline ID used as this agent's base configuration.""" + return self._pipeline_id + @property def llm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: return self._llm @@ -610,6 +700,7 @@ def filler_words(self) -> typing.Optional[FillerWordsConfig]: def config(self) -> typing.Dict[str, typing.Any]: return { "name": self._name, + "pipeline_id": self._pipeline_id, "instructions": self._instructions, "greeting": self._greeting, "failure_message": self._failure_message, @@ -804,6 +895,9 @@ def to_properties( base_kwargs["mllm"] = mllm_config return StartAgentsRequestProperties(**base_kwargs) + base_kwargs["asr"] = self._resolve_asr_config() + base_kwargs["turn_detection"] = self._resolve_turn_detection_config() + if skip_vendor_validation: return StartAgentsRequestProperties(**base_kwargs) @@ -829,14 +923,37 @@ def to_properties( base_kwargs["llm"] = llm_config base_kwargs["tts"] = self._tts - if self._stt is not None: - base_kwargs["asr"] = self._stt return StartAgentsRequestProperties(**base_kwargs) + def _resolve_asr_config(self) -> typing.Dict[str, typing.Any]: + asr_config = dict(self._stt or {}) + asr_config.pop("language", None) + if not asr_config: + asr_config["vendor"] = "ares" + return asr_config + + def _resolve_turn_detection_config(self) -> TurnDetectionConfig: + 
existing_stt_language = self._stt.get("language") if self._stt is not None else None + existing_turn_detection_language = self._field_value(self._turn_detection, "language") + language = ( + existing_turn_detection_language + if existing_turn_detection_language is not None + else existing_stt_language + if _is_turn_detection_language(existing_stt_language) + else DEFAULT_TURN_DETECTION_LANGUAGE + ) + language = _validate_turn_detection_language(language) + if self._turn_detection is None: + return StartAgentsRequestPropertiesTurnDetection(language=language) + if isinstance(self._turn_detection, dict): + return typing.cast(TurnDetectionConfig, {**self._turn_detection, "language": language}) + return self._copy_model_update(self._turn_detection, {"language": language}) + def _clone(self) -> "Agent": new_agent = Agent.__new__(Agent) new_agent._name = self._name + new_agent._pipeline_id = self._pipeline_id new_agent._llm = self._llm new_agent._tts = self._tts new_agent._stt = self._stt diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index a749d1e..5c866ac 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -52,7 +52,8 @@ class AgentSessionOptions(_AgentSessionRequiredOptions, total=False): Optional fields --------------- - app_certificate, token, idle_timeout, enable_string_uid, expires_in + app_certificate, token, idle_timeout, enable_string_uid, preset, + pipeline_id, expires_in, debug, warn """ app_certificate: str @@ -290,14 +291,18 @@ def _is_mllm_mode(self) -> bool: return True return mllm is not None - def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + def _build_start_properties( + self, + token_opts: typing.Dict[str, typing.Any], + skip_vendor_validation: bool, + ) -> typing.Dict[str, typing.Any]: base_properties = self._agent.to_properties( channel=self._channel, agent_uid=self._agent_uid, 
remote_uids=self._remote_uids, idle_timeout=self._idle_timeout, enable_string_uid=self._enable_string_uid, - skip_vendor_validation=True, + skip_vendor_validation=skip_vendor_validation, **token_opts, ) properties = self._dump_model(base_properties) @@ -416,7 +421,7 @@ class AgentSession(_AgentSessionBase): >>> >>> client = Agora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are a helpful voice assistant.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = session.start() >>> session.say("Hello!") @@ -445,6 +450,7 @@ def start(self) -> str: self._status = "starting" try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id if self._token: token_opts: typing.Dict[str, typing.Any] = {"token": self._token} else: @@ -454,7 +460,7 @@ def start(self) -> str: "expires_in": self._expires_in, } - properties = self._build_start_properties(token_opts) + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) resolved_preset, resolved_properties = resolve_session_presets( self._preset, properties, @@ -466,7 +472,7 @@ def start(self) -> str: "appid": self._app_id, "name": self._name, "preset": resolved_preset, - "pipeline_id": self._pipeline_id, + "pipeline_id": pipeline_id, "properties": resolved_properties, }) @@ -480,7 +486,7 @@ def start(self) -> str: name=self._name, properties=request_properties, preset=resolved_preset, - pipeline_id=self._pipeline_id, + pipeline_id=pipeline_id, 
request_options=self._request_options(), ) @@ -737,7 +743,7 @@ class AsyncAgentSession(_AgentSessionBase): >>> >>> client = AsyncAgora(area=Area.US, app_id="...", app_certificate="...") >>> agent = Agent(name="assistant", instructions="You are helpful.") - >>> agent = agent.with_llm(OpenAI(api_key="...", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...")) + >>> agent = agent.with_llm(OpenAI(api_key="...", base_url="https://api.openai.com/v1/chat/completions", model="gpt-4")).with_tts(ElevenLabsTTS(key="...", model_id="...", voice_id="...", base_url="wss://api.elevenlabs.io/v1")) >>> session = agent.create_async_session(client, channel="room-123", agent_uid="1", remote_uids=["100"]) >>> agent_id = await session.start() >>> await session.say("Hello!") @@ -766,6 +772,7 @@ async def start(self) -> str: self._status = "starting" try: + pipeline_id = self._pipeline_id if self._pipeline_id is not None else self._agent.pipeline_id if self._token: token_opts: typing.Dict[str, typing.Any] = {"token": self._token} else: @@ -775,7 +782,7 @@ async def start(self) -> str: "expires_in": self._expires_in, } - properties = self._build_start_properties(token_opts) + properties = self._build_start_properties(token_opts, skip_vendor_validation=bool(self._preset or pipeline_id)) resolved_preset, resolved_properties = resolve_session_presets( self._preset, properties, @@ -787,7 +794,7 @@ async def start(self) -> str: "appid": self._app_id, "name": self._name, "preset": resolved_preset, - "pipeline_id": self._pipeline_id, + "pipeline_id": pipeline_id, "properties": resolved_properties, }) @@ -801,7 +808,7 @@ async def start(self) -> str: name=self._name, properties=request_properties, preset=resolved_preset, - pipeline_id=self._pipeline_id, + pipeline_id=pipeline_id, request_options=self._request_options(), ) diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index b521867..9156a01 100644 --- 
a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -1,13 +1,11 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator -from ...agents.types.start_agents_request_properties_llm_greeting_configs import ( - StartAgentsRequestPropertiesLlmGreetingConfigs, -) from .base import BaseLLM -LlmGreetingConfigs = Union[StartAgentsRequestPropertiesLlmGreetingConfigs, Dict[str, Any]] +LlmGreetingConfigs = Dict[str, Any] +_OPENAI_MANAGED_MODELS = {"gpt-4o-mini", "gpt-4.1-mini", "gpt-5-nano", "gpt-5-mini"} def _ensure_mcp_transport(servers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -32,7 +30,7 @@ class OpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: Optional[str] = Field(default=None, description="OpenAI API key") - model: str = Field(default="gpt-4o-mini", description="Model name") + model: str = Field(..., description="Model name") base_url: Optional[str] = Field(default=None, description="Custom base URL") temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) @@ -50,6 +48,20 @@ class OpenAIOptions(BaseModel): mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") + @model_validator(mode="after") + def _validate_byok_params(self) -> "OpenAIOptions": + if not self.model: + raise ValueError("OpenAI requires model") + if self.api_key is not None and self.base_url is None: + raise ValueError("OpenAI requires base_url when api_key is set") + if self.api_key is None and self.base_url is not None: + raise ValueError("OpenAI base_url is only valid when api_key is set") + if self.api_key is None and self.model.strip().lower() not in 
_OPENAI_MANAGED_MODELS: + raise ValueError("OpenAI requires api_key unless using a supported Agora-managed model") + if self.api_key is None and self.vendor is not None: + raise ValueError("OpenAI Agora-managed mode does not allow vendor") + return self + class OpenAI(BaseLLM): def __init__(self, **kwargs: Any): self.options = OpenAIOptions(**kwargs) @@ -104,6 +116,7 @@ class AzureOpenAIOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Azure OpenAI API key") + model: str = Field(..., description="Azure deployment model name") endpoint: str = Field(..., description="Azure endpoint URL") deployment_name: str = Field(..., description="Azure deployment name") api_version: str = Field(default="2024-08-01-preview", description="Azure API version") @@ -142,7 +155,7 @@ def to_config(self) -> Dict[str, Any]: } # Named fields take precedence over anything in the generic params dict. - params: Dict[str, Any] = dict(self.options.params or {}) + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} if self.options.temperature is not None: params["temperature"] = self.options.temperature if self.options.top_p is not None: @@ -178,9 +191,9 @@ class AnthropicOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Anthropic API key") - model: str = Field(default="claude-3-5-sonnet-20241022", description="Model name") - url: Optional[str] = Field(default=None, description="Custom API endpoint URL") - max_tokens: Optional[int] = Field(default=None, gt=0) + model: str = Field(..., description="Model name") + url: str = Field(..., description="Anthropic messages endpoint URL") + max_tokens: int = Field(..., gt=0) temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -188,7 +201,7 @@ class 
AnthropicOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: Optional[Dict[str, Any]] = Field(default=None) - headers: Optional[Dict[str, str]] = Field(default=None) + headers: Dict[str, str] = Field(..., description="Anthropic request headers") output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -211,17 +224,16 @@ def to_config(self) -> Dict[str, Any]: params["top_p"] = self.options.top_p config: Dict[str, Any] = { - "url": self.options.url or "https://api.anthropic.com/v1/messages", + "url": self.options.url, "api_key": self.options.api_key, "params": params, + "headers": self.options.headers, "style": "anthropic", "input_modalities": self.options.input_modalities or ["text"], } if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages - if self.options.headers is not None: - config["headers"] = self.options.headers if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.failure_message is not None: @@ -246,7 +258,7 @@ class GeminiOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Google AI API key") - model: str = Field(default="gemini-2.0-flash-exp", description="Model name") + model: str = Field(..., description="Model name") url: Optional[str] = Field(default=None, description="Custom API endpoint URL") temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) @@ -317,8 +329,8 @@ class GroqOptions(OpenAIOptions): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="Groq API key") - model: str = Field(default="llama-3.3-70b-versatile", description="Model name") - base_url: 
Optional[str] = Field(default=None, description="Custom Groq-compatible endpoint") + model: str = Field(..., description="Model name") + base_url: str = Field(..., description="Groq-compatible endpoint") class Groq(BaseLLM): @@ -327,7 +339,7 @@ def __init__(self, **kwargs: Any): def to_config(self) -> Dict[str, Any]: config = OpenAI(**_dump_optional_model(self.options)).to_config() - config["url"] = self.options.base_url or "https://api.groq.com/openai/v1/chat/completions" + config["url"] = self.options.base_url return config @@ -372,11 +384,29 @@ def to_config(self) -> Dict[str, Any]: return config -class AmazonBedrockOptions(AnthropicOptions): +class AmazonBedrockOptions(BaseModel): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., description="Amazon Bedrock API key or gateway token") - url: str = Field(..., description="Amazon Bedrock proxy or runtime endpoint") + access_key: str = Field(..., description="AWS access key ID") + secret_key: str = Field(..., description="AWS secret access key") + region: str = Field(..., description="AWS region") + model: str = Field(..., description="Amazon Bedrock model identifier") + max_tokens: Optional[int] = Field(default=None, gt=0) + url: Optional[str] = Field(default=None, description="Amazon Bedrock converse stream endpoint URL") + temperature: Optional[float] = Field(default=None, ge=0.0, le=1.0) + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) + greeting_message: Optional[str] = Field(default=None) + failure_message: Optional[str] = Field(default=None) + input_modalities: Optional[List[str]] = Field(default=None) + params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) + output_modalities: Optional[List[str]] = Field(default=None) + greeting_configs: Optional[LlmGreetingConfigs] = Field(default=None) + template_variables: Optional[Dict[str, str]] = 
Field(default=None) + vendor: Optional[str] = Field(default=None) + mcp_servers: Optional[List[Dict[str, Any]]] = Field(default=None) + max_history: Optional[int] = Field(default=None, gt=0, description="Maximum number of conversation history messages to cache") class AmazonBedrock(BaseLLM): @@ -384,7 +414,45 @@ def __init__(self, **kwargs: Any): self.options = AmazonBedrockOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - return Anthropic(**_dump_optional_model(self.options)).to_config() + params: Dict[str, Any] = dict(self.options.params or {}) + if self.options.max_tokens is not None: + params["max_tokens"] = self.options.max_tokens + if self.options.temperature is not None: + params["temperature"] = self.options.temperature + if self.options.top_p is not None: + params["top_p"] = self.options.top_p + + config: Dict[str, Any] = { + "url": self.options.url or f"https://bedrock-runtime.{self.options.region}.amazonaws.com/model/{self.options.model}/converse-stream", + "access_key": self.options.access_key, + "secret_key": self.options.secret_key, + "region": self.options.region, + "model": self.options.model, + "params": params, + "style": "bedrock", + "input_modalities": self.options.input_modalities or ["text"], + } + if self.options.system_messages is not None: + config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers + if self.options.greeting_message is not None: + config["greeting_message"] = self.options.greeting_message + if self.options.failure_message is not None: + config["failure_message"] = self.options.failure_message + if self.options.output_modalities is not None: + config["output_modalities"] = self.options.output_modalities + if self.options.greeting_configs is not None: + config["greeting_configs"] = _dump_optional_model(self.options.greeting_configs) + if self.options.template_variables is not None: + config["template_variables"] = 
self.options.template_variables + if self.options.vendor is not None: + config["vendor"] = self.options.vendor + if self.options.mcp_servers is not None: + config["mcp_servers"] = _ensure_mcp_transport(self.options.mcp_servers) + if self.options.max_history is not None: + config["max_history"] = self.options.max_history + return config class DifyOptions(BaseModel): @@ -392,6 +460,7 @@ class DifyOptions(BaseModel): api_key: str = Field(..., description="Dify API key") url: str = Field(..., description="Dify workflow or chat endpoint") + model: str = Field(..., description="Dify model identifier") user: Optional[str] = Field(default=None, description="Dify user identifier") conversation_id: Optional[str] = Field(default=None, description="Dify conversation ID") system_messages: Optional[List[Dict[str, Any]]] = Field(default=None) @@ -413,7 +482,7 @@ def __init__(self, **kwargs: Any): self.options = DifyOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = dict(self.options.params or {}) + params: Dict[str, Any] = {"model": self.options.model, **(self.options.params or {})} if self.options.user is not None: params["user"] = self.options.user if self.options.conversation_id is not None: diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index b58f040..236a494 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -2,12 +2,10 @@ from pydantic import BaseModel, ConfigDict, Field -from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( - StartAgentsRequestPropertiesMllmTurnDetection, -) +from ...types.mllm_turn_detection import MllmTurnDetection from .base import BaseMLLM -MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection +MllmTurnDetectionConfig = MllmTurnDetection class OpenAIRealtimeOptions(BaseModel): @@ -15,6 +13,9 @@ class OpenAIRealtimeOptions(BaseModel): api_key: str = Field(..., 
description="OpenAI API key") model: Optional[str] = Field(default=None, description="Model name (e.g., gpt-4o-realtime-preview)") + voice: Optional[str] = Field(default=None, description="Voice identifier") + instructions: Optional[str] = Field(default=None, description="System instructions") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="Audio transcription settings") url: Optional[str] = Field(default=None, description="WebSocket URL") greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") @@ -36,13 +37,25 @@ def to_config(self) -> Dict[str, Any]: if self.options.url is not None: config["url"] = self.options.url - if self.options.model is not None: - params = {"model": self.options.model} + if ( + self.options.model is not None + or self.options.params is not None + or self.options.voice is not None + or self.options.instructions is not None + or self.options.input_audio_transcription is not None + ): + params: Dict[str, Any] = {} + if self.options.model is not None: + params["model"] = self.options.model if self.options.params is not None: params.update(self.options.params) + if self.options.voice is not None: + params["voice"] = self.options.voice + if self.options.instructions is not None: + params["instructions"] = self.options.instructions + if self.options.input_audio_transcription is not None: + params["input_audio_transcription"] = self.options.input_audio_transcription config["params"] = params - elif self.options.params is not None: - config["params"] = self.options.params if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.input_modalities is not None: @@ -128,6 +141,11 @@ class VertexAIOptions(BaseModel): adc_credentials_string: str = Field(..., description="Application Default Credentials JSON string") 
instructions: Optional[str] = Field(default=None, description="System instructions") voice: Optional[str] = Field(default=None, description="Voice name (e.g., Aoede, Charon)") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") greeting_message: Optional[str] = Field(default=None, description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") @@ -145,16 +163,26 @@ def to_config(self) -> Dict[str, Any]: # matching the TypeScript SDK. 
params: Dict[str, Any] = dict(self.options.additional_params or {}) params["model"] = self.options.model - params["project_id"] = self.options.project_id - params["location"] = self.options.location - params["adc_credentials_string"] = self.options.adc_credentials_string if self.options.instructions is not None: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options config: Dict[str, Any] = { "vendor": "vertexai", + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, "params": params, } @@ -184,6 +212,11 @@ class GeminiLiveOptions(BaseModel): url: Optional[str] = Field(default=None, description="WebSocket URL") instructions: Optional[str] = Field(default=None, description="System instructions") voice: Optional[str] = Field(default=None, description="Voice name") + affective_dialog: Optional[bool] = Field(default=None, description="Enable affective dialog") + proactive_audio: Optional[bool] = Field(default=None, description="Enable proactive audio") + transcribe_agent: Optional[bool] = Field(default=None, description="Transcribe agent speech") + transcribe_user: Optional[bool] = Field(default=None, description="Transcribe user speech") + http_options: Optional[Dict[str, Any]] = Field(default=None, description="HTTP options") greeting_message: Optional[str] = Field(default=None, 
description="Agent greeting message") input_modalities: Optional[List[str]] = Field(default=None, description="Input modalities") output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") @@ -205,6 +238,16 @@ def to_config(self) -> Dict[str, Any]: params["instructions"] = self.options.instructions if self.options.voice is not None: params["voice"] = self.options.voice + if self.options.affective_dialog is not None: + params["affective_dialog"] = self.options.affective_dialog + if self.options.proactive_audio is not None: + params["proactive_audio"] = self.options.proactive_audio + if self.options.transcribe_agent is not None: + params["transcribe_agent"] = self.options.transcribe_agent + if self.options.transcribe_user is not None: + params["transcribe_user"] = self.options.transcribe_user + if self.options.http_options is not None: + params["http_options"] = self.options.http_options config: Dict[str, Any] = { "vendor": "gemini", diff --git a/src/agora_agent/agentkit/vendors/stt.py b/src/agora_agent/agentkit/vendors/stt.py index a26e130..e5117b0 100644 --- a/src/agora_agent/agentkit/vendors/stt.py +++ b/src/agora_agent/agentkit/vendors/stt.py @@ -1,9 +1,88 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator +from typing_extensions import Literal from .base import BaseSTT +TurnDetectionLanguage = Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +] + +TURN_DETECTION_LANGUAGE_VALUES: Tuple[TurnDetectionLanguage, ...] 
= ( + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", +) +_TURN_DETECTION_LANGUAGES = set(TURN_DETECTION_LANGUAGE_VALUES) +_DEEPGRAM_MANAGED_MODELS = {"nova-2", "nova-3"} + + +def _turn_detection_language(language: Optional[str]) -> Optional[TurnDetectionLanguage]: + if language in _TURN_DETECTION_LANGUAGES: + return language # type: ignore[return-value] + return None + class SpeechmaticsSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -11,6 +90,7 @@ class SpeechmaticsSTTOptions(BaseModel): api_key: str = Field(..., description="Speechmatics API key") language: str = Field(..., description="Language code (e.g., en, es, fr)") model: Optional[str] = Field(default=None, description="Model name") + uri: Optional[str] = Field(default=None, description="Speechmatics streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) class SpeechmaticsSTT(BaseSTT): @@ -18,20 +98,24 @@ def __init__(self, **kwargs: Any): self.options = SpeechmaticsSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, "language": self.options.language, - } + }) if self.options.model is not None: params["model"] = self.options.model - if self.options.additional_params is not None: - params.update(self.options.additional_params) + if self.options.uri is not None: + params["uri"] = self.options.uri - return { + config: Dict[str, Any] = { "vendor": "speechmatics", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if 
turn_detection_language is not None: + config["language"] = turn_detection_language + return config class DeepgramSTTOptions(BaseModel): @@ -44,15 +128,21 @@ class DeepgramSTTOptions(BaseModel): punctuation: Optional[bool] = Field(default=None, description="Enable punctuation") additional_params: Optional[Dict[str, Any]] = Field(default=None) + @model_validator(mode="after") + def _validate_managed_model(self) -> "DeepgramSTTOptions": + if self.api_key is None and (self.model is None or self.model.strip().lower() not in _DEEPGRAM_MANAGED_MODELS): + raise ValueError("DeepgramSTT requires api_key unless using a supported Agora-managed model") + return self + class DeepgramSTT(BaseSTT): def __init__(self, **kwargs: Any): self.options = DeepgramSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {} + params: Dict[str, Any] = dict(self.options.additional_params or {}) if self.options.api_key is not None: - params["api_key"] = self.options.api_key + params["key"] = self.options.api_key if self.options.model is not None: params["model"] = self.options.model if self.options.language is not None: @@ -61,14 +151,14 @@ def to_config(self) -> Dict[str, Any]: params["smart_format"] = self.options.smart_format if self.options.punctuation is not None: params["punctuation"] = self.options.punctuation - if self.options.additional_params is not None: - params.update(self.options.additional_params) - - return { + config: Dict[str, Any] = { "vendor": "deepgram", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config class MicrosoftSTTOptions(BaseModel): @@ -76,7 +166,7 @@ class MicrosoftSTTOptions(BaseModel): key: str = Field(..., description="Azure subscription key") region: str = Field(..., description="Azure region (e.g., eastus)") - language: Optional[str] = 
Field(default=None, description="Language code (e.g., en-US)") + language: str = Field(..., description="Language code (e.g., en-US)") additional_params: Optional[Dict[str, Any]] = Field(default=None) class MicrosoftSTT(BaseSTT): @@ -84,20 +174,22 @@ def __init__(self, **kwargs: Any): self.options = MicrosoftSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "key": self.options.key, "region": self.options.region, - } + }) if self.options.language is not None: params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) - return { + config: Dict[str, Any] = { "vendor": "microsoft", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config class OpenAISTTOptions(BaseModel): @@ -106,6 +198,8 @@ class OpenAISTTOptions(BaseModel): api_key: str = Field(..., description="OpenAI API key") model: Optional[str] = Field(default=None, description="Model (default: whisper-1)") language: Optional[str] = Field(default=None, description="Language code") + prompt: Optional[str] = Field(default=None, description="Prompt that guides OpenAI transcription") + input_audio_transcription: Optional[Dict[str, Any]] = Field(default=None, description="OpenAI transcription settings") additional_params: Optional[Dict[str, Any]] = Field(default=None) class OpenAISTT(BaseSTT): @@ -113,25 +207,36 @@ def __init__(self, **kwargs: Any): self.options = OpenAISTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": self.options.api_key} + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + transcription = {"model": 
"whisper-1", **(self.options.input_audio_transcription or {})} if self.options.model is not None: - params["model"] = self.options.model - if self.options.additional_params is not None: - params.update(self.options.additional_params) + transcription["model"] = self.options.model + if self.options.prompt is not None: + transcription["prompt"] = self.options.prompt + if self.options.language is not None: + transcription["language"] = self.options.language + params["input_audio_transcription"] = transcription - return { + config: Dict[str, Any] = { "vendor": "openai", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config class GoogleSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - api_key: str = Field(..., description="Google Cloud API key") - language: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") + project_id: str = Field(..., description="Google Cloud project ID") + location: str = Field(..., description="Google Cloud region") + adc_credentials_string: str = Field(..., description="Google service account credentials JSON string") + language: str = Field(..., description="Language code (e.g., en-US)") + model: Optional[str] = Field(default=None, description="Recognition model") additional_params: Optional[Dict[str, Any]] = Field(default=None) class GoogleSTT(BaseSTT): @@ -139,18 +244,26 @@ def __init__(self, **kwargs: Any): self.options = GoogleSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": self.options.api_key} + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "project_id": self.options.project_id, + "location": self.options.location, + "adc_credentials_string": self.options.adc_credentials_string, + }) if self.options.language is not None: 
params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) + if self.options.model is not None: + params["model"] = self.options.model - return { + config: Dict[str, Any] = { "vendor": "google", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config class AmazonSTTOptions(BaseModel): @@ -159,7 +272,7 @@ class AmazonSTTOptions(BaseModel): access_key: str = Field(..., description="AWS Access Key ID") secret_key: str = Field(..., description="AWS Secret Access Key") region: str = Field(..., description="AWS region (e.g., us-east-1)") - language: Optional[str] = Field(default=None, description="Language code") + language: str = Field(..., description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AmazonSTT(BaseSTT): @@ -167,28 +280,31 @@ def __init__(self, **kwargs: Any): self.options = AmazonSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "access_key": self.options.access_key, - "secret_key": self.options.secret_key, + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ + "access_key_id": self.options.access_key, + "secret_access_key": self.options.secret_key, "region": self.options.region, - } + }) if self.options.language is not None: - params["language"] = self.options.language - if self.options.additional_params is not None: - params.update(self.options.additional_params) + params["language_code"] = self.options.language - return { + config: Dict[str, Any] = { "vendor": "amazon", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = 
turn_detection_language + return config class AssemblyAISTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") api_key: str = Field(..., description="AssemblyAI API key") - language: Optional[str] = Field(default=None, description="Language code") + language: str = Field(..., description="Language code") + uri: Optional[str] = Field(default=None, description="AssemblyAI streaming WebSocket URL") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AssemblyAISTT(BaseSTT): @@ -196,21 +312,27 @@ def __init__(self, **kwargs: Any): self.options = AssemblyAISTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"api_key": self.options.api_key} - if self.options.additional_params is not None: - params.update(self.options.additional_params) + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params["api_key"] = self.options.api_key + if self.options.language is not None: + params["language"] = self.options.language + if self.options.uri is not None: + params["uri"] = self.options.uri - return { + config: Dict[str, Any] = { "vendor": "assemblyai", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config class AresSTTOptions(BaseModel): model_config = ConfigDict(extra="forbid") - language: Optional[str] = Field(default=None, description="Language code") + language: Optional[TurnDetectionLanguage] = Field(default=None, description="Language code") additional_params: Optional[Dict[str, Any]] = Field(default=None) class AresSTT(BaseSTT): @@ -239,17 +361,19 @@ def __init__(self, **kwargs: Any): self.options = SarvamSTTOptions(**kwargs) def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, 
"language": self.options.language, - } + }) if self.options.model is not None: params["model"] = self.options.model - if self.options.additional_params is not None: - params.update(self.options.additional_params) - return { + config: Dict[str, Any] = { "vendor": "sarvam", - "language": self.options.language, "params": params, } + turn_detection_language = _turn_detection_language(self.options.language) + if turn_detection_language is not None: + config["language"] = turn_detection_language + return config diff --git a/src/agora_agent/agentkit/vendors/tts.py b/src/agora_agent/agentkit/vendors/tts.py index 557ea56..a052ea5 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -1,9 +1,9 @@ from typing import Any, Dict, List, Optional -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from .base import BaseTTS, CartesiaSampleRate, ElevenLabsSampleRate, GoogleTTSSampleRate, MicrosoftSampleRate - +from ..presets import MiniMaxPresetModels, OpenAITtsPresetModels class ElevenLabsTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -11,7 +11,7 @@ class ElevenLabsTTSOptions(BaseModel): key: str = Field(..., description="ElevenLabs API key") model_id: str = Field(..., description="Model ID (e.g., eleven_flash_v2_5)") voice_id: str = Field(..., description="Voice ID") - base_url: Optional[str] = Field(default=None, description="WebSocket base URL") + base_url: str = Field(..., description="WebSocket base URL") sample_rate: Optional[ElevenLabsSampleRate] = Field(default=None, description="Sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) optimize_streaming_latency: Optional[int] = Field(default=None, ge=0, le=4) @@ -31,12 +31,11 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "key": self.options.key, + "base_url": self.options.base_url, "model_id": 
self.options.model_id, "voice_id": self.options.voice_id, } - if self.options.base_url is not None: - params["base_url"] = self.options.base_url if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate if self.options.optimize_streaming_latency is not None: @@ -63,6 +62,8 @@ class MicrosoftTTSOptions(BaseModel): region: str = Field(..., description="Azure region (e.g., eastus)") voice_name: str = Field(..., description="Voice name") sample_rate: Optional[MicrosoftSampleRate] = Field(default=None, description="Sample rate in Hz") + speed: Optional[float] = Field(default=None, description="Speaking rate multiplier") + volume: Optional[float] = Field(default=None, description="Audio volume") skip_patterns: Optional[List[int]] = Field(default=None) class MicrosoftTTS(BaseTTS): @@ -82,6 +83,10 @@ def to_config(self) -> Dict[str, Any]: if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate + if self.options.speed is not None: + params["speed"] = self.options.speed + if self.options.volume is not None: + params["volume"] = self.options.volume result: Dict[str, Any] = {"vendor": "microsoft", "params": params} if self.options.skip_patterns is not None: @@ -95,10 +100,31 @@ class OpenAITTSOptions(BaseModel): api_key: Optional[str] = Field(default=None, description="OpenAI API key") voice: str = Field(..., description="Voice name (alloy, echo, fable, onyx, nova, shimmer)") model: Optional[str] = Field(default=None, description="Model name (tts-1, tts-1-hd)") - response_format: Optional[str] = Field(default=None, description="Audio format (e.g., pcm)") + base_url: Optional[str] = Field(default=None, description="Endpoint URL") + instructions: Optional[str] = Field(default=None, description="Custom voice instructions") speed: Optional[float] = Field(default=None, description="Speech speed multiplier") skip_patterns: Optional[List[int]] = Field(default=None) + @model_validator(mode="after") + def 
_validate_byok_params(self) -> "OpenAITTSOptions": + if self.api_key is not None: + missing = [ + name + for name, value in ( + ("model", self.model), + ("base_url", self.base_url), + ) + if value is None + ] + if missing: + raise ValueError(f"OpenAITTS requires {', '.join(missing)} when api_key is set") + else: + if self.model is not None and self.model.strip().lower() not in OpenAITtsPresetModels: + raise ValueError("OpenAITTS requires api_key unless using the Agora-managed tts-1 model") + if self.base_url is not None: + raise ValueError("OpenAITTS base_url is only valid when api_key is set") + return self + class OpenAITTS(BaseTTS): def __init__(self, **kwargs: Any): self.options = OpenAITTSOptions(**kwargs) @@ -113,11 +139,13 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: params["api_key"] = self.options.api_key - - if self.options.model is not None: + params["base_url"] = self.options.base_url params["model"] = self.options.model - if self.options.response_format is not None: - params["response_format"] = self.options.response_format + elif self.options.model is not None: + params["model"] = self.options.model + + if self.options.instructions is not None: + params["instructions"] = self.options.instructions if self.options.speed is not None: params["speed"] = self.options.speed @@ -132,7 +160,9 @@ class CartesiaTTSOptions(BaseModel): api_key: str = Field(..., description="Cartesia API key") voice_id: str = Field(..., description="Voice ID") - model_id: Optional[str] = Field(default=None, description="Model ID") + model_id: str = Field(..., description="Model ID") + base_url: Optional[str] = Field(default=None, description="WebSocket URL") + language: Optional[str] = Field(default=None, description="Target language") sample_rate: Optional[CartesiaSampleRate] = Field(default=None, description="Sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) @@ -147,13 +177,16 @@ def sample_rate(self) -> 
Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { "api_key": self.options.api_key, + "model_id": self.options.model_id, "voice": {"mode": "id", "id": self.options.voice_id}, } - if self.options.model_id is not None: - params["model_id"] = self.options.model_id + if self.options.base_url is not None: + params["base_url"] = self.options.base_url if self.options.sample_rate is not None: - params["sample_rate"] = self.options.sample_rate + params["output_format"] = {"container": "raw", "sample_rate": self.options.sample_rate} + if self.options.language is not None: + params["language"] = self.options.language result: Dict[str, Any] = {"vendor": "cartesia", "params": params} if self.options.skip_patterns is not None: @@ -164,7 +197,7 @@ def to_config(self) -> Dict[str, Any]: class GoogleTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") - key: str = Field(..., description="Google Cloud API key") + key: str = Field(..., description="Google Cloud service account credentials JSON string") voice_name: str = Field(..., description="Voice name") language_code: Optional[str] = Field(default=None, description="Language code (e.g., en-US)") sample_rate_hertz: Optional[GoogleTTSSampleRate] = Field(default=None, description="Sample rate in Hz") @@ -180,14 +213,14 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, - "voice_name": self.options.voice_name, + "credentials": self.options.key, + "VoiceSelectionParams": {"name": self.options.voice_name}, } if self.options.language_code is not None: - params["language_code"] = self.options.language_code + params["VoiceSelectionParams"]["language_code"] = self.options.language_code if self.options.sample_rate_hertz is not None: - params["sample_rate_hertz"] = self.options.sample_rate_hertz + params["AudioConfig"] = {"sample_rate_hertz": self.options.sample_rate_hertz} result: Dict[str, Any] = {"vendor": 
"google", "params": params} if self.options.skip_patterns is not None: @@ -202,6 +235,7 @@ class AmazonTTSOptions(BaseModel): secret_key: str = Field(..., description="AWS secret key") region: str = Field(..., description="AWS region (e.g., us-east-1)") voice_id: str = Field(..., description="Amazon Polly voice ID") + engine: str = Field(..., description="Amazon Polly engine type") skip_patterns: Optional[List[int]] = Field(default=None) class AmazonTTS(BaseTTS): @@ -214,10 +248,11 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "access_key": self.options.access_key, - "secret_key": self.options.secret_key, - "region": self.options.region, - "voice_id": self.options.voice_id, + "aws_access_key_id": self.options.access_key, + "aws_secret_access_key": self.options.secret_key, + "region_name": self.options.region, + "voice": self.options.voice_id, + "engine": self.options.engine, } result: Dict[str, Any] = {"vendor": "amazon", "params": params} @@ -233,7 +268,7 @@ class DeepgramTTSOptions(BaseModel): model: str = Field(..., description="Deepgram TTS model (e.g., 'aura-2-thalia-en')") base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") sample_rate: Optional[int] = Field(default=None, description="Sample rate in Hz") - params: Optional[Dict[str, Any]] = Field(default=None, description="Additional Deepgram TTS parameters") + additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional Deepgram TTS parameters") skip_patterns: Optional[List[int]] = Field(default=None) class DeepgramTTS(BaseTTS): @@ -245,17 +280,16 @@ def sample_rate(self) -> Optional[int]: return self.options.sample_rate def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { + params: Dict[str, Any] = dict(self.options.additional_params or {}) + params.update({ "api_key": self.options.api_key, "model": self.options.model, - **(self.options.params or {}), - } + }) if 
self.options.base_url is not None: params["base_url"] = self.options.base_url if self.options.sample_rate is not None: params["sample_rate"] = self.options.sample_rate - result: Dict[str, Any] = {"vendor": "deepgram", "params": params} if self.options.skip_patterns is not None: result["skip_patterns"] = self.options.skip_patterns @@ -267,6 +301,11 @@ class HumeAITTSOptions(BaseModel): key: str = Field(..., description="Hume AI API key") config_id: Optional[str] = Field(default=None, description="Configuration ID") + voice_id: str = Field(..., description="Hume AI voice ID") + base_url: Optional[str] = Field(default=None, description="Base URL") + provider: str = Field(..., description="Voice provider type") + speed: Optional[float] = Field(default=None, description="Playback speed") + trailing_silence: Optional[float] = Field(default=None, description="Trailing silence in seconds") skip_patterns: Optional[List[int]] = Field(default=None) class HumeAITTS(BaseTTS): @@ -278,10 +317,20 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = {"key": self.options.key} + params: Dict[str, Any] = { + "key": self.options.key, + "voice_id": self.options.voice_id, + "provider": self.options.provider, + } if self.options.config_id is not None: params["config_id"] = self.options.config_id + if self.options.base_url is not None: + params["base_url"] = self.options.base_url + if self.options.speed is not None: + params["speed"] = self.options.speed + if self.options.trailing_silence is not None: + params["trailing_silence"] = self.options.trailing_silence result: Dict[str, Any] = {"vendor": "humeai", "params": params} if self.options.skip_patterns is not None: @@ -294,10 +343,8 @@ class RimeTTSOptions(BaseModel): key: str = Field(..., description="Rime API key") speaker: str = Field(..., description="Speaker ID") - model_id: Optional[str] = Field(default=None, description="Model ID") - lang: Optional[str] = 
Field(default=None, description="Language code") - sampling_rate: Optional[int] = Field(default=None, description="Sampling rate in Hz") - speed_alpha: Optional[float] = Field(default=None, description="Speed multiplier") + model_id: str = Field(..., description="Model ID") + base_url: Optional[str] = Field(default=None, description="WebSocket URL") skip_patterns: Optional[List[int]] = Field(default=None) class RimeTTS(BaseTTS): @@ -310,18 +357,12 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_key": self.options.key, "speaker": self.options.speaker, + "modelId": self.options.model_id, } - - if self.options.model_id is not None: - params["model_id"] = self.options.model_id - if self.options.lang is not None: - params["lang"] = self.options.lang - if self.options.sampling_rate is not None: - params["samplingRate"] = self.options.sampling_rate - if self.options.speed_alpha is not None: - params["speedAlpha"] = self.options.speed_alpha + if self.options.base_url is not None: + params["base_url"] = self.options.base_url result: Dict[str, Any] = {"vendor": "rime", "params": params} if self.options.skip_patterns is not None: @@ -334,6 +375,7 @@ class FishAudioTTSOptions(BaseModel): key: str = Field(..., description="Fish Audio API key") reference_id: str = Field(..., description="Reference ID") + backend: str = Field(..., description="Backend") skip_patterns: Optional[List[int]] = Field(default=None) class FishAudioTTS(BaseTTS): @@ -346,8 +388,9 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_key": self.options.key, "reference_id": self.options.reference_id, + "backend": self.options.backend, } result: Dict[str, Any] = {"vendor": "fishaudio", "params": params} @@ -366,6 +409,24 @@ class MiniMaxTTSOptions(BaseModel): url: Optional[str] = Field(default=None, description="WebSocket 
endpoint (e.g., 'wss://api-uw.minimax.io/ws/v1/t2a_v2')") skip_patterns: Optional[List[int]] = Field(default=None) + @model_validator(mode="after") + def _validate_byok_params(self) -> "MiniMaxTTSOptions": + if self.key is not None: + missing = [ + name + for name, value in ( + ("group_id", self.group_id), + ("voice_id", self.voice_id), + ("url", self.url), + ) + if value is None + ] + if missing: + raise ValueError(f"MiniMaxTTS requires {', '.join(missing)} when key is set") + elif self.model.strip().lower() not in MiniMaxPresetModels: + raise ValueError("MiniMaxTTS requires key unless using a supported Agora-managed model") + return self + class MiniMaxTTS(BaseTTS): def __init__(self, **kwargs: Any): self.options = MiniMaxTTSOptions(**kwargs) @@ -397,6 +458,10 @@ class SarvamTTSOptions(BaseModel): key: str = Field(..., description="Sarvam API subscription key") speaker: str = Field(..., description="Speaker/voice ID (e.g., 'anushka', 'abhilash', 'karun', 'hitesh', 'manisha', 'vidya', 'arya')") target_language_code: str = Field(..., description="Target language code (e.g., 'en-IN', 'hi-IN', 'ta-IN')") + pitch: Optional[float] = Field(default=None, description="Pitch adjustment") + pace: Optional[float] = Field(default=None, description="Speed of speech") + loudness: Optional[float] = Field(default=None, description="Volume level") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate in Hz") skip_patterns: Optional[List[int]] = Field(default=None) class SarvamTTS(BaseTTS): @@ -409,10 +474,18 @@ def sample_rate(self) -> Optional[int]: def to_config(self) -> Dict[str, Any]: params: Dict[str, Any] = { - "key": self.options.key, + "api_subscription_key": self.options.key, "speaker": self.options.speaker, "target_language_code": self.options.target_language_code, } + if self.options.pitch is not None: + params["pitch"] = self.options.pitch + if self.options.pace is not None: + params["pace"] = self.options.pace + if self.options.loudness is 
not None: + params["loudness"] = self.options.loudness + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate result: Dict[str, Any] = {"vendor": "sarvam", "params": params} if self.options.skip_patterns is not None: @@ -424,8 +497,13 @@ class MurfTTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") key: str = Field(..., description="Murf API key") - voice_id: str = Field(..., description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") - style: Optional[str] = Field(default=None, description="Voice style (e.g., 'Angry', 'Sad', 'Conversational', 'Newscast')") + voice_id: Optional[str] = Field(default=None, description="Voice ID (e.g., 'Ariana', 'Natalie', 'Ken')") + base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") + locale: Optional[str] = Field(default=None, description="Voice locale") + rate: Optional[float] = Field(default=None, description="Speech rate") + pitch: Optional[float] = Field(default=None, description="Pitch adjustment") + model: Optional[str] = Field(default=None, description="TTS model") + sample_rate: Optional[int] = Field(default=None, description="Audio sample rate") skip_patterns: Optional[List[int]] = Field(default=None) class MurfTTS(BaseTTS): @@ -437,13 +515,22 @@ def sample_rate(self) -> Optional[int]: return None def to_config(self) -> Dict[str, Any]: - params: Dict[str, Any] = { - "key": self.options.key, - "voice_id": self.options.voice_id, - } + params: Dict[str, Any] = {"api_key": self.options.key} - if self.options.style is not None: - params["style"] = self.options.style + if self.options.base_url is not None: + params["base_url"] = self.options.base_url + if self.options.voice_id is not None: + params["voiceId"] = self.options.voice_id + if self.options.locale is not None: + params["locale"] = self.options.locale + if self.options.rate is not None: + params["rate"] = self.options.rate + if self.options.pitch is not None: + params["pitch"] = 
self.options.pitch + if self.options.model is not None: + params["model"] = self.options.model + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate result: Dict[str, Any] = {"vendor": "murf", "params": params} if self.options.skip_patterns is not None: diff --git a/src/agora_agent/agents/client.py b/src/agora_agent/agents/client.py index 3f6af4c..e923c9a 100644 --- a/src/agora_agent/agents/client.py +++ b/src/agora_agent/agents/client.py @@ -84,11 +84,16 @@ def start( Examples -------- - from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Agora, + Asr_Ares, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -108,9 +113,7 @@ def start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -118,13 +121,15 @@ def start( voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", @@ -641,11 +646,16 @@ async def start( -------- import asyncio - from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft + from agora_agent import ( + Asr_Ares, + AsyncAgora, + Llm, + LlmParams, + MicrosoftTtsParams, + Tts_Microsoft, + ) from agora_agent.agents import ( 
StartAgentsRequestProperties, - StartAgentsRequestPropertiesAsr, - StartAgentsRequestPropertiesLlm, StartAgentsRequestPropertiesTurnDetection, StartAgentsRequestPropertiesTurnDetectionConfig, StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeech, @@ -668,9 +678,7 @@ async def main() -> None: agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - asr=StartAgentsRequestPropertiesAsr( - language="en-US", - ), + asr=Asr_Ares(), tts=Tts_Microsoft( params=MicrosoftTtsParams( key="key", @@ -678,13 +686,15 @@ async def main() -> None: voice_name="voice_name", ), ), - llm=StartAgentsRequestPropertiesLlm( + llm=Llm( url="https://api.openai.com/v1/chat/completions", api_key="", system_messages=[ {"role": "system", "content": "You are a helpful chatbot."} ], - params={"model": "gpt-4o-mini"}, + params=LlmParams( + model="gpt-4o-mini", + ), max_history=32, greeting_message="Hello, how can I assist you today?", failure_message="Please hold on a second.", diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index 06c3482..3cddb7e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -5,15 +5,15 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr import Asr +from ...types.llm import Llm +from ...types.mllm import Mllm from ...types.tts import Tts from .start_agents_request_properties_advanced_features import StartAgentsRequestPropertiesAdvancedFeatures -from .start_agents_request_properties_asr import StartAgentsRequestPropertiesAsr from .start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import 
StartAgentsRequestPropertiesGeofence from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption -from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm -from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters from .start_agents_request_properties_rtc import StartAgentsRequestPropertiesRtc from .start_agents_request_properties_sal import StartAgentsRequestPropertiesSal @@ -67,7 +67,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Advanced features configuration. """ - asr: typing.Optional[StartAgentsRequestPropertiesAsr] = pydantic.Field(default=None) + asr: typing.Optional[Asr] = pydantic.Field(default=None) """ Automatic Speech Recognition (ASR) configuration. """ @@ -77,12 +77,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): Text-to-speech (TTS) module configuration. """ - llm: typing.Optional[StartAgentsRequestPropertiesLlm] = pydantic.Field(default=None) + llm: typing.Optional[Llm] = pydantic.Field(default=None) """ Large language model (LLM) configuration. """ - mllm: typing.Optional[StartAgentsRequestPropertiesMllm] = pydantic.Field(default=None) + mllm: typing.Optional[Mllm] = pydantic.Field(default=None) """ Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr.py b/src/agora_agent/agents/types/start_agents_request_properties_asr.py deleted file mode 100644 index 7385e17..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr.py +++ /dev/null @@ -1,47 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_asr_vendor import StartAgentsRequestPropertiesAsrVendor - - -class StartAgentsRequestPropertiesAsr(UncheckedBaseModel): - """ - Automatic Speech Recognition (ASR) configuration. - """ - - language: typing.Optional[str] = pydantic.Field(default=None) - """ - The BCP-47 language tag identifying the primary language used for agent interaction. If `params` contains a vendor-specific language code, it takes precedence over this setting. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesAsrVendor] = pydantic.Field(default=None) - """ - ASR provider: - - `ares`: Adaptive Recognition Engine for Speech - - `microsoft`: Microsoft Azure - - `deepgram`: Deepgram - - `openai`: OpenAI (Beta) - - `speechmatics`: Speechmatics - - `assemblyai`: AssemblyAI (Beta) - - `amazon`: Amazon Transcribe (Beta) - - `google`: Google (Beta) - - `sarvam`: Sarvam (Beta) - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - The configuration parameters for the ASR vendor. See [ASR Overview](https://docs.agora.io/en/conversational-ai/models/asr/overview) for details. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py deleted file mode 100644 index 973d62c..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_asr_vendor.py +++ /dev/null @@ -1,10 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesAsrVendor = typing.Union[ - typing.Literal[ - "ares", "microsoft", "deepgram", "openai", "google", "amazon", "assemblyai", "speechmatics", "sarvam" - ], - typing.Any, -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py deleted file mode 100644 index 9ab0f62..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ /dev/null @@ -1,115 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs -from .start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem -from .start_agents_request_properties_llm_style import StartAgentsRequestPropertiesLlmStyle - - -class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): - """ - Large language model (LLM) configuration. - """ - - url: str = pydantic.Field() - """ - The LLM callback address. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The LLM verification API key. The default value is an empty string. Ensure that you enable the API key in a production environment. - """ - - system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - A set of predefined information used as input to the LLM, including prompt words and examples. - """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional LLM configuration parameters, such as the `model` used, and the maximum token limit. 
For details about each supported LLM, refer to [Supported LLMs](https://docs.agora.io/en/conversational-ai/models/llm/overview#supported-llms). - """ - - max_history: typing.Optional[int] = pydantic.Field(default=None) - """ - The number of conversation history messages cached in the custom LLM. History includes user and agent dialog messages, tool call information, and timestamps. Agent and user messages are recorded separately. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM input modalities: - - `["text"]`: Text only - - `["text", "image"]`: Text plus image. Recommended configuration, requires the selected LLM to support visual input - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - LLM output modalities: - - `["text"]`: The output text is converted to speech by the TTS module and then published to the RTC channel. - - `["audio"]`: Voice only. Voice is published directly to the RTC channel. - - `["text", "audio"]`: Text plus voice. Write your own logic to process the output of LLM as needed. - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting. If provided, the first user in the channel is automatically greeted with the message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Prompt for agent activation failure. If provided, it is returned through TTS when the custom LLM call fails. - """ - - vendor: typing.Optional[str] = pydantic.Field(default=None) - """ - LLM provider, supports the following settings: - - `custom`: Custom LLM. When you set this option, the agent includes the following fields, in addition to `role` and `content` when making requests to the custom LLM: - - `turn_id`: A unique identifier for each conversation turn. It starts from `0` and increments with each turn. One user-agent interaction corresponds to one `turn_id`. 
- - `timestamp`: The request timestamp, in milliseconds. - - `azure`: Use this value for Azure OpenAI - """ - - style: typing.Optional[StartAgentsRequestPropertiesLlmStyle] = pydantic.Field(default=None) - """ - The request style for chat completion: - - `openai`: For OpenAI and OpenAI-compatible APIs - - `gemini`: For Google Gemini and Google Vertex API format - - `anthropic`: For Anthropic Claude API format - - `dify`: For Dify API format - """ - - greeting_configs: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigs] = pydantic.Field(default=None) - """ - Agent greeting broadcast configuration. - """ - - template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Template parameter configuration used to insert variables into the agent's `system_messages`, `greeting_message`, `failure_message`, and `parameters.silence_config.content` text. Uses key-value pairs, where the key is the variable name and the value is the variable's value. To insert defined variables in the prompt text, use the syntax `{{variable_name}}`. The system automatically replaces each variable with the corresponding value defined in `template_variables`. Variable values cannot reference other variables. - """ - - mcp_servers: typing.Optional[typing.List[StartAgentsRequestPropertiesLlmMcpServersItem]] = pydantic.Field( - default=None - ) - """ - MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py deleted file mode 100644 index c0d7046..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ /dev/null @@ -1,43 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_llm_greeting_configs_mode import ( - StartAgentsRequestPropertiesLlmGreetingConfigsMode, -) - - -class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - """ - Agent greeting broadcast configuration. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesLlmGreetingConfigsMode] = pydantic.Field(default=None) - """ - Determines when the agent sends greeting messages to users joining the channel. - - `single_every`: Broadcasts a greeting every time a user joins the channel. - - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. - """ - - delay_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The delay in milliseconds before the agent plays the greeting message after a user joins the channel. - """ - - interruptable: typing.Optional[bool] = pydantic.Field(default=None) - """ - - `true`: Follows the global `interruption` configuration. - - `false`: Uninterruptible. The greeting plays in its entirety. 
If the user speaks multiple times while the greeting plays, the system merges the speech segments after the greeting ends and sends them to the LLM for a single response. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py deleted file mode 100644 index 44e4a55..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesLlmGreetingConfigsMode = typing.Union[ - typing.Literal["single_every", "single_first"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py deleted file mode 100644 index 0474072..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_mcp_servers_item.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesLlmMcpServersItem(UncheckedBaseModel): - name: str = pydantic.Field() - """ - A unique identifier for the MCP server. Maximum 48 characters. Accepts only English letters and numbers. - """ - - endpoint: str = pydantic.Field() - """ - The endpoint address of the MCP server. The agent uses this to communicate with the MCP server. 
- """ - - transport: typing.Optional[typing.Literal["streamable_http"]] = pydantic.Field(default=None) - """ - Transport protocol type. - - `streamable_http`: Streaming HTTP protocol - """ - - headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) - """ - HTTP header information to include when requesting the MCP server, such as authentication information. - """ - - allowed_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - A list of tools that the agent is allowed to invoke. The agent can only use tools on this list. - - Empty or omitted: All tools are enabled. - - Empty array `[]`: No tools are enabled. - - `["*"]`: All tools are enabled. - - Specific tools `["aa", "bb"]`: Only listed tools are enabled. - - Mix with wildcard `["aa", "*"]`: All tools are enabled (wildcard takes precedence). - """ - - timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - The MCP server request timeout in milliseconds. After timeout, the agent stops waiting for the MCP server's response and continues executing subsequent logic. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py deleted file mode 100644 index eaa9a0d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_style.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesLlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py deleted file mode 100644 index 0993ebc..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ /dev/null @@ -1,86 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection -from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor - - -class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - """ - Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. `mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. - """ - - enable: typing.Optional[bool] = pydantic.Field(default=None) - """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. - """ - - url: typing.Optional[str] = pydantic.Field(default=None) - """ - The MLLM WebSocket URL for real-time communication. - """ - - api_key: typing.Optional[str] = pydantic.Field(default=None) - """ - The API key used for MLLM authentication. - """ - - messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) - """ - Array of conversation items used for short-term memory management. Uses the same structure as `item.content` from the OpenAI Realtime API. 
- """ - - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. - """ - - input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM input modalities: - - `["audio"]`: Audio only - - `["audio", "text"]`: Audio plus text - """ - - output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) - """ - MLLM output modalities: - - `["text", "audio"]`: Text plus audio - """ - - greeting_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. - """ - - failure_message: typing.Optional[str] = pydantic.Field(default=None) - """ - Agent failure message. If provided, the agent speaks this message when an MLLM request fails. - """ - - vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) - """ - MLLM provider. Currently supports: - - `openai`: OpenAI Realtime API - - `gemini`: Google Gemini Live - - `vertexai`: Google Gemini Live (Vertex AI) - - `xai`: xAI Grok Realtime API - """ - - turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
- """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py deleted file mode 100644 index 032979d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py +++ /dev/null @@ -1,61 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, -) - - -class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): - """ - Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. - """ - - mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) - """ - Turn detection mode for MLLM: - - `agora_vad`: Agora VAD-based detection. - - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API, Gemini Live, and xAI Grok. - - `semantic_vad`: Semantic-based detection. 
Supported by OpenAI Realtime API only. - """ - - agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. - """ - - server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( - default=None - ) - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( - pydantic.Field(default=None) - ) - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py deleted file mode 100644 index ec30215..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py +++ /dev/null @@ -1,42 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel - - -class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): - """ - Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. 
- """ - - interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Minimum duration of speech in milliseconds required to trigger an interruption. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. A higher value reduces false positives. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py deleted file mode 100644 index 0d004e8..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ - typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py deleted file mode 100644 index 1e310f0..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py +++ /dev/null @@ -1,32 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( - StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): - """ - Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. - """ - - eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( - pydantic.Field(default=None) - ) - """ - Controls how eagerly the model ends its turn. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py deleted file mode 100644 index 8b67b1d..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py +++ /dev/null @@ -1,7 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ - typing.Literal["auto", "low", "medium", "high"], typing.Any -] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py deleted file mode 100644 index c74d8d7..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py +++ /dev/null @@ -1,62 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. - -import typing - -import pydantic -from ...core.pydantic_utilities import IS_PYDANTIC_V2 -from ...core.unchecked_base_model import UncheckedBaseModel -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, -) -from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, -) - - -class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): - """ - Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. - """ - - prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of audio in milliseconds to include before the detected speech start. - """ - - silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Duration of silence in milliseconds required to determine end of speech. - """ - - threshold: typing.Optional[float] = pydantic.Field(default=None) - """ - VAD sensitivity threshold. Applicable to OpenAI Realtime API and xAI Grok. 
- """ - - idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) - """ - Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. - """ - - start_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for start of speech detection. Applicable to Gemini Live only. - """ - - end_of_speech_sensitivity: typing.Optional[ - StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity - ] = pydantic.Field(default=None) - """ - Sensitivity for end of speech detection. Applicable to Gemini Live only. - """ - - if IS_PYDANTIC_V2: - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 - else: - - class Config: - frozen = True - smart_union = True - extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py deleted file mode 100644 index 0233696..0000000 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_vendor.py +++ /dev/null @@ -1,5 +0,0 @@ -# This file was auto-generated by Fern from our API Definition. 
- -import typing - -StartAgentsRequestPropertiesMllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index 40dbb02..fb58a36 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -5,6 +5,7 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from ...types.asr_language import AsrLanguage from .start_agents_request_properties_turn_detection_config import StartAgentsRequestPropertiesTurnDetectionConfig from .start_agents_request_properties_turn_detection_eagerness import StartAgentsRequestPropertiesTurnDetectionEagerness from .start_agents_request_properties_turn_detection_interrupt_mode import ( @@ -18,6 +19,11 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. """ + language: typing.Optional[AsrLanguage] = pydantic.Field(default=None) + """ + BCP-47 language tag identifying the primary language used for agent interaction. 
+ """ + mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) """ Conversation turn detection mode: diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index c44e886..acd9073 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agents/v2.0.0", + "User-Agent": "agora-agents/v2.1.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agents", - "X-Fern-SDK-Version": "v2.0.0", + "X-Fern-SDK-Version": "v2.1.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/types/amazon_asr.py b/src/agora_agent/types/amazon_asr.py new file mode 100644 index 0000000..4054518 --- /dev/null +++ b/src/agora_agent/types/amazon_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_asr_params import AmazonAsrParams +from .asr_language import AsrLanguage + + +class AmazonAsr(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_asr_params.py b/src/agora_agent/types/amazon_asr_params.py new file mode 100644 index 0000000..1d30688 --- /dev/null +++ b/src/agora_agent/types/amazon_asr_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AmazonAsrParams(UncheckedBaseModel): + """ + Amazon Transcribe ASR configuration parameters. + """ + + region: str = pydantic.Field() + """ + AWS region + """ + + access_key_id: str = pydantic.Field() + """ + AWS access key ID + """ + + secret_access_key: str = pydantic.Field() + """ + AWS secret access key + """ + + language_code: str = pydantic.Field() + """ + Language code for speech recognition + """ + + media_sample_rate_hz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hertz for the audio input + """ + + media_encoding: typing.Optional[str] = pydantic.Field(default=None) + """ + Encoding format of the audio input + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/amazon_tts_params.py b/src/agora_agent/types/amazon_tts_params.py index baaa6fa..bbecb36 100644 --- a/src/agora_agent/types/amazon_tts_params.py +++ b/src/agora_agent/types/amazon_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import 
IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .amazon_tts_params_engine import AmazonTtsParamsEngine class AmazonTtsParams(UncheckedBaseModel): @@ -12,26 +13,31 @@ class AmazonTtsParams(UncheckedBaseModel): Amazon Polly TTS configuration parameters. """ - access_key: str = pydantic.Field() + aws_access_key_id: str = pydantic.Field() """ - AWS access key + AWS access key ID """ - secret_key: str = pydantic.Field() + aws_secret_access_key: str = pydantic.Field() """ AWS secret key """ - region: str = pydantic.Field() + region_name: str = pydantic.Field() """ AWS region (e.g., "us-east-1") """ - voice_id: str = pydantic.Field() + voice: str = pydantic.Field() """ Amazon Polly voice ID """ + engine: AmazonTtsParamsEngine = pydantic.Field() + """ + Amazon Polly engine type + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/amazon_tts_params_engine.py b/src/agora_agent/types/amazon_tts_params_engine.py new file mode 100644 index 0000000..d9e3cfe --- /dev/null +++ b/src/agora_agent/types/amazon_tts_params_engine.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AmazonTtsParamsEngine = typing.Union[typing.Literal["standard", "neural", "long-form", "generative"], typing.Any] diff --git a/src/agora_agent/types/ares_asr.py b/src/agora_agent/types/ares_asr.py new file mode 100644 index 0000000..cf42216 --- /dev/null +++ b/src/agora_agent/types/ares_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage + + +class AresAsr(UncheckedBaseModel): + """ + Adaptive Recognition Engine for Speech ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/ares_asr_params.py b/src/agora_agent/types/ares_asr_params.py new file mode 100644 index 0000000..afa1d76 --- /dev/null +++ b/src/agora_agent/types/ares_asr_params.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AresAsrParams = typing.Dict[str, typing.Any] diff --git a/src/agora_agent/types/asr.py b/src/agora_agent/types/asr.py new file mode 100644 index 0000000..f08086f --- /dev/null +++ b/src/agora_agent/types/asr.py @@ -0,0 +1,172 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata +from .amazon_asr_params import AmazonAsrParams +from .ares_asr_params import AresAsrParams +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams +from .deepgram_asr_params import DeepgramAsrParams +from .google_asr_params import GoogleAsrParams +from .microsoft_asr_params import MicrosoftAsrParams +from .open_ai_asr_params import OpenAiAsrParams +from .sarvam_asr_params import SarvamAsrParams +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class Asr_Ares(UncheckedBaseModel): + vendor: typing.Literal["ares"] = "ares" + language: typing.Optional[AsrLanguage] = None + params: typing.Optional[AresAsrParams] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Microsoft(UncheckedBaseModel): + vendor: typing.Literal["microsoft"] = "microsoft" + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + 
smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Openai(UncheckedBaseModel): + vendor: typing.Literal["openai"] = "openai" + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Google(UncheckedBaseModel): + vendor: typing.Literal["google"] = "google" + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Amazon(UncheckedBaseModel): + vendor: typing.Literal["amazon"] = "amazon" + language: typing.Optional[AsrLanguage] = None + params: AmazonAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Assemblyai(UncheckedBaseModel): + vendor: typing.Literal["assemblyai"] = "assemblyai" + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Speechmatics(UncheckedBaseModel): + vendor: typing.Literal["speechmatics"] = "speechmatics" + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = 
pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +class Asr_Sarvam(UncheckedBaseModel): + vendor: typing.Literal["sarvam"] = "sarvam" + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + +Asr = typing_extensions.Annotated[ + typing.Union[ + Asr_Ares, + Asr_Microsoft, + Asr_Deepgram, + Asr_Openai, + Asr_Google, + Asr_Amazon, + Asr_Assemblyai, + Asr_Speechmatics, + Asr_Sarvam, + ], + UnionMetadata(discriminant="vendor"), +] diff --git a/src/agora_agent/types/asr_language.py b/src/agora_agent/types/asr_language.py new file mode 100644 index 0000000..4ff3c88 --- /dev/null +++ b/src/agora_agent/types/asr_language.py @@ -0,0 +1,41 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AsrLanguage = typing.Union[ + typing.Literal[ + "ar-EG", + "ar-JO", + "ar-SA", + "ar-AE", + "bn-IN", + "zh-CN", + "zh-HK", + "zh-TW", + "nl-NL", + "en-IN", + "en-US", + "fil-PH", + "fr-FR", + "de-DE", + "gu-IN", + "he-IL", + "hi-IN", + "id-ID", + "it-IT", + "ja-JP", + "kn-IN", + "ko-KR", + "ms-MY", + "fa-IR", + "pt-PT", + "ru-RU", + "es-ES", + "ta-IN", + "te-IN", + "th-TH", + "tr-TR", + "vi-VN", + ], + typing.Any, +] diff --git a/src/agora_agent/types/assembly_ai_asr.py b/src/agora_agent/types/assembly_ai_asr.py new file mode 100644 index 0000000..ea2ebf4 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .assembly_ai_asr_params import AssemblyAiAsrParams + + +class AssemblyAiAsr(UncheckedBaseModel): + """ + AssemblyAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: AssemblyAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/assembly_ai_asr_params.py b/src/agora_agent/types/assembly_ai_asr_params.py new file mode 100644 index 0000000..f3a5818 --- /dev/null +++ b/src/agora_agent/types/assembly_ai_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class AssemblyAiAsrParams(UncheckedBaseModel): + """ + AssemblyAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + AssemblyAI API key + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for AssemblyAI's streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_output_format.py b/src/agora_agent/types/cartesia_tts_output_format.py new file mode 100644 index 0000000..ab7e122 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_output_format.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsOutputFormat(UncheckedBaseModel): + """ + Cartesia audio output format configuration. 
+ """ + + container: typing.Optional[str] = pydantic.Field(default=None) + """ + Audio container format for the output stream + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index 2aaf069..1478570 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_output_format import CartesiaTtsOutputFormat from .cartesia_tts_voice import CartesiaTtsVoice @@ -18,15 +19,21 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia API key """ - voice: CartesiaTtsVoice - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: str = pydantic.Field() """ - Model ID (optional) + Model ID (for example, sonic-2) """ - sample_rate: typing.Optional[int] = pydantic.Field(default=None) + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Cartesia streaming API + """ + + voice: CartesiaTtsVoice + output_format: typing.Optional[CartesiaTtsOutputFormat] = None + language: typing.Optional[str] = pydantic.Field(default=None) """ - Audio sampling rate in Hz + Target language for speech synthesis """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/deepgram_asr.py b/src/agora_agent/types/deepgram_asr.py new file mode 100644 index 0000000..1c79c7b --- /dev/null +++ b/src/agora_agent/types/deepgram_asr.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .deepgram_asr_params import DeepgramAsrParams + + +class DeepgramAsr(UncheckedBaseModel): + """ + Deepgram ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: DeepgramAsrParams + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands for preset-backed Deepgram usage. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_asr_params.py b/src/agora_agent/types/deepgram_asr_params.py new file mode 100644 index 0000000..259958e --- /dev/null +++ b/src/agora_agent/types/deepgram_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramAsrParams(UncheckedBaseModel): + """ + Deepgram ASR configuration parameters. 
+ """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for Deepgram's streaming API + """ + + key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Speech recognition model + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for speech recognition + """ + + keyterm: typing.Optional[str] = pydantic.Field(default=None) + """ + Boost specialized terms and brands + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_tts_params.py b/src/agora_agent/types/deepgram_tts_params.py index e858291..ebac500 100644 --- a/src/agora_agent/types/deepgram_tts_params.py +++ b/src/agora_agent/types/deepgram_tts_params.py @@ -32,11 +32,6 @@ class DeepgramTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ - params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) - """ - Additional Deepgram TTS parameters - """ - skip_patterns: typing.Optional[typing.List[int]] = pydantic.Field(default=None) """ Controls whether the TTS module skips bracketed content when reading LLM response text. diff --git a/src/agora_agent/types/eleven_labs_tts_params.py b/src/agora_agent/types/eleven_labs_tts_params.py index c6127fd..4a2bf8f 100644 --- a/src/agora_agent/types/eleven_labs_tts_params.py +++ b/src/agora_agent/types/eleven_labs_tts_params.py @@ -12,7 +12,7 @@ class ElevenLabsTtsParams(UncheckedBaseModel): ElevenLabs TTS configuration parameters. 
""" - base_url: typing.Optional[str] = pydantic.Field(default=None) + base_url: str = pydantic.Field() """ WebSocket URL (e.g., "wss://api.elevenlabs.io/v1") """ @@ -37,6 +37,31 @@ class ElevenLabsTtsParams(UncheckedBaseModel): Audio sample rate in Hz (16kHz for Akool, 24kHz for HeyGen) """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech speed multiplier. + """ + + stability: typing.Optional[float] = pydantic.Field(default=None) + """ + Voice stability. Higher values produce more consistent speech. + """ + + similarity_boost: typing.Optional[float] = pydantic.Field(default=None) + """ + Similarity boost for the selected voice. + """ + + style: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking style and expressiveness control. + """ + + use_speaker_boost: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to improve voice quality and similarity. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/fish_audio_tts_params.py b/src/agora_agent/types/fish_audio_tts_params.py index 0ad77aa..9bb4ebb 100644 --- a/src/agora_agent/types/fish_audio_tts_params.py +++ b/src/agora_agent/types/fish_audio_tts_params.py @@ -12,7 +12,7 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Fish Audio API key """ @@ -22,6 +22,11 @@ class FishAudioTtsParams(UncheckedBaseModel): Fish Audio reference ID """ + backend: str = pydantic.Field() + """ + Backend model version to use + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/google_asr.py b/src/agora_agent/types/google_asr.py new file mode 100644 index 0000000..8473a04 --- /dev/null +++ b/src/agora_agent/types/google_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .google_asr_params import GoogleAsrParams + + +class GoogleAsr(UncheckedBaseModel): + """ + Google ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: GoogleAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_asr_params.py b/src/agora_agent/types/google_asr_params.py new file mode 100644 index 0000000..9d17db6 --- /dev/null +++ b/src/agora_agent/types/google_asr_params.py @@ -0,0 +1,47 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleAsrParams(UncheckedBaseModel): + """ + Google ASR configuration parameters. 
+ """ + + project_id: str = pydantic.Field() + """ + Google Cloud project ID + """ + + location: str = pydantic.Field() + """ + Google Cloud region for the speech service + """ + + adc_credentials_string: str = pydantic.Field() + """ + Google Cloud service account credentials JSON string + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Recognition model to use + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_audio_config.py b/src/agora_agent/types/google_tts_audio_config.py new file mode 100644 index 0000000..9c2a405 --- /dev/null +++ b/src/agora_agent/types/google_tts_audio_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsAudioConfig(UncheckedBaseModel): + """ + Google audio output configuration. 
+ """ + + speaking_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) + """ + Sample rate in Hz + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/google_tts_params.py b/src/agora_agent/types/google_tts_params.py index dc00322..4a9ee38 100644 --- a/src/agora_agent/types/google_tts_params.py +++ b/src/agora_agent/types/google_tts_params.py @@ -3,8 +3,12 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel +from .google_tts_audio_config import GoogleTtsAudioConfig +from .google_tts_voice_selection_params import GoogleTtsVoiceSelectionParams class GoogleTtsParams(UncheckedBaseModel): @@ -12,25 +16,17 @@ class GoogleTtsParams(UncheckedBaseModel): Google TTS configuration parameters. 
""" - key: str = pydantic.Field() + credentials: str = pydantic.Field() """ - Google Cloud API key + Google Cloud service account credentials JSON string """ - voice_name: str = pydantic.Field() - """ - Google voice name - """ - - language_code: typing.Optional[str] = pydantic.Field(default=None) - """ - Language code (e.g., "en-US") - """ - - sample_rate_hertz: typing.Optional[int] = pydantic.Field(default=None) - """ - Sample rate in Hz (default depends on selected voice) - """ + voice_selection_params: typing_extensions.Annotated[ + GoogleTtsVoiceSelectionParams, FieldMetadata(alias="VoiceSelectionParams") + ] + audio_config: typing_extensions.Annotated[ + typing.Optional[GoogleTtsAudioConfig], FieldMetadata(alias="AudioConfig") + ] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/src/agora_agent/types/google_tts_voice_selection_params.py b/src/agora_agent/types/google_tts_voice_selection_params.py new file mode 100644 index 0000000..ee75953 --- /dev/null +++ b/src/agora_agent/types/google_tts_voice_selection_params.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class GoogleTtsVoiceSelectionParams(UncheckedBaseModel): + """ + Google voice selection parameters. 
+ """ + + name: str = pydantic.Field() + """ + Google voice name + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/hume_ai_tts_params.py b/src/agora_agent/types/hume_ai_tts_params.py index 08cb12b..00c9f54 100644 --- a/src/agora_agent/types/hume_ai_tts_params.py +++ b/src/agora_agent/types/hume_ai_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .hume_ai_tts_params_provider import HumeAiTtsParamsProvider class HumeAiTtsParams(UncheckedBaseModel): @@ -17,9 +18,34 @@ class HumeAiTtsParams(UncheckedBaseModel): Hume AI API key """ + voice_id: str = pydantic.Field() + """ + Hume AI voice ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Base URL for the Hume AI API + """ + + provider: HumeAiTtsParamsProvider = pydantic.Field() + """ + Voice provider type + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Playback speed of the generated speech + """ + + trailing_silence: typing.Optional[float] = pydantic.Field(default=None) + """ + Duration of silence in seconds to add at the end of each utterance + """ + config_id: typing.Optional[str] = pydantic.Field(default=None) """ - Hume AI configuration ID + Hume AI configuration ID. Deprecated; use voice_id for the documented TTS shape. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/hume_ai_tts_params_provider.py b/src/agora_agent/types/hume_ai_tts_params_provider.py new file mode 100644 index 0000000..cf07e73 --- /dev/null +++ b/src/agora_agent/types/hume_ai_tts_params_provider.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +HumeAiTtsParamsProvider = typing.Union[typing.Literal["HUME_AI", "CUSTOM_VOICE"], typing.Any] diff --git a/src/agora_agent/types/llm.py b/src/agora_agent/types/llm.py new file mode 100644 index 0000000..2b0283d --- /dev/null +++ b/src/agora_agent/types/llm.py @@ -0,0 +1,120 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .llm_params import LlmParams +from .llm_style import LlmStyle + + +class Llm(UncheckedBaseModel): + """ + Large language model (LLM) configuration. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM callback address. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM verification API key. + """ + + access_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS access key ID. Used by Amazon Bedrock when api_key is not provided. + """ + + secret_key: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS secret access key. Used by Amazon Bedrock when api_key is not provided. + """ + + region: typing.Optional[str] = pydantic.Field(default=None) + """ + AWS region. Used by Amazon Bedrock. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Top-level model identifier. Used by Amazon Bedrock. + """ + + system_messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + A set of predefined information used as input to the LLM. + """ + + params: typing.Optional[LlmParams] = None + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + The number of conversation history messages cached in the custom LLM. + """ + + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM input modalities. 
+ """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + LLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Prompt for agent activation failure. + """ + + vendor: typing.Optional[str] = pydantic.Field(default=None) + """ + LLM provider identifier. + """ + + style: typing.Optional[LlmStyle] = pydantic.Field(default=None) + """ + The request style for chat completion. + """ + + ignore_empty: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to handle empty Gemini responses. + """ + + greeting_configs: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Agent greeting broadcast configuration. + """ + + template_variables: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Template parameter configuration. + """ + + mcp_servers: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + MCP server configuration. + """ + + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_params.py b/src/agora_agent/types/llm_params.py new file mode 100644 index 0000000..f6df01f --- /dev/null +++ b/src/agora_agent/types/llm_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class LlmParams(UncheckedBaseModel): + """ + Additional LLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The LLM model identifier. + """ + + max_tokens: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum tokens in the response. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/llm_style.py b/src/agora_agent/types/llm_style.py new file mode 100644 index 0000000..8319ca1 --- /dev/null +++ b/src/agora_agent/types/llm_style.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +LlmStyle = typing.Union[typing.Literal["openai", "gemini", "anthropic", "dify", "bedrock"], typing.Any] diff --git a/src/agora_agent/types/microsoft_asr.py b/src/agora_agent/types/microsoft_asr.py new file mode 100644 index 0000000..f602e09 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .microsoft_asr_params import MicrosoftAsrParams + + +class MicrosoftAsr(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration. 
+ """ + + language: typing.Optional[AsrLanguage] = None + params: MicrosoftAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_asr_params.py b/src/agora_agent/types/microsoft_asr_params.py new file mode 100644 index 0000000..bea79e4 --- /dev/null +++ b/src/agora_agent/types/microsoft_asr_params.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MicrosoftAsrParams(UncheckedBaseModel): + """ + Microsoft Azure ASR configuration parameters. + """ + + key: str = pydantic.Field() + """ + Microsoft Azure API key + """ + + region: str = pydantic.Field() + """ + Azure region + """ + + language: str = pydantic.Field() + """ + Language code for speech recognition + """ + + phrase_list: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Words or phrases to improve recognition accuracy + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/microsoft_tts_params.py b/src/agora_agent/types/microsoft_tts_params.py index 3c9e80c..12f441e 100644 --- a/src/agora_agent/types/microsoft_tts_params.py +++ b/src/agora_agent/types/microsoft_tts_params.py @@ -32,6 +32,16 @@ class MicrosoftTtsParams(UncheckedBaseModel): Audio sampling rate in Hz """ + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. Values between 0.5 and 2.0. 
+ """ + + volume: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio volume. Values between 0.0 and 100.0. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/mllm.py b/src/agora_agent/types/mllm.py new file mode 100644 index 0000000..3bcdb95 --- /dev/null +++ b/src/agora_agent/types/mllm.py @@ -0,0 +1,88 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_params import MllmParams +from .mllm_turn_detection import MllmTurnDetection +from .mllm_vendor import MllmVendor + + +class Mllm(UncheckedBaseModel): + """ + Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model. + """ + + url: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM WebSocket URL for real-time communication. + """ + + api_key: typing.Optional[str] = pydantic.Field(default=None) + """ + The API key used for MLLM authentication. + """ + + adc_credentials_string: typing.Optional[str] = pydantic.Field(default=None) + """ + Base64-encoded Google Cloud Application Default Credentials. Used by Vertex AI. + """ + + project_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud project ID. Used by Vertex AI. + """ + + location: typing.Optional[str] = pydantic.Field(default=None) + """ + Google Cloud location or region. Used by Vertex AI. + """ + + messages: typing.Optional[typing.List[typing.Dict[str, typing.Any]]] = pydantic.Field(default=None) + """ + Array of conversation items used for short-term memory management. 
+ """ + + params: typing.Optional[MllmParams] = None + input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM input modalities. + """ + + output_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + MLLM output modalities. + """ + + greeting_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent greeting message. + """ + + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Agent failure message. + """ + + vendor: typing.Optional[MllmVendor] = pydantic.Field(default=None) + """ + MLLM provider. + """ + + turn_detection: typing.Optional[MllmTurnDetection] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_http_options.py b/src/agora_agent/types/mllm_http_options.py new file mode 100644 index 0000000..19baebb --- /dev/null +++ b/src/agora_agent/types/mllm_http_options.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmHttpOptions(UncheckedBaseModel): + """ + HTTP request options for the MLLM provider. + """ + + api_version: typing.Optional[str] = pydantic.Field(default=None) + """ + API version to use. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_input_audio_transcription.py b/src/agora_agent/types/mllm_input_audio_transcription.py new file mode 100644 index 0000000..6bb3d9d --- /dev/null +++ b/src/agora_agent/types/mllm_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmInputAudioTranscription(UncheckedBaseModel): + """ + Configuration for audio input transcription. + """ + + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language of the input audio. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + Model to use for transcription. + """ + + prompt: typing.Optional[str] = pydantic.Field(default=None) + """ + Text to guide the transcription model. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_params.py b/src/agora_agent/types/mllm_params.py new file mode 100644 index 0000000..5437b69 --- /dev/null +++ b/src/agora_agent/types/mllm_params.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_http_options import MllmHttpOptions +from .mllm_input_audio_transcription import MllmInputAudioTranscription + + +class MllmParams(UncheckedBaseModel): + """ + Additional MLLM configuration parameters. + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + The MLLM model identifier. + """ + + voice: typing.Optional[str] = pydantic.Field(default=None) + """ + Voice identifier for audio output. + """ + + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + System instructions that define the agent behavior or tone. + """ + + input_audio_transcription: typing.Optional[MllmInputAudioTranscription] = None + affective_dialog: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable Gemini affective dialog. + """ + + proactive_audio: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether Gemini may choose not to respond when no reply is needed. + """ + + transcribe_agent: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the agent speech in real time. + """ + + transcribe_user: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to transcribe the user speech in real time. + """ + + http_options: typing.Optional[MllmHttpOptions] = None + language: typing.Optional[str] = pydantic.Field(default=None) + """ + Language code for xAI Grok speech recognition and synthesis. + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sample rate in Hz. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection.py b/src/agora_agent/types/mllm_turn_detection.py new file mode 100644 index 0000000..2cd3503 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection.py @@ -0,0 +1,35 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_agora_vad_config import MllmTurnDetectionAgoraVadConfig +from .mllm_turn_detection_mode import MllmTurnDetectionMode +from .mllm_turn_detection_semantic_vad_config import MllmTurnDetectionSemanticVadConfig +from .mllm_turn_detection_server_vad_config import MllmTurnDetectionServerVadConfig + + +class MllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. + """ + + mode: typing.Optional[MllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM. 
+ """ + + agora_vad_config: typing.Optional[MllmTurnDetectionAgoraVadConfig] = None + server_vad_config: typing.Optional[MllmTurnDetectionServerVadConfig] = None + semantic_vad_config: typing.Optional[MllmTurnDetectionSemanticVadConfig] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..4168ef3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,23 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class MllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + interrupt_duration_ms: typing.Optional[int] = None + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_mode.py b/src/agora_agent/types/mllm_turn_detection_mode.py new file mode 100644 index 0000000..f6cd693 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionMode = typing.Union[typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..aeaf440 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,21 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_semantic_vad_config_eagerness import MllmTurnDetectionSemanticVadConfigEagerness + + +class MllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + eagerness: typing.Optional[MllmTurnDetectionSemanticVadConfigEagerness] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..dbf9b4d --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +MllmTurnDetectionSemanticVadConfigEagerness = typing.Union[typing.Literal["auto", "low", "medium", "high"], typing.Any] diff --git a/src/agora_agent/types/mllm_turn_detection_server_vad_config.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..b2976b3 --- /dev/null +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class MllmTurnDetectionServerVadConfig(UncheckedBaseModel): + prefix_padding_ms: typing.Optional[int] = None + silence_duration_ms: typing.Optional[int] = None + threshold: typing.Optional[float] = None + idle_timeout_ms: typing.Optional[int] = None + start_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity] = None + end_of_speech_sensitivity: typing.Optional[MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py similarity index 61% rename from 
src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py index e92d3f1..b9b3377 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py similarity index 61% rename from src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py rename to src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py index 25860c1..90ccf51 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py +++ b/src/agora_agent/types/mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -2,6 +2,6 @@ import typing -StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ +MllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any ] diff --git a/src/agora_agent/types/mllm_vendor.py b/src/agora_agent/types/mllm_vendor.py new file mode 100644 index 0000000..61c4d1a --- /dev/null +++ 
b/src/agora_agent/types/mllm_vendor.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +MllmVendor = typing.Union[typing.Literal["openai", "gemini", "vertexai", "xai"], typing.Any] diff --git a/src/agora_agent/types/murf_tts_params.py b/src/agora_agent/types/murf_tts_params.py index 5107f62..78f78d8 100644 --- a/src/agora_agent/types/murf_tts_params.py +++ b/src/agora_agent/types/murf_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,19 +14,46 @@ class MurfTtsParams(UncheckedBaseModel): Murf TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Murf API key """ - voice_id: str = pydantic.Field() + base_url: typing.Optional[str] = pydantic.Field(default=None) """ - Voice ID (e.g., Ariana, Natalie, Ken) + WebSocket endpoint for streaming TTS output """ - style: typing.Optional[str] = pydantic.Field(default=None) + voice_id: typing_extensions.Annotated[typing.Optional[str], FieldMetadata(alias="voiceId")] = pydantic.Field( + default=None + ) """ - Voice style (e.g., Angry, Sad, Conversational, Newscast) + Voice ID (e.g., Matthew) + """ + + locale: typing.Optional[str] = pydantic.Field(default=None) + """ + Locale for the selected voice + """ + + rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Speech rate adjustment + """ + + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment + """ + + model: typing.Optional[str] = pydantic.Field(default=None) + """ + TTS model to use + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/open_ai_asr.py b/src/agora_agent/types/open_ai_asr.py new file mode 100644 index 
0000000..eec2aab --- /dev/null +++ b/src/agora_agent/types/open_ai_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .open_ai_asr_params import OpenAiAsrParams + + +class OpenAiAsr(UncheckedBaseModel): + """ + OpenAI ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: OpenAiAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_asr_params.py b/src/agora_agent/types/open_ai_asr_params.py new file mode 100644 index 0000000..a5fadc8 --- /dev/null +++ b/src/agora_agent/types/open_ai_asr_params.py @@ -0,0 +1,30 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .open_ai_input_audio_transcription import OpenAiInputAudioTranscription + + +class OpenAiAsrParams(UncheckedBaseModel): + """ + OpenAI ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + OpenAI API key + """ + + input_audio_transcription: OpenAiInputAudioTranscription + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_input_audio_transcription.py b/src/agora_agent/types/open_ai_input_audio_transcription.py new file mode 100644 index 0000000..9db45b1 --- /dev/null +++ b/src/agora_agent/types/open_ai_input_audio_transcription.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class OpenAiInputAudioTranscription(UncheckedBaseModel): + """ + OpenAI audio transcription configuration. + """ + + model: str = pydantic.Field() + """ + OpenAI ASR model to use for transcription + """ + + prompt: str = pydantic.Field() + """ + Prompt that guides the transcription process + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 3839646..c8f6e51 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -14,7 +14,12 @@ class OpenAiTtsParams(UncheckedBaseModel): api_key: typing.Optional[str] = pydantic.Field(default=None) """ - OpenAI API key. Optional for Agora-managed OpenAI TTS usage. + OpenAI API key. Optional for preset-backed OpenAI TTS usage. 
+ """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Endpoint URL for the OpenAI TTS service. """ voice: str = pydantic.Field() @@ -27,6 +32,16 @@ class OpenAiTtsParams(UncheckedBaseModel): Model name (e.g., "tts-1", "tts-1-hd") """ + instructions: typing.Optional[str] = pydantic.Field(default=None) + """ + Custom instructions for voice style, accent, pace, and tone. + """ + + speed: typing.Optional[float] = pydantic.Field(default=None) + """ + Speaking rate multiplier. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/rime_tts_params.py b/src/agora_agent/types/rime_tts_params.py index 6d18375..ade1c5b 100644 --- a/src/agora_agent/types/rime_tts_params.py +++ b/src/agora_agent/types/rime_tts_params.py @@ -3,7 +3,9 @@ import typing import pydantic +import typing_extensions from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.serialization import FieldMetadata from ..core.unchecked_base_model import UncheckedBaseModel @@ -12,7 +14,7 @@ class RimeTtsParams(UncheckedBaseModel): Rime TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Rime API key """ @@ -22,9 +24,14 @@ class RimeTtsParams(UncheckedBaseModel): Rime speaker ID """ - model_id: typing.Optional[str] = pydantic.Field(default=None) + model_id: typing_extensions.Annotated[str, FieldMetadata(alias="modelId")] = pydantic.Field() """ - Model ID (optional) + Rime TTS model ID + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Rime streaming API """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/types/sarvam_asr.py b/src/agora_agent/types/sarvam_asr.py new file mode 100644 index 0000000..ec95847 --- /dev/null +++ b/src/agora_agent/types/sarvam_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .sarvam_asr_params import SarvamAsrParams + + +class SarvamAsr(UncheckedBaseModel): + """ + Sarvam ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SarvamAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_asr_params.py b/src/agora_agent/types/sarvam_asr_params.py new file mode 100644 index 0000000..f29769d --- /dev/null +++ b/src/agora_agent/types/sarvam_asr_params.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SarvamAsrParams(UncheckedBaseModel): + """ + Sarvam ASR configuration parameters. + """ + + api_key: str = pydantic.Field() + """ + Sarvam API key + """ + + language: str = pydantic.Field() + """ + Language code for transcription. Set to unknown for automatic language detection. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/sarvam_tts_params.py b/src/agora_agent/types/sarvam_tts_params.py index 93457a4..855299f 100644 --- a/src/agora_agent/types/sarvam_tts_params.py +++ b/src/agora_agent/types/sarvam_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .sarvam_tts_params_target_language_code import SarvamTtsParamsTargetLanguageCode class SarvamTtsParams(UncheckedBaseModel): @@ -12,7 +13,7 @@ class SarvamTtsParams(UncheckedBaseModel): Sarvam TTS configuration parameters. """ - key: str = pydantic.Field() + api_subscription_key: str = pydantic.Field() """ Sarvam API subscription key """ @@ -22,11 +23,31 @@ class SarvamTtsParams(UncheckedBaseModel): Voice ID (e.g., anushka, abhilash, karun, hitesh, manisha, vidya, arya) """ - target_language_code: str = pydantic.Field() + target_language_code: SarvamTtsParamsTargetLanguageCode = pydantic.Field() """ Target language code (e.g., en-IN) """ + pitch: typing.Optional[float] = pydantic.Field(default=None) + """ + Pitch adjustment for the voice + """ + + pace: typing.Optional[float] = pydantic.Field(default=None) + """ + Speed of speech + """ + + loudness: typing.Optional[float] = pydantic.Field(default=None) + """ + Volume level of the speech + """ + + sample_rate: typing.Optional[float] = pydantic.Field(default=None) + """ + Audio sample rate in Hz + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/types/sarvam_tts_params_target_language_code.py b/src/agora_agent/types/sarvam_tts_params_target_language_code.py 
new file mode 100644 index 0000000..b1722ec --- /dev/null +++ b/src/agora_agent/types/sarvam_tts_params_target_language_code.py @@ -0,0 +1,8 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +SarvamTtsParamsTargetLanguageCode = typing.Union[ + typing.Literal["en-IN", "hi-IN", "bn-IN", "ta-IN", "te-IN", "kn-IN", "ml-IN", "mr-IN", "gu-IN", "pa-IN", "or-IN"], + typing.Any, +] diff --git a/src/agora_agent/types/speechmatics_asr.py b/src/agora_agent/types/speechmatics_asr.py new file mode 100644 index 0000000..644db25 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .asr_language import AsrLanguage +from .speechmatics_asr_params import SpeechmaticsAsrParams + + +class SpeechmaticsAsr(UncheckedBaseModel): + """ + Speechmatics ASR configuration. + """ + + language: typing.Optional[AsrLanguage] = None + params: SpeechmaticsAsrParams + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/speechmatics_asr_params.py b/src/agora_agent/types/speechmatics_asr_params.py new file mode 100644 index 0000000..4709d22 --- /dev/null +++ b/src/agora_agent/types/speechmatics_asr_params.py @@ -0,0 +1,37 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class SpeechmaticsAsrParams(UncheckedBaseModel): + """ + Speechmatics ASR configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Speechmatics API key + """ + + language: str = pydantic.Field() + """ + Language code to use for transcription + """ + + uri: typing.Optional[str] = pydantic.Field(default=None) + """ + WebSocket URL for the Speechmatics streaming API + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/tests/custom/test_llm_vendors.py b/tests/custom/test_llm_vendors.py index faca9bf..2861e45 100644 --- a/tests/custom/test_llm_vendors.py +++ b/tests/custom/test_llm_vendors.py @@ -1,8 +1,10 @@ -from agora_agent import AmazonBedrock, CustomLLM, Dify, Groq, VertexAILLM +import pytest + +from agora_agent import AmazonBedrock, Anthropic, AzureOpenAI, CustomLLM, Dify, Gemini, Groq, OpenAI, VertexAILLM def test_groq_serializes_as_openai_compatible() -> None: - config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile").to_config() + config = Groq(api_key="groq-key", model="llama-3.3-70b-versatile", base_url="https://api.groq.com/openai/v1/chat/completions").to_config() assert config["url"] == "https://api.groq.com/openai/v1/chat/completions" assert config["api_key"] == "groq-key" @@ -19,6 +21,37 @@ def test_custom_llm_marks_request_as_custom() -> None: assert config["style"] == "openai" +def test_anthropic_serializes_required_claude_fields() -> None: + config = Anthropic( + api_key="anthropic-key", + model="claude-3-5-sonnet-20241022", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ).to_config() + + assert config["url"] == "https://api.anthropic.com/v1/messages" + assert config["api_key"] == "anthropic-key" + assert config["style"] == "anthropic" + assert config["headers"]["anthropic-version"] == "2023-06-01" + assert config["params"]["model"] == 
"claude-3-5-sonnet-20241022" + assert config["params"]["max_tokens"] == 1024 + + +def test_azure_openai_includes_required_model_param() -> None: + config = AzureOpenAI( + api_key="azure-key", + endpoint="https://example.openai.azure.com", + deployment_name="deployment", + model="gpt-4o", + ).to_config() + + assert config["api_key"] == "azure-key" + assert config["vendor"] == "azure" + assert config["style"] == "openai" + assert config["params"]["model"] == "gpt-4o" + + def test_vertex_ai_llm_includes_project_routing() -> None: config = VertexAILLM( api_key="vertex-token", @@ -34,27 +67,68 @@ def test_vertex_ai_llm_includes_project_routing() -> None: assert config["params"]["location"] == "us-central1" -def test_amazon_bedrock_serializes_as_anthropic_style() -> None: +def test_amazon_bedrock_serializes_as_bedrock_style() -> None: config = AmazonBedrock( - api_key="bedrock-key", - url="https://bedrock.example.com/messages", + access_key="aws-access", + secret_key="aws-secret", + region="us-east-1", model="anthropic.claude-3-5-sonnet-20241022-v2:0", ).to_config() - assert config["api_key"] == "bedrock-key" - assert config["style"] == "anthropic" - assert config["params"]["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + assert config["access_key"] == "aws-access" + assert config["secret_key"] == "aws-secret" + assert config["region"] == "us-east-1" + assert config["url"] == "https://bedrock-runtime.us-east-1.amazonaws.com/model/anthropic.claude-3-5-sonnet-20241022-v2:0/converse-stream" + assert config["model"] == "anthropic.claude-3-5-sonnet-20241022-v2:0" + assert config["style"] == "bedrock" def test_dify_serializes_conversation_fields() -> None: config = Dify( api_key="dify-key", url="https://api.dify.ai/v1/chat-messages", + model="default", user="user-1", conversation_id="conversation-1", ).to_config() assert config["api_key"] == "dify-key" assert config["style"] == "dify" + assert config["params"]["model"] == "default" assert config["params"]["user"] == 
"user-1" assert config["params"]["conversation_id"] == "conversation-1" + + +def test_llm_vendors_reject_missing_required_models() -> None: + with pytest.raises(Exception, match="model"): + OpenAI(api_key="openai-key", base_url="https://api.openai.com/v1/chat/completions") + + with pytest.raises(Exception, match="model"): + Anthropic( + api_key="anthropic-key", + url="https://api.anthropic.com/v1/messages", + headers={"anthropic-version": "2023-06-01"}, + max_tokens=1024, + ) + + with pytest.raises(Exception, match="model"): + Gemini(api_key="google-key") + + with pytest.raises(Exception, match="model"): + Groq(api_key="groq-key", base_url="https://api.groq.com/openai/v1/chat/completions") + + with pytest.raises(Exception, match="model"): + VertexAILLM(api_key="vertex-token", project_id="project", location="us-central1") + + with pytest.raises(Exception, match="model"): + AmazonBedrock(access_key="aws-access", secret_key="aws-secret", region="us-east-1") + + +def test_openai_managed_mode_is_restricted_to_supported_models() -> None: + assert OpenAI(model="gpt-5-mini").to_config()["params"]["model"] == "gpt-5-mini" + + with pytest.raises(Exception, match="api_key"): + OpenAI(model="gpt-4o") + + with pytest.raises(Exception, match="does not allow vendor"): + OpenAI(model="gpt-5-mini", vendor="custom") diff --git a/tests/custom/test_pipeline_id.py b/tests/custom/test_pipeline_id.py new file mode 100644 index 0000000..c6c8c8f --- /dev/null +++ b/tests/custom/test_pipeline_id.py @@ -0,0 +1,123 @@ +import pytest + +from agora_agent import Agent + + +def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + if hasattr(value, "dict"): + return value.dict(exclude_none=True) + return value + + +class StartResponse: + agent_id = "agent-id" + + +class FakeAgentsClient: + def __init__(self): + self.calls = [] + + def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class 
FakeAsyncAgentsClient: + def __init__(self): + self.calls = [] + + async def start(self, appid, **kwargs): + self.calls.append({"appid": appid, **kwargs}) + return StartResponse() + + +class FakeClient: + app_id = "appid" + app_certificate = None + + def __init__(self, agents): + self.agents = agents + + +def start_agent(agent, **overrides): + agents = FakeAgentsClient() + client = FakeClient(agents) + options = { + "channel": "channel", + "token": "token", + "agent_uid": "1", + "remote_uids": ["100"], + **overrides, + } + + agent_id = agent.create_session(client, **options).start() + + assert agent_id == "agent-id" + assert len(agents.calls) == 1 + return agents.calls[0] + + +def test_agent_pipeline_id_sends_top_level_pipeline_id() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["appid"] == "appid" + assert call["name"] == "support" + assert call["pipeline_id"] == "studio-pipeline-id" + properties = dump(call["properties"]) + assert properties["channel"] == "channel" + assert properties["token"] == "token" + assert properties["agent_rtc_uid"] == "1" + assert properties["remote_rtc_uids"] == ["100"] + + +def test_session_pipeline_id_overrides_agent_pipeline_id() -> None: + call = start_agent( + Agent(name="support", pipeline_id="agent-pipeline"), + pipeline_id="session-pipeline", + ) + + assert call["pipeline_id"] == "session-pipeline" + + +def test_agent_pipeline_id_skips_missing_vendor_validation() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + + +def test_pipeline_id_is_not_sent_inside_properties() -> None: + call = start_agent(Agent(name="support", pipeline_id="studio-pipeline-id")) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(call["properties"]) + + +def test_pipeline_id_survives_builder_clone() -> None: + agent = Agent(name="support", 
pipeline_id="studio-pipeline-id").with_tools(True) + + assert agent.pipeline_id == "studio-pipeline-id" + call = start_agent(agent) + + assert call["pipeline_id"] == "studio-pipeline-id" + assert dump(call["properties"])["advanced_features"] == {"enable_tools": True} + + +@pytest.mark.asyncio +async def test_async_session_uses_agent_pipeline_id() -> None: + agents = FakeAsyncAgentsClient() + client = FakeClient(agents) + agent = Agent(name="support", pipeline_id="studio-pipeline-id") + + agent_id = await agent.create_async_session( + client, + channel="channel", + token="token", + agent_uid="1", + remote_uids=["100"], + ).start() + + assert agent_id == "agent-id" + assert agents.calls[0]["pipeline_id"] == "studio-pipeline-id" + assert "pipeline_id" not in dump(agents.calls[0]["properties"]) diff --git a/tests/custom/test_stt_language.py b/tests/custom/test_stt_language.py new file mode 100644 index 0000000..c398e02 --- /dev/null +++ b/tests/custom/test_stt_language.py @@ -0,0 +1,139 @@ +import pytest + +from agora_agent import ( + Agent, + AmazonSTT, + AssemblyAISTT, + DeepgramSTT, + ElevenLabsTTS, + GoogleSTT, + OpenAI, + OpenAISTT, + SpeechmaticsSTT, + TurnDetectionConfig, +) + + +def dump(value): + if hasattr(value, "model_dump"): + return value.model_dump(exclude_none=True) + return value.dict(exclude_none=True) + + +def base_agent() -> Agent: + return ( + Agent() + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) + ) + + +def properties(agent: Agent) -> dict: + return dump( + agent.to_properties( + channel="channel", + token="token", + agent_uid="1001", + remote_uids=["1002"], + ) + ) + + +def test_bcp47_stt_language_sets_turn_detection_language_and_provider_param() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en-US"))) 
+ + assert props["asr"]["vendor"] == "speechmatics" + assert "language" not in props["asr"] + assert props["turn_detection"]["language"] == "en-US" + assert props["asr"]["params"]["language"] == "en-US" + + +def test_provider_language_defaults_turn_detection_language_when_not_supported_by_ares() -> None: + props = properties(base_agent().with_stt(SpeechmaticsSTT(api_key="stt-key", language="en"))) + + assert props["asr"]["vendor"] == "speechmatics" + assert "language" not in props["asr"] + assert props["turn_detection"]["language"] == "en-US" + assert props["asr"]["params"]["language"] == "en" + + +def test_turn_detection_language_can_differ_from_provider_language() -> None: + props = properties( + Agent(turn_detection=TurnDetectionConfig(language="fr-FR")) + .with_llm(OpenAI(api_key="llm-key", model="gpt-4o-mini", base_url="https://api.openai.com/v1/chat/completions")) + .with_tts(ElevenLabsTTS(key="tts-key", voice_id="voice", model_id="eleven_flash_v2_5", base_url="wss://api.elevenlabs.io/v1")) + .with_stt(SpeechmaticsSTT(api_key="stt-key", language="en")) + ) + + assert props["turn_detection"]["language"] == "fr-FR" + assert "language" not in props["asr"] + assert props["asr"]["params"]["language"] == "en" + + +def test_invalid_turn_detection_language_is_rejected() -> None: + with pytest.raises(ValueError, match="Invalid interaction language: en"): + properties(Agent(turn_detection=TurnDetectionConfig(language="en"))) # type: ignore[arg-type] + + +def test_default_turn_detection_language_is_sent_without_stt() -> None: + props = properties(base_agent()) + + assert props["asr"] == {"vendor": "ares"} + assert props["turn_detection"] == {"language": "en-US"} + + +def test_stt_vendor_params_match_documented_shapes() -> None: + assert DeepgramSTT(model="nova-3", language="en-US").to_config()["params"] == { + "model": "nova-3", + "language": "en-US", + } + + with pytest.raises(Exception, match="api_key"): + DeepgramSTT(model="enhanced") + + assert 
DeepgramSTT(api_key="dg-key", language="en").to_config()["params"] == { + "key": "dg-key", + "language": "en", + } + + assert OpenAISTT(api_key="openai-key", model="gpt-4o-mini-transcribe", language="en").to_config()["params"] == { + "api_key": "openai-key", + "input_audio_transcription": { + "model": "gpt-4o-mini-transcribe", + "language": "en", + }, + } + + assert OpenAISTT(api_key="openai-key").to_config()["params"] == { + "api_key": "openai-key", + "input_audio_transcription": { + "model": "whisper-1", + }, + } + + assert GoogleSTT( + project_id="project", + location="global", + adc_credentials_string="{}", + language="en-US", + model="long", + ).to_config()["params"] == { + "project_id": "project", + "location": "global", + "adc_credentials_string": "{}", + "language": "en-US", + "model": "long", + } + + assert AmazonSTT(access_key="access", secret_key="secret", region="us-east-1", language="en-US").to_config()["params"] == { + "access_key_id": "access", + "secret_access_key": "secret", + "region": "us-east-1", + "language_code": "en-US", + } + + assert AssemblyAISTT(api_key="assembly-key", language="en-US", uri="wss://example.test/ws").to_config()["params"] == { + "api_key": "assembly-key", + "language": "en-US", + "uri": "wss://example.test/ws", + } diff --git a/tests/custom/test_tts_vendors.py b/tests/custom/test_tts_vendors.py new file mode 100644 index 0000000..9499eca --- /dev/null +++ b/tests/custom/test_tts_vendors.py @@ -0,0 +1,118 @@ +import pytest + +from agora_agent import AmazonTTS, CartesiaTTS, DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, HumeAITTS, MiniMaxTTS, MurfTTS, OpenAITTS, RimeTTS, SarvamTTS + + +def test_tts_vendor_params_match_generated_core_shapes() -> None: + assert AmazonTTS(access_key="access", secret_key="secret", region="us-east-1", voice_id="Joanna", engine="neural").to_config()["params"] == { + "aws_access_key_id": "access", + "aws_secret_access_key": "secret", + "region_name": "us-east-1", + "voice": "Joanna", + 
"engine": "neural", + } + + assert GoogleTTS(key="{}", voice_name="en-US-JennyNeural", language_code="en-US", sample_rate_hertz=24000).to_config()["params"] == { + "credentials": "{}", + "VoiceSelectionParams": {"name": "en-US-JennyNeural", "language_code": "en-US"}, + "AudioConfig": {"sample_rate_hertz": 24000}, + } + + assert CartesiaTTS(api_key="cartesia-key", voice_id="voice", model_id="sonic-2", sample_rate=24000).to_config()["params"] == { + "api_key": "cartesia-key", + "model_id": "sonic-2", + "voice": {"mode": "id", "id": "voice"}, + "output_format": {"container": "raw", "sample_rate": 24000}, + } + + assert RimeTTS(key="rime-key", speaker="speaker", model_id="mist").to_config()["params"] == { + "api_key": "rime-key", + "speaker": "speaker", + "modelId": "mist", + } + + assert FishAudioTTS(key="fish-key", reference_id="ref", backend="speech-1.5").to_config()["params"] == { + "api_key": "fish-key", + "reference_id": "ref", + "backend": "speech-1.5", + } + + assert ElevenLabsTTS(key="eleven-key", model_id="eleven_flash_v2_5", voice_id="voice", base_url="wss://api.elevenlabs.io/v1").to_config()["params"] == { + "key": "eleven-key", + "base_url": "wss://api.elevenlabs.io/v1", + "model_id": "eleven_flash_v2_5", + "voice_id": "voice", + } + + assert DeepgramTTS(api_key="deepgram-key", model="aura-2-thalia-en", base_url="wss://api.deepgram.com/v1/speak", sample_rate=24000, additional_params={"encoding": "linear16"}).to_config()["params"] == { + "api_key": "deepgram-key", + "model": "aura-2-thalia-en", + "base_url": "wss://api.deepgram.com/v1/speak", + "sample_rate": 24000, + "encoding": "linear16", + } + + assert OpenAITTS(api_key="openai-key", voice="coral", model="gpt-4o-mini-tts", base_url="https://api.openai.com/v1", instructions="speak clearly").to_config()["params"] == { + "voice": "coral", + "api_key": "openai-key", + "base_url": "https://api.openai.com/v1", + "model": "gpt-4o-mini-tts", + "instructions": "speak clearly", + } + + assert 
OpenAITTS(voice="coral").to_config()["params"] == { + "voice": "coral", + } + + assert HumeAITTS(key="hume-key", voice_id="voice", provider="CUSTOM_VOICE").to_config()["params"] == { + "key": "hume-key", + "voice_id": "voice", + "provider": "CUSTOM_VOICE", + } + + assert MiniMaxTTS(key="minimax-key", group_id="group", model="speech-02-turbo", voice_id="voice", url="wss://api-uw.minimax.io/ws/v1/t2a_v2").to_config()["params"] == { + "model": "speech-02-turbo", + "key": "minimax-key", + "group_id": "group", + "voice_setting": {"voice_id": "voice"}, + "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", + } + + assert SarvamTTS(key="sarvam-key", speaker="anushka", target_language_code="en-IN", sample_rate=24000).to_config()["params"] == { + "api_subscription_key": "sarvam-key", + "speaker": "anushka", + "target_language_code": "en-IN", + "sample_rate": 24000, + } + + assert MurfTTS( + key="murf-key", + voice_id="Ariana", + base_url="wss://murf.example/ws", + locale="en-US", + rate=0, + pitch=0, + model="FALCON", + sample_rate=24000, + ).to_config()["params"] == { + "api_key": "murf-key", + "base_url": "wss://murf.example/ws", + "voiceId": "Ariana", + "locale": "en-US", + "rate": 0, + "pitch": 0, + "model": "FALCON", + "sample_rate": 24000, + } + + assert MurfTTS(key="murf-key").to_config()["params"] == { + "api_key": "murf-key", + } + + +def test_tts_managed_mode_validation_matches_core_shapes() -> None: + with pytest.raises(Exception, match="OpenAITTS requires api_key"): + OpenAITTS(voice="coral", model="tts-1-hd") + + with pytest.raises(Exception, match="MiniMaxTTS requires key"): + MiniMaxTTS(model="speech-02-turbo")