azure-ai-voicelive-py
Build real-time voice AI applications with the Azure AI Voice Live SDK (azure-ai-voicelive). Use this skill when creating Python applications that need real-time, bidirectional audio communication with the Azure AI Voice Live service.
Architectural Overview
"This module is grounded in ai engineering patterns and exposes 1 core capabilities across 1 execution phases."
Azure AI Voice Live SDK
Build real-time voice AI applications with bidirectional WebSocket communication.
Installation
```bash
pip install azure-ai-voicelive aiohttp azure-identity
```
Environment Variables
```bash
AZURE_COGNITIVE_SERVICES_ENDPOINT=https://<region>.api.cognitive.microsoft.com
# For API key auth (not recommended for production)
AZURE_COGNITIVE_SERVICES_KEY=<api-key>
```
Authentication
DefaultAzureCredential (preferred):
```python
import os

from azure.ai.voicelive.aio import connect
from azure.identity.aio import DefaultAzureCredential

async with connect(
    endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
    credential=DefaultAzureCredential(),
    model="gpt-4o-realtime-preview",
    credential_scopes=["https://cognitiveservices.azure.com/.default"],
) as conn:
    ...
```
API Key:
```python
import os

from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential

async with connect(
    endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["AZURE_COGNITIVE_SERVICES_KEY"]),
    model="gpt-4o-realtime-preview",
) as conn:
    ...
```
Quick Start
```python
import asyncio
import os

from azure.ai.voicelive.aio import connect
from azure.identity.aio import DefaultAzureCredential

async def main():
    async with connect(
        endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
        credential=DefaultAzureCredential(),
        model="gpt-4o-realtime-preview",
        credential_scopes=["https://cognitiveservices.azure.com/.default"],
    ) as conn:
        # Update session with instructions
        await conn.session.update(session={
            "instructions": "You are a helpful assistant.",
            "modalities": ["text", "audio"],
            "voice": "alloy",
        })

        # Listen for events
        async for event in conn:
            print(f"Event: {event.type}")
            if event.type == "response.audio_transcript.done":
                print(f"Transcript: {event.transcript}")
            elif event.type == "response.done":
                break

asyncio.run(main())
```
Core Architecture
Connection Resources
The VoiceLiveConnection exposes these resources:
| Resource | Purpose | Key Methods |
|---|---|---|
| `conn.session` | Session configuration | `update(session=...)` |
| `conn.response` | Model responses | `create()`, `cancel()` |
| `conn.input_audio_buffer` | Audio input | `append()`, `commit()`, `clear()` |
| `conn.output_audio_buffer` | Audio output | `clear()` |
| `conn.conversation` | Conversation state | `item.create()`, `item.delete()`, `item.truncate()` |
| `conn.transcription_session` | Transcription config | `update(session=...)` |
Session Configuration
```python
from azure.ai.voicelive.models import RequestSession, FunctionTool

await conn.session.update(session=RequestSession(
    instructions="You are a helpful voice assistant.",
    modalities=["text", "audio"],
    voice="alloy",  # or "echo", "shimmer", "sage", etc.
    input_audio_format="pcm16",
    output_audio_format="pcm16",
    turn_detection={
        "type": "server_vad",
        "threshold": 0.5,
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500,
    },
    tools=[
        FunctionTool(
            type="function",
            name="get_weather",
            description="Get current weather",
            parameters={
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                },
                "required": ["location"],
            },
        )
    ],
))
```
Audio Streaming
Send Audio (Base64 PCM16)
```python
import base64

# Read an audio chunk (16-bit PCM, 24kHz mono);
# read_audio_from_microphone is a placeholder for your own capture code.
audio_chunk = await read_audio_from_microphone()
b64_audio = base64.b64encode(audio_chunk).decode()
await conn.input_audio_buffer.append(audio=b64_audio)
```
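For a concrete capture loop, here is a minimal sketch assuming the third-party `sounddevice` package (`pip install sounddevice`) and the default 24kHz mono PCM16 format; the chunk size and duration are arbitrary choices:

```python
import asyncio
import base64

import sounddevice as sd  # assumed third-party dependency

async def stream_microphone(conn, seconds: float = 10.0) -> None:
    """Capture 16-bit mono PCM at 24kHz and forward it to the service."""
    blocksize = 2400  # 100ms of audio at 24kHz
    with sd.RawInputStream(samplerate=24000, channels=1, dtype="int16",
                           blocksize=blocksize) as stream:
        for _ in range(int(seconds * 10)):
            # stream.read blocks briefly; production code should read in a thread.
            data, _overflowed = stream.read(blocksize)
            await conn.input_audio_buffer.append(
                audio=base64.b64encode(bytes(data)).decode()
            )
            await asyncio.sleep(0)  # yield so other tasks can run
```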
Receive Audio
```python
async for event in conn:
    if event.type == "response.audio.delta":
        audio_bytes = base64.b64decode(event.delta)
        await play_audio(audio_bytes)  # placeholder for your playback code
    elif event.type == "response.audio.done":
        print("Audio complete")
```
Event Handling
```python
import base64
import json

async for event in conn:
    match event.type:
        # Session events
        case "session.created":
            print(f"Session: {event.session}")
        case "session.updated":
            print("Session updated")

        # Audio input events
        case "input_audio_buffer.speech_started":
            print(f"Speech started at {event.audio_start_ms}ms")
        case "input_audio_buffer.speech_stopped":
            print(f"Speech stopped at {event.audio_end_ms}ms")

        # Transcription events
        case "conversation.item.input_audio_transcription.completed":
            print(f"User said: {event.transcript}")
        case "conversation.item.input_audio_transcription.delta":
            print(f"Partial: {event.delta}")

        # Response events
        case "response.created":
            print(f"Response started: {event.response.id}")
        case "response.audio_transcript.delta":
            print(event.delta, end="", flush=True)
        case "response.audio.delta":
            audio = base64.b64decode(event.delta)
        case "response.done":
            print(f"Response complete: {event.response.status}")

        # Function calls
        case "response.function_call_arguments.done":
            result = handle_function(event.name, event.arguments)  # user-defined dispatcher
            await conn.conversation.item.create(item={
                "type": "function_call_output",
                "call_id": event.call_id,
                "output": json.dumps(result),
            })
            await conn.response.create()

        # Errors
        case "error":
            print(f"Error: {event.error.message}")
```
Common Patterns
Manual Turn Mode (No VAD)
```python
await conn.session.update(session={"turn_detection": None})

# Manually control turns
await conn.input_audio_buffer.append(audio=b64_audio)
await conn.input_audio_buffer.commit()  # End of user turn
await conn.response.create()            # Trigger response
```
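Putting the pieces together, a sketch of one complete manual turn that streams a local WAV file; the file path is hypothetical, and the audio is assumed to already be 24kHz 16-bit mono PCM:

```python
import base64
import wave

async def send_turn_from_wav(conn, path: str = "question.wav") -> None:
    """Stream one WAV file as a user turn, then request a response."""
    with wave.open(path, "rb") as wav:
        while chunk := wav.readframes(4800):  # ~200ms at 24kHz
            await conn.input_audio_buffer.append(
                audio=base64.b64encode(chunk).decode()
            )
    await conn.input_audio_buffer.commit()  # end of user turn
    await conn.response.create()            # trigger the model's reply
```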
Interrupt Handling
```python
async for event in conn:
    if event.type == "input_audio_buffer.speech_started":
        # User interrupted - cancel current response
        await conn.response.cancel()
        await conn.output_audio_buffer.clear()
```
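To keep server-side history aligned with what the user actually heard, you can also truncate the interrupted assistant item via `conn.conversation.item.truncate()`. A sketch; tracking `current_item_id` and `played_ms` is your code's responsibility, and the parameter names here follow the underlying realtime protocol rather than confirmed SDK signatures:

```python
# Assumes you recorded the assistant item id from "response.created" events
# and how many milliseconds of its audio were actually played back.
await conn.conversation.item.truncate(
    item_id=current_item_id,
    content_index=0,
    audio_end_ms=played_ms,
)
```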
Conversation History
```python
# Add system message
await conn.conversation.item.create(item={
    "type": "message",
    "role": "system",
    "content": [{"type": "input_text", "text": "Be concise."}],
})

# Add user message
await conn.conversation.item.create(item={
    "type": "message",
    "role": "user",
    "content": [{"type": "input_text", "text": "Hello!"}],
})

await conn.response.create()
```
Voice Options
| Voice | Description |
|---|---|
| `alloy` | Neutral, balanced |
| `echo` | Warm, conversational |
| `shimmer` | Clear, professional |
| `sage` | Calm, authoritative |
| `coral` | Friendly, upbeat |
| `ash` | Deep, measured |
| `ballad` | Expressive |
| `verse` | Storytelling |
Azure voices: Use AzureStandardVoice, AzureCustomVoice, or AzurePersonalVoice models.
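For example, a sketch of selecting an Azure neural voice; the constructor parameter and the voice name are assumptions, not confirmed signatures:

```python
from azure.ai.voicelive.models import AzureStandardVoice, RequestSession

await conn.session.update(session=RequestSession(
    voice=AzureStandardVoice(name="en-US-AvaNeural"),  # assumed voice name
))
```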
Audio Formats
| Format | Sample Rate | Use Case |
|---|---|---|
| `pcm16` | 24kHz | Default, high quality |
| `pcm16-8000hz` | 8kHz | Telephony |
| `pcm16-16000hz` | 16kHz | Voice assistants |
| `g711_ulaw` | 8kHz | Telephony (US) |
| `g711_alaw` | 8kHz | Telephony (EU) |
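For example, a telephony integration might pin both directions to μ-law; a minimal sketch using the session keys shown above:

```python
await conn.session.update(session={
    "input_audio_format": "g711_ulaw",
    "output_audio_format": "g711_ulaw",
})
```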
Turn Detection Options
```python
# Server VAD (default)
{"type": "server_vad", "threshold": 0.5, "silence_duration_ms": 500}

# Azure Semantic VAD (smarter detection)
{"type": "azure_semantic_vad"}
{"type": "azure_semantic_vad_en"}            # English optimized
{"type": "azure_semantic_vad_multilingual"}
```
Error Handling
```python
from azure.ai.voicelive.aio import ConnectionError, ConnectionClosed

try:
    async with connect(...) as conn:
        async for event in conn:
            if event.type == "error":
                print(f"API Error: {event.error.code} - {event.error.message}")
except ConnectionClosed as e:
    print(f"Connection closed: {e.code} - {e.reason}")
except ConnectionError as e:
    print(f"Connection error: {e}")
```
References
- Detailed API Reference: See references/api-reference.md
- Complete Examples: See references/examples.md
- All Models & Types: See references/models.md
When to Use
Use this skill when building the real-time voice workflows described in the overview.
Primary Stack
Python
Tooling Surface
Guide only
Workspace Path
.agents/skills/azure-ai-voicelive-py