Add voice capabilities to your agents by converting between audio and text using lifecycle hooks. This enables use cases like voice assistants, audio chatbots, and speech-to-speech interactions.
The examples below use OpenAI’s APIs, but you can use any transcription or text-to-speech service (ElevenLabs, Google Cloud, or custom implementations). Simply replace the `stt` and `tts` functions with your preferred provider’s API.
Speech-to-Text (STT) with Pre-hooks
Use a pre_hook to transcribe audio files before the agent processes them. This allows you to use any text model, not just audio-capable ones:
from timbal import Agent
from timbal.state import get_run_context
from timbal.types.file import File
import os
from openai import AsyncOpenAI
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
async def stt(audio_file: File) -> str:
    """Transcribe an audio file to text using OpenAI Whisper."""
    result = await client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
    )
    return result.text
async def pre_hook():
    """Pre-hook: swap an audio prompt for its transcription."""
    span = get_run_context().current_span()
    prompt = span.input.get("prompt")
    # Only act on File inputs that carry an audio/* content type.
    if not isinstance(prompt, File):
        return
    content_type = prompt.__content_type__
    if not (content_type and content_type.startswith("audio/")):
        return
    # Replace the audio with text so any text model can process it.
    span.input["prompt"] = await stt(prompt)
agent = Agent(
name="AudioAgent",
model="openai/gpt-5.2",
pre_hook=pre_hook
)
audio_file = File.validate("/path/to/recording.wav")
result = await agent(prompt=audio_file).collect()
Text-to-Speech (TTS) with Post-hooks
Use a post_hook to convert the agent’s text response to audio:
import os
from openai import AsyncOpenAI
from timbal import Agent
from timbal.state import get_run_context
from timbal.types.file import File
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
async def tts(text: str) -> File:
    """Synthesize speech for *text* and return it as an MP3 File."""
    speech = await client.audio.speech.create(
        input=text,
        model="tts-1",
        voice="alloy",
    )
    # Wrap the raw audio bytes in a timbal File, tagged as .mp3.
    return File.validate(speech.content, {"extension": ".mp3"})
async def post_hook():
    """Post-hook: replace the agent's text output with synthesized audio."""
    span = get_run_context().current_span()
    span.output = await tts(span.output.collect_text())
agent = Agent(
name="VoiceAgent",
model="openai/gpt-5.2",
post_hook=post_hook,
)
result = await agent(prompt="What is the capital of France?").collect()
# result.output is now an audio File — play it with your preferred method
Speech-to-Speech Interactions
Combine both patterns to build agents that understand and respond in audio:
import os
from openai import AsyncOpenAI
from timbal import Agent
from timbal.state import get_run_context
from timbal.types.file import File
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
async def stt(audio_file: File) -> str:
    """Run Whisper over *audio_file* and return the transcript text."""
    result = await client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
    )
    return result.text
async def tts(text: str) -> File:
    """Turn *text* into spoken audio, returned as an MP3 File."""
    audio = await client.audio.speech.create(
        input=text,
        model="tts-1",
        voice="alloy",
    )
    return File.validate(audio.content, {"extension": ".mp3"})
async def pre_hook():
    """Transcribe an audio prompt so the text model can read it."""
    span = get_run_context().current_span()
    prompt = span.input.get("prompt")
    # Ignore anything that is not an audio File.
    if not isinstance(prompt, File):
        return
    mime = prompt.__content_type__
    if mime and mime.startswith("audio/"):
        span.input["prompt"] = await stt(prompt)
async def post_hook():
    """Speak the agent's text reply back as audio."""
    span = get_run_context().current_span()
    reply_text = span.output.collect_text()
    span.output = await tts(reply_text)
agent = Agent(
name="VoiceAssistant",
model="openai/gpt-5.2",
pre_hook=pre_hook,
post_hook=post_hook,
)
# Agent receives audio and responds with audio
audio_input = File.validate("/path/to/recording.wav")
result = await agent(prompt=audio_input).collect()
# result.output is an audio File — play it with your preferred method
Key Concepts
- Pre-hooks for STT: Transcribe audio files before processing, enabling any text model to work with audio
- Post-hooks for TTS: Convert text responses to audio after generation
- Flexible Providers: Use any transcription or synthesis service (OpenAI, ElevenLabs, or custom implementations)
- Modality Conversion: Hooks enable seamless conversion between audio and text modalities