Learn how to build a voice agent pipeline using ElevenLabs STT/TTS with OpenAI LLM and Maxim tracing
This tutorial demonstrates how to build a complete voice agent pipeline that converts speech to text, processes it with an LLM, and generates speech output. The entire pipeline is traced end-to-end using Maxim for full observability.
The agent uses ElevenLabs' transcription and synthesis capabilities with an external LLM to generate the response.
2. Initialize Maxim Logger and Instrument ElevenLabs
# Initialize Maxim logger# This automatically picks up MAXIM_API_KEY and MAXIM_LOG_REPO_ID from environment variableslogger = Maxim().logger()# Instrument ElevenLabs STT/TTS methodsinstrument_elevenlabs(logger)# Initialize ElevenLabs clientelevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)# Initialize OpenAI client with Maxim integrationopenai_client = MaximOpenAIClient( client=OpenAI(api_key=OPENAI_API_KEY), logger=logger)
Creates a Maxim logger instance that automatically reads credentials from environment variables.
instrument_elevenlabs patches ElevenLabs SDK methods to automatically capture STT and TTS operations as spans.
MaximOpenAIClient wraps the OpenAI client to trace LLM calls within the same trace context.
The OpenAI integration is used to demonstrate how to trace LLM calls with Maxim in addition to ElevenLabs. You can use any other LLM provider you want.
def call_openai_llm(transcript: str, trace_id: str) -> str:
    """Generate an assistant reply for *transcript* via OpenAI.

    The Maxim trace ID is forwarded in the request headers so this LLM
    call is recorded on the same trace as the STT/TTS operations.

    Args:
        transcript: The user's transcribed speech.
        trace_id: Maxim trace ID shared with the rest of the pipeline.

    Returns:
        The model's response text.
    """
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Respond concisely and naturally."},
            {"role": "user", "content": transcript},
        ],
        # Linking header: ties this LLM span to the shared pipeline trace.
        extra_headers={"x-maxim-trace-id": trace_id},
    )
    return completion.choices[0].message.content
Sends the transcribed text to OpenAI’s GPT-4o-mini model for processing.
Uses x-maxim-trace-id header to link this LLM call to the same trace as STT and TTS operations.
Returns the generated response text for TTS conversion.
def stt_tts_pipeline_agent(): """ A simple agent that demonstrates the STT-LLM-TTS pipeline with unified tracing. Flow: 1. User provides audio input (speech) 2. STT converts audio to text (transcript) - instrumented, sets trace input 3. OpenAI LLM processes the transcript and generates a response - uses same trace ID 4. TTS converts LLM response text to audio - instrumented, sets trace output 5. Audio is returned as output """ # Create a shared trace ID for the entire pipeline trace_id = str(uuid4()) trace = logger.trace( TraceConfigDict( id=trace_id, name="STT-OpenAI-TTS Pipeline Agent", tags={"provider": "elevenlabs+openai", "operation": "pipeline"}, ) ) # Create request options with trace_id header for both STT and TTS request_options = RequestOptions( additional_headers={ "x-maxim-trace-id": trace_id } ) print("=== STT-OpenAI-TTS Pipeline Agent ===") print(f"Trace ID: {trace_id}")
Generates a unique trace ID to correlate all operations in the pipeline.
Creates a Maxim trace with descriptive name and tags for easy filtering.
Configures RequestOptions with the trace ID header for ElevenLabs API calls.
else:
    # Fallback path: no sample audio file, so exercise only the LLM + TTS
    # half of the pipeline with a hard-coded transcript.
    print(f"Sample audio file not found at {audio_file_path}")
    print("Creating a simple STT-LLM-TTS example instead...")
    # Create a dummy transcript for testing
    dummy_transcript = "Hello, how are you?"
    print(f"Using dummy transcript: {dummy_transcript}")
    # No STT ran, so set the trace input manually instead of relying on
    # the instrumented speech_to_text call.
    trace.set_input(dummy_transcript)
    # OpenAI LLM processing (joins the same trace via trace_id)
    response_text = call_openai_llm(dummy_transcript, trace_id)
    print(f"LLM Response: {response_text}")
    # Text-to-Speech only
    # NOTE(review): audio_output is never played or consumed here; if the SDK
    # returns a lazy generator, the TTS request may not actually execute — confirm.
    audio_output = elevenlabs_client.text_to_speech.convert(
        text=response_text,
        voice_id="JBFqnCBsd6RMkjVDRZzb",
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_128",
        request_options=request_options
    )

# Close the trace so it is flushed to Maxim.
trace.end()
Provides a fallback when no audio file is available for testing.
Manually sets the trace input using trace.set_input().
Demonstrates that the TTS portion works independently of STT.
"""Example agent using ElevenLabs STT-TTS pipeline with OpenAI LLM and Maxim tracing."""import osfrom uuid import uuid4from dotenv import load_dotenvfrom elevenlabs.play import playfrom elevenlabs.client import ElevenLabsfrom elevenlabs.core import RequestOptionsfrom openai import OpenAIfrom maxim import Maximfrom maxim.logger.components.trace import TraceConfigDictfrom maxim.logger.elevenlabs import instrument_elevenlabsfrom maxim.logger.openai import MaximOpenAIClientload_dotenv()# ConfigurationELEVENLABS_API_KEY = os.getenv("EL_API_KEY")OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")if not ELEVENLABS_API_KEY: raise ValueError("ELEVENLABS_API_KEY environment variable is not set")if not OPENAI_API_KEY: raise ValueError("OPENAI_API_KEY environment variable is not set")# Initialize Maxim logger# This automatically picks up MAXIM_API_KEY and MAXIM_LOG_REPO_ID from environment variableslogger = Maxim().logger()# Instrument ElevenLabs STT/TTS methodsinstrument_elevenlabs(logger)# Initialize ElevenLabs clientelevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)# Initialize OpenAI client with Maxim integrationopenai_client = MaximOpenAIClient( client=OpenAI(api_key=OPENAI_API_KEY), logger=logger)def call_openai_llm(transcript: str, trace_id: str) -> str: """ Call OpenAI LLM to generate a response based on the user's transcript. Uses the same trace ID to link the LLM call with the STT-TTS pipeline. """ messages = [ {"role": "system", "content": "You are a helpful assistant. Respond concisely and naturally."}, {"role": "user", "content": transcript}, ] # Create a chat completion request with trace ID in extra_headers response = openai_client.chat.completions.create( model="gpt-4o-mini", messages=messages, extra_headers={ "x-maxim-trace-id": trace_id } ) # Extract response text response_text = response.choices[0].message.content return response_textdef stt_tts_pipeline_agent(): """ A simple agent that demonstrates the STT-LLM-TTS pipeline with unified tracing. Flow: 1. 
User provides audio input (speech) 2. STT converts audio to text (transcript) - instrumented, sets trace input 3. OpenAI LLM processes the transcript and generates a response - uses same trace ID 4. TTS converts LLM response text to audio - instrumented, sets trace output 5. Audio is returned as output All operations (STT, LLM, TTS) are traced under a single trace via instrumentation. The trace input is the user's speech transcript, and the output is the LLM response text. Both user speech and assistant speech audio files are attached to the trace. """ # Create a shared trace ID for the entire pipeline trace_id = str(uuid4()) trace = logger.trace( TraceConfigDict( id=trace_id, name="STT-OpenAI-TTS Pipeline Agent", tags={"provider": "elevenlabs+openai", "operation": "pipeline"}, ) ) # Create request options with trace_id header for both STT and TTS request_options = RequestOptions( additional_headers={ "x-maxim-trace-id": trace_id } ) print("=== STT-OpenAI-TTS Pipeline Agent ===") print(f"Trace ID: {trace_id}") audio_file_path = os.path.join( os.path.dirname(__file__), "files", "sample_audio.wav" ) # Check if sample file exists, otherwise create a dummy scenario if os.path.exists(audio_file_path): print(f"Processing audio file: {audio_file_path}") # Convert speech to text # This will add to the existing trace (trace_id from request_options) # - Input: audio attachment (speech) # - Output: transcript text with open(audio_file_path, "rb") as audio_file: transcript = elevenlabs_client.speech_to_text.convert( file=audio_file, model_id="scribe_v1", request_options=request_options ) # Extract transcript text from the result object transcript_text = "" if isinstance(transcript, str): transcript_text = transcript elif hasattr(transcript, "text"): transcript_text = transcript.text elif isinstance(transcript, dict) and "text" in transcript: transcript_text = transcript["text"] else: transcript_text = str(transcript) print(f"Transcript: {transcript_text}") # OpenAI LLM 
processing print("\n=== OpenAI LLM Processing ===") response_text = call_openai_llm(transcript_text, trace_id) print(f"LLM Response: {response_text}") # Text-to-Speech print("\n=== Text-to-Speech ===") # Convert LLM response text to speech # This will also add to the same trace (trace_id from request_options) # - Input: LLM response text (already set as trace output above) # - Output: audio attachment (assistant speech) audio_output = elevenlabs_client.text_to_speech.convert( text=response_text, voice_id="JBFqnCBsd6RMkjVDRZzb", model_id="eleven_multilingual_v2", output_format="mp3_44100_128", request_options=request_options ) play(audio_output) else: print(f"Sample audio file not found at {audio_file_path}") print("Creating a simple STT-LLM-TTS example instead...") # Create a dummy transcript for testing dummy_transcript = "Hello, how are you?" print(f"Using dummy transcript: {dummy_transcript}") # Set trace input to the transcript trace.set_input(dummy_transcript) # OpenAI LLM processing print("\n=== OpenAI LLM Processing ===") response_text = call_openai_llm(dummy_transcript, trace_id) print(f"LLM Response: {response_text}") # Text-to-Speech print("\n=== Text-to-Speech ===") audio_output = elevenlabs_client.text_to_speech.convert( text=response_text, voice_id="JBFqnCBsd6RMkjVDRZzb", model_id="eleven_multilingual_v2", output_format="mp3_44100_128", request_options=request_options ) trace.end()if __name__ == "__main__": try: stt_tts_pipeline_agent() finally: logger.cleanup()