Accounts
Calls, Messages, Stages
Corpora, Query, Sources
Webhooks
Create Call
Creates a new call using the specified system prompt and other properties
curl --request POST \
--url https://api.ultravox.ai/api/calls \
--header 'Content-Type: application/json' \
--header 'X-API-Key: <api-key>' \
--data '{
"systemPrompt": "<string>",
"temperature": 123,
"model": "<string>",
"voice": "<string>",
"externalVoice": {
"elevenLabs": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"useSpeakerBoost": true,
"style": 123,
"similarityBoost": 123,
"stability": 123,
"pronunciationDictionaries": [
{
"dictionaryId": "<string>",
"versionId": "<string>"
}
]
},
"cartesia": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"emotion": "<string>"
},
"playHt": {
"userId": "<string>",
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"quality": "<string>",
"temperature": 123,
"emotion": 123,
"voiceGuidance": 123,
"styleGuidance": 123,
"textGuidance": 123,
"voiceConditioningSeconds": 123
},
"lmnt": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"conversational": true
}
},
"languageHint": "<string>",
"initialMessages": [
{
"role": "MESSAGE_ROLE_UNSPECIFIED",
"text": "<string>",
"invocationId": "<string>",
"toolName": "<string>",
"errorDetails": "<string>",
"medium": "MESSAGE_MEDIUM_UNSPECIFIED",
"callStageMessageIndex": 123,
"callStageId": "<string>",
"callState": {},
"timespan": {
"start": "<string>",
"end": "<string>"
}
}
],
"joinTimeout": "<string>",
"maxDuration": "<string>",
"timeExceededMessage": "<string>",
"inactivityMessages": [
{
"duration": "<string>",
"message": "<string>",
"endBehavior": "END_BEHAVIOR_UNSPECIFIED"
}
],
"selectedTools": [
{
"toolId": "<string>",
"toolName": "<string>",
"temporaryTool": {
"modelToolName": "<string>",
"description": "<string>",
"dynamicParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"schema": {},
"required": true
}
],
"staticParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"value": "<any>"
}
],
"automaticParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"knownValue": "KNOWN_PARAM_UNSPECIFIED"
}
],
"requirements": {
"httpSecurityOptions": {
"options": [
{
"requirements": {},
"ultravoxCallTokenRequirement": {
"scopes": [
"<string>"
]
}
}
]
},
"requiredParameterOverrides": [
"<string>"
]
},
"timeout": "<string>",
"precomputable": true,
"http": {
"baseUrlPattern": "<string>",
"httpMethod": "<string>"
},
"client": {},
"defaultReaction": "AGENT_REACTION_UNSPECIFIED"
},
"nameOverride": "<string>",
"authTokens": {},
"parameterOverrides": {}
}
],
"medium": {
"webRtc": {},
"twilio": {},
"serverWebSocket": {
"inputSampleRate": 123,
"outputSampleRate": 123,
"clientBufferSizeMs": 123
},
"telnyx": {},
"plivo": {},
"exotel": {}
},
"recordingEnabled": true,
"firstSpeaker": "FIRST_SPEAKER_UNSPECIFIED",
"transcriptOptional": true,
"initialOutputMedium": "MESSAGE_MEDIUM_UNSPECIFIED",
"vadSettings": {
"turnEndpointDelay": "<string>",
"minimumTurnDuration": "<string>",
"minimumInterruptionDuration": "<string>",
"frameActivationThreshold": 123
},
"firstSpeakerSettings": {
"user": {
"fallback": {
"delay": "<string>",
"text": "<string>"
}
},
"agent": {
"uninterruptible": true,
"text": "<string>",
"delay": "<string>"
}
},
"experimentalSettings": {},
"metadata": {},
"initialState": {}
}'
{
"callId": "3c90c3cc-0d44-4b50-8888-8dd25736052a",
"clientVersion": "<string>",
"created": "2023-11-07T05:31:56Z",
"joined": "2023-11-07T05:31:56Z",
"ended": "2023-11-07T05:31:56Z",
"endReason": "unjoined",
"firstSpeaker": "FIRST_SPEAKER_AGENT",
"firstSpeakerSettings": {
"user": {
"fallback": {
"delay": "<string>",
"text": "<string>"
}
},
"agent": {
"uninterruptible": true,
"text": "<string>",
"delay": "<string>"
}
},
"inactivityMessages": [
{
"duration": "<string>",
"message": "<string>",
"endBehavior": "END_BEHAVIOR_UNSPECIFIED"
}
],
"initialOutputMedium": "MESSAGE_MEDIUM_VOICE",
"joinTimeout": "30s",
"joinUrl": "<string>",
"languageHint": "<string>",
"maxDuration": "3600s",
"medium": {
"webRtc": {},
"twilio": {},
"serverWebSocket": {
"inputSampleRate": 123,
"outputSampleRate": 123,
"clientBufferSizeMs": 123
},
"telnyx": {},
"plivo": {},
"exotel": {}
},
"model": "fixie-ai/ultravox",
"recordingEnabled": false,
"systemPrompt": "<string>",
"temperature": 0,
"timeExceededMessage": "<string>",
"voice": "<string>",
"externalVoice": {
"elevenLabs": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"useSpeakerBoost": true,
"style": 123,
"similarityBoost": 123,
"stability": 123,
"pronunciationDictionaries": [
{
"dictionaryId": "<string>",
"versionId": "<string>"
}
]
},
"cartesia": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"emotion": "<string>"
},
"playHt": {
"userId": "<string>",
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"quality": "<string>",
"temperature": 123,
"emotion": 123,
"voiceGuidance": 123,
"styleGuidance": 123,
"textGuidance": 123,
"voiceConditioningSeconds": 123
},
"lmnt": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"conversational": true
}
},
"transcriptOptional": true,
"errorCount": 0,
"vadSettings": {
"turnEndpointDelay": "<string>",
"minimumTurnDuration": "<string>",
"minimumInterruptionDuration": "<string>",
"frameActivationThreshold": 123
},
"shortSummary": "<string>",
"summary": "<string>",
"experimentalSettings": "<any>",
"metadata": {},
"initialState": {}
}
Authorizations
API key
Query Parameters
Adds a prompt for a greeting if there's not an initial message that the model would naturally respond to (a user message or tool result).
The UUID of a prior call. When specified, the new call will use the same properties as the prior call unless overridden in this request's body. The new call will also use the prior call's message history as its own initial_messages. (It's illegal to also set initial_messages in the body.)
Body
A request to start a call.
The system prompt provided to the model during generations.
The model temperature, between 0 and 1. Defaults to 0.
The model used for generations. Defaults to fixie-ai/ultravox.
The ID (or name if unique) of the voice the agent should use for this call.
A voice not known to Ultravox Realtime that can nonetheless be used for this call.
Your account must have an API key set for the provider of the voice.
Either this or voice may be set, but not both.
A voice served by ElevenLabs.
The ID of the voice in ElevenLabs.
The ElevenLabs model to use.
The speaking rate. Must be between 0.7 and 1.2. Defaults to 1. See https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request.body.voice_settings.speed
A reference to a pronunciation dictionary within ElevenLabs.
A voice served by Cartesia.
The ID of the voice in Cartesia.
The Cartesia model to use.
The speaking rate. Must be between -1 and 1. Defaults to 0. See https://docs.cartesia.ai/api-reference/tts/tts#send.Generation%20Request.voice.Ttsrequest%20ID%20Specifier.__experimental_controls.speed
A voice served by PlayHT.
The "user id" for the PlayHT API. This must be the user who owns the Play API key associated with your Ultravox account.
The ID of the voice in PlayHT. Typically an s3 location.
The PlayHT model (aka "engine") to use.
The speaking rate. Must be between 0 and 5. Defaults to 1.
A voice served by LMNT.
The ID of the voice in LMNT.
The LMNT model to use.
The speaking rate. Must be between 0.25 and 2. Defaults to 1. See https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-speed
A BCP47 language code that may be used to guide speech recognition and synthesis.
The conversation history to start from for this call.
A message exchanged during a call.
The message's role.
MESSAGE_ROLE_UNSPECIFIED
, MESSAGE_ROLE_USER
, MESSAGE_ROLE_AGENT
, MESSAGE_ROLE_TOOL_CALL
, MESSAGE_ROLE_TOOL_RESULT
The message text for user and agent messages, tool arguments for tool_call messages, tool results for tool_result messages.
The invocation ID for tool messages. Used to pair tool calls with their results.
The tool name for tool messages.
For failed tool calls, additional debugging information. While the text field is presented to the model so it can respond to failures gracefully, the full details are only exposed via the Ultravox REST API.
The medium of the message.
MESSAGE_MEDIUM_UNSPECIFIED
, MESSAGE_MEDIUM_VOICE
, MESSAGE_MEDIUM_TEXT
The index of the message within the call stage.
The call stage this message appeared in.
If the message updated the call state, the new call state.
The timespan during the call when this message occurred. This is only set for messages that occurred during the call (stage) and not for messages in the call's (call stage's) initial messages.
A timeout for joining the call. Defaults to 30 seconds.
The maximum duration of the call. Defaults to 1 hour.
What the agent should say immediately before hanging up if the call's time limit is reached.
Messages spoken by the agent when the user is inactive for the specified duration. Durations are cumulative, so a message m > 1 with duration 30s will be spoken 30 seconds after message m-1.
A message the agent should say after some duration. The duration's meaning varies depending on the context.
The duration after which the message should be spoken.
The message to speak.
The behavior to exhibit when the message is finished being spoken.
END_BEHAVIOR_UNSPECIFIED
, END_BEHAVIOR_HANG_UP_SOFT
, END_BEHAVIOR_HANG_UP_STRICT
The tools available to the agent for (the first stage of) this call.
A tool selected for a particular call. Exactly one of tool_id, tool_name, or temporary_tool should be set.
The ID of an existing base tool.
The name of an existing base tool. The name must uniquely identify the tool.
A temporary tool definition, available only for this call (and subsequent calls created using priorCallId without overriding selected tools).
The name of the tool, as presented to the model. Must match ^[a-zA-Z0-9_-]{1,64}$.
The description of the tool.
The parameters that the tool accepts.
A dynamic parameter the tool accepts that may be set by the model.
The name of the parameter.
Where the parameter is used.
PARAMETER_LOCATION_UNSPECIFIED
, PARAMETER_LOCATION_QUERY
, PARAMETER_LOCATION_PATH
, PARAMETER_LOCATION_HEADER
, PARAMETER_LOCATION_BODY
The JsonSchema definition of the parameter. This typically includes things like type, description, enum values, format, other restrictions, etc.
Whether the parameter is required.
The static parameters added when the tool is invoked.
A static parameter that is unconditionally added when the tool is invoked. This parameter is not exposed to or set by the model.
The name of the parameter.
Where the parameter is used.
PARAMETER_LOCATION_UNSPECIFIED
, PARAMETER_LOCATION_QUERY
, PARAMETER_LOCATION_PATH
, PARAMETER_LOCATION_HEADER
, PARAMETER_LOCATION_BODY
The value of the parameter.
Additional parameters that are automatically set by the system when the tool is invoked.
A parameter that is automatically set by the system.
The name of the parameter.
Where the parameter is used.
PARAMETER_LOCATION_UNSPECIFIED
, PARAMETER_LOCATION_QUERY
, PARAMETER_LOCATION_PATH
, PARAMETER_LOCATION_HEADER
, PARAMETER_LOCATION_BODY
The value to set for the parameter.
KNOWN_PARAM_UNSPECIFIED
, KNOWN_PARAM_CALL_ID
, KNOWN_PARAM_CONVERSATION_HISTORY
, KNOWN_PARAM_OUTPUT_SAMPLE_RATE
, KNOWN_PARAM_CALL_STATE
Requirements that must be fulfilled when creating a call for the tool to be used.
Security requirements for an HTTP tool.
The options for security. Only one must be met. The first one that can be satisfied will be used in general. The single exception to this rule is that we always prefer a non-empty set of requirements over an empty set unless no non-empty set can be satisfied.
The security requirements for a request. All requirements must be met.
Dynamic parameters that must be overridden with an explicit (static) value.
The maximum amount of time the tool is allowed for execution. The conversation is frozen while tools run, so prefer sticking to the default unless you're comfortable with that consequence. If your tool is too slow for the default and can't be made faster, still try to keep this timeout as low as possible.
The tool is guaranteed to be non-mutating, repeatable, and free of side-effects. Such tools can safely be executed speculatively, reducing their effective latency. However, the fact they were called may not be reflected in the call history if their result ends up unused.
Details for a client-implemented tool. Only body parameters are allowed for client tools.
Indicates the default for how the agent should proceed after the tool is invoked. Can be overridden by the tool implementation via the X-Ultravox-Agent-Reaction header.
AGENT_REACTION_UNSPECIFIED
, AGENT_REACTION_SPEAKS
, AGENT_REACTION_LISTENS
, AGENT_REACTION_SPEAKS_ONCE
An override for the model_tool_name. This is primarily useful when using multiple instances of the same durable tool (presumably with different parameter overrides.) The set of tools used within a call must have a unique set of model names and every name must match this pattern: ^[a-zA-Z0-9_-]{1,64}$.
Auth tokens used to satisfy the tool's security requirements.
Static values to use in place of dynamic parameters. Any parameter included here will be hidden from the model and the static value will be used instead. Some tools may require certain parameters to be overridden, but any parameter can be overridden regardless of whether it is required to be.
Represents a dynamically typed value which can be either null, a number, a string, a boolean, a recursive struct value, or a list of values.
The medium used for this call.
The call will use WebRTC with the Ultravox client SDK. This is the default.
The call will use Twilio's "Media Streams" protocol. Once you have a join URL from starting a call, include it in your TwiML like so: <Connect><Stream url=${your-join-url} /></Connect> This works for both inbound and outbound calls.
The call will use a plain websocket connection. This is unlikely to yield an acceptable user experience if used from a browser or mobile client, but may be suitable for a server-to-server connection. This option provides a simple way to connect your own server to an Ultravox inference instance.
The sample rate for input (user) audio. Required.
The desired sample rate for output (agent) audio. If unset, defaults to the input_sample_rate.
The size of the client-side audio buffer in milliseconds. Smaller buffers allow for faster interruptions but may cause audio underflow if network latency fluctuates too greatly. For the best of both worlds, set this to some large value (e.g. 30000) and implement support for playback_clear_buffer messages. Defaults to 60.
The call will use Telnyx's media streaming protocol. Once you have a join URL from starting a call, include it in your TexML like so: <Connect><Stream url=${your-join-url} bidirectionalMode="rtp" /></Connect> This works for both inbound and outbound calls.
The call will use Plivo's AudioStreams protocol. Once you have a join URL from starting a call, include it in your Plivo XML like so: <Stream keepCallAlive="true" bidirectional="true" contentType="audio/x-l16;rate=16000">${your-join-url}</Stream> This works for both inbound and outbound calls.
The call will use Exotel's "Voicebot" protocol. Once you have a join URL from starting a call, provide it to Exotel as the wss target URL for your Voicebot (either directly or more likely dynamically from your own server).
Whether the call should be recorded.
Who should talk first when the call starts. Typically set to FIRST_SPEAKER_USER for outgoing
calls and left as the default (FIRST_SPEAKER_AGENT) otherwise.
Deprecated. Prefer firstSpeakerSettings. If both are set, they must match.
FIRST_SPEAKER_UNSPECIFIED
, FIRST_SPEAKER_AGENT
, FIRST_SPEAKER_USER
Indicates whether a transcript is optional for the call.
The medium to use for the call initially. May be altered by the client later. Defaults to voice.
MESSAGE_MEDIUM_UNSPECIFIED
, MESSAGE_MEDIUM_VOICE
, MESSAGE_MEDIUM_TEXT
VAD settings for the call.
The minimum amount of time the agent will wait to respond after the user seems to be done speaking. Increasing this value will make the agent less eager to respond, which may increase perceived response latency but will also make the agent less likely to jump in before the user is really done speaking.
Built-in VAD currently operates on 32ms frames, so only multiples of 32ms are meaningful. (Anything from 1ms to 31ms will produce the same result.)
Defaults to "0.384s" (384ms) as a starting point, but there's nothing special about this value aside from it corresponding to 12 VAD frames.
The minimum duration of user speech required to be considered a user turn. Increasing this value will cause the agent to ignore short user audio. This may be useful in particularly noisy environments, but it comes at the cost of possibly ignoring very short user responses such as "yes" or "no".
Defaults to "0s" meaning the agent considers all user audio inputs (that make it through built-in noise cancellation).
The minimum duration of user speech required to interrupt the agent. This works the same way as minimumTurnDuration, but allows for a higher threshold for interrupting the agent. (This value will be ignored if it is less than minimumTurnDuration.)
Defaults to "0.09s" (90ms) as a starting point, but there's nothing special about this value.
The threshold for the VAD to consider a frame as speech. This is a value between 0.1 and 1.
Minimum value is 0.1, which is the default value.
The settings for the initial message to get a conversation started.
Defaults to agent: {}
which means the agent will start the conversation with an
(interruptible) greeting generated based on the system prompt and any initial messages.
(If first_speaker is set and this is not, first_speaker will be used instead.)
If set, the user should speak first.
If set, the agent will start the conversation itself if the user doesn't start speaking within the given delay.
If set, the agent should speak first.
Whether the user should be prevented from interrupting the agent's first message. Defaults to false (meaning the agent is interruptible as usual).
What the agent should say. If unset, the model will generate a greeting.
If set, the agent will wait this long before starting its greeting. This may be useful for ensuring the user is ready.
Experimental settings for the call.
Optional metadata key-value pairs to associate with the call. All values must be strings. Keys may not start with "ultravox.", which is reserved for system-provided metadata.
The initial state of the call stage which is readable/writable by tools.
Response
The version of the client that joined this call.
The reason the call ended.
unjoined - Client never joined
hangup - Client hung up
agent_hangup - Agent hung up
timeout - Call timed out
connection_error - Connection error
system_error - System error
unjoined
, hangup
, agent_hangup
, timeout
, connection_error
, system_error
Who was supposed to talk first when the call started. Typically set to FIRST_SPEAKER_USER for outgoing calls and left as the default (FIRST_SPEAKER_AGENT) otherwise.
FIRST_SPEAKER_AGENT
, FIRST_SPEAKER_USER
Settings for the initial message to get the call started.
If set, the user should speak first.
If set, the agent will start the conversation itself if the user doesn't start speaking within the given delay.
If set, the agent should speak first.
Whether the user should be prevented from interrupting the agent's first message. Defaults to false (meaning the agent is interruptible as usual).
What the agent should say. If unset, the model will generate a greeting.
If set, the agent will wait this long before starting its greeting. This may be useful for ensuring the user is ready.
The medium used initially by the agent. May later be changed by the client.
MESSAGE_MEDIUM_VOICE
, MESSAGE_MEDIUM_TEXT
The number of errors in this call.
A short summary of the call.
A summary of the call.
Experimental settings for the call.
Optional metadata key-value pairs to associate with the call. All values must be strings.
The initial state of the call which is readable/writable by tools.
Messages spoken by the agent when the user is inactive for the specified duration. Durations are cumulative, so a message m > 1 with duration 30s will be spoken 30 seconds after message m-1.
A message the agent should say after some duration. The duration's meaning varies depending on the context.
The duration after which the message should be spoken.
The message to speak.
The behavior to exhibit when the message is finished being spoken.
END_BEHAVIOR_UNSPECIFIED
, END_BEHAVIOR_HANG_UP_SOFT
, END_BEHAVIOR_HANG_UP_STRICT
BCP47 language code that may be used to guide speech recognition.
16
Details about a call's protocol. By default, calls occur over WebRTC using the Ultravox client SDK. Setting a different call medium will prepare the server for a call using a different protocol. At most one call medium may be set.
The call will use WebRTC with the Ultravox client SDK. This is the default.
The call will use Twilio's "Media Streams" protocol. Once you have a join URL from starting a call, include it in your TwiML like so: <Connect><Stream url=${your-join-url} /></Connect> This works for both inbound and outbound calls.
The call will use a plain websocket connection. This is unlikely to yield an acceptable user experience if used from a browser or mobile client, but may be suitable for a server-to-server connection. This option provides a simple way to connect your own server to an Ultravox inference instance.
The sample rate for input (user) audio. Required.
The desired sample rate for output (agent) audio. If unset, defaults to the input_sample_rate.
The size of the client-side audio buffer in milliseconds. Smaller buffers allow for faster interruptions but may cause audio underflow if network latency fluctuates too greatly. For the best of both worlds, set this to some large value (e.g. 30000) and implement support for playback_clear_buffer messages. Defaults to 60.
The call will use Telnyx's media streaming protocol. Once you have a join URL from starting a call, include it in your TexML like so: <Connect><Stream url=${your-join-url} bidirectionalMode="rtp" /></Connect> This works for both inbound and outbound calls.
The call will use Plivo's AudioStreams protocol. Once you have a join URL from starting a call, include it in your Plivo XML like so: <Stream keepCallAlive="true" bidirectional="true" contentType="audio/x-l16;rate=16000">${your-join-url}</Stream> This works for both inbound and outbound calls.
The call will use Exotel's "Voicebot" protocol. Once you have a join URL from starting a call, provide it to Exotel as the wss target URL for your Voicebot (either directly or more likely dynamically from your own server).
0 <= x <= 1
A voice not known to Ultravox Realtime that can nonetheless be used for a call. Such voices are significantly less validated than normal voices and you'll be responsible for your own TTS-related errors. Exactly one field must be set.
A voice served by ElevenLabs.
The ID of the voice in ElevenLabs.
The ElevenLabs model to use.
The speaking rate. Must be between 0.7 and 1.2. Defaults to 1. See https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request.body.voice_settings.speed
A reference to a pronunciation dictionary within ElevenLabs.
A voice served by Cartesia.
The ID of the voice in Cartesia.
The Cartesia model to use.
The speaking rate. Must be between -1 and 1. Defaults to 0. See https://docs.cartesia.ai/api-reference/tts/tts#send.Generation%20Request.voice.Ttsrequest%20ID%20Specifier.__experimental_controls.speed
A voice served by PlayHT.
The "user id" for the PlayHT API. This must be the user who owns the Play API key associated with your Ultravox account.
The ID of the voice in PlayHT. Typically an s3 location.
The PlayHT model (aka "engine") to use.
The speaking rate. Must be between 0 and 5. Defaults to 1.
A voice served by LMNT.
The ID of the voice in LMNT.
The LMNT model to use.
The speaking rate. Must be between 0.25 and 2. Defaults to 1. See https://docs.lmnt.com/api-reference/speech/synthesize-speech-bytes#body-speed
Indicates whether a transcript is optional for the call.
VAD settings for the call.
The minimum amount of time the agent will wait to respond after the user seems to be done speaking. Increasing this value will make the agent less eager to respond, which may increase perceived response latency but will also make the agent less likely to jump in before the user is really done speaking.
Built-in VAD currently operates on 32ms frames, so only multiples of 32ms are meaningful. (Anything from 1ms to 31ms will produce the same result.)
Defaults to "0.384s" (384ms) as a starting point, but there's nothing special about this value aside from it corresponding to 12 VAD frames.
The minimum duration of user speech required to be considered a user turn. Increasing this value will cause the agent to ignore short user audio. This may be useful in particularly noisy environments, but it comes at the cost of possibly ignoring very short user responses such as "yes" or "no".
Defaults to "0s" meaning the agent considers all user audio inputs (that make it through built-in noise cancellation).
The minimum duration of user speech required to interrupt the agent. This works the same way as minimumTurnDuration, but allows for a higher threshold for interrupting the agent. (This value will be ignored if it is less than minimumTurnDuration.)
Defaults to "0.09s" (90ms) as a starting point, but there's nothing special about this value.
The threshold for the VAD to consider a frame as speech. This is a value between 0.1 and 1.
Minimum value is 0.1, which is the default value.
curl --request POST \
--url https://api.ultravox.ai/api/calls \
--header 'Content-Type: application/json' \
--header 'X-API-Key: <api-key>' \
--data '{
"systemPrompt": "<string>",
"temperature": 123,
"model": "<string>",
"voice": "<string>",
"externalVoice": {
"elevenLabs": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"useSpeakerBoost": true,
"style": 123,
"similarityBoost": 123,
"stability": 123,
"pronunciationDictionaries": [
{
"dictionaryId": "<string>",
"versionId": "<string>"
}
]
},
"cartesia": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"emotion": "<string>"
},
"playHt": {
"userId": "<string>",
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"quality": "<string>",
"temperature": 123,
"emotion": 123,
"voiceGuidance": 123,
"styleGuidance": 123,
"textGuidance": 123,
"voiceConditioningSeconds": 123
},
"lmnt": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"conversational": true
}
},
"languageHint": "<string>",
"initialMessages": [
{
"role": "MESSAGE_ROLE_UNSPECIFIED",
"text": "<string>",
"invocationId": "<string>",
"toolName": "<string>",
"errorDetails": "<string>",
"medium": "MESSAGE_MEDIUM_UNSPECIFIED",
"callStageMessageIndex": 123,
"callStageId": "<string>",
"callState": {},
"timespan": {
"start": "<string>",
"end": "<string>"
}
}
],
"joinTimeout": "<string>",
"maxDuration": "<string>",
"timeExceededMessage": "<string>",
"inactivityMessages": [
{
"duration": "<string>",
"message": "<string>",
"endBehavior": "END_BEHAVIOR_UNSPECIFIED"
}
],
"selectedTools": [
{
"toolId": "<string>",
"toolName": "<string>",
"temporaryTool": {
"modelToolName": "<string>",
"description": "<string>",
"dynamicParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"schema": {},
"required": true
}
],
"staticParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"value": "<any>"
}
],
"automaticParameters": [
{
"name": "<string>",
"location": "PARAMETER_LOCATION_UNSPECIFIED",
"knownValue": "KNOWN_PARAM_UNSPECIFIED"
}
],
"requirements": {
"httpSecurityOptions": {
"options": [
{
"requirements": {},
"ultravoxCallTokenRequirement": {
"scopes": [
"<string>"
]
}
}
]
},
"requiredParameterOverrides": [
"<string>"
]
},
"timeout": "<string>",
"precomputable": true,
"http": {
"baseUrlPattern": "<string>",
"httpMethod": "<string>"
},
"client": {},
"defaultReaction": "AGENT_REACTION_UNSPECIFIED"
},
"nameOverride": "<string>",
"authTokens": {},
"parameterOverrides": {}
}
],
"medium": {
"webRtc": {},
"twilio": {},
"serverWebSocket": {
"inputSampleRate": 123,
"outputSampleRate": 123,
"clientBufferSizeMs": 123
},
"telnyx": {},
"plivo": {},
"exotel": {}
},
"recordingEnabled": true,
"firstSpeaker": "FIRST_SPEAKER_UNSPECIFIED",
"transcriptOptional": true,
"initialOutputMedium": "MESSAGE_MEDIUM_UNSPECIFIED",
"vadSettings": {
"turnEndpointDelay": "<string>",
"minimumTurnDuration": "<string>",
"minimumInterruptionDuration": "<string>",
"frameActivationThreshold": 123
},
"firstSpeakerSettings": {
"user": {
"fallback": {
"delay": "<string>",
"text": "<string>"
}
},
"agent": {
"uninterruptible": true,
"text": "<string>",
"delay": "<string>"
}
},
"experimentalSettings": {},
"metadata": {},
"initialState": {}
}'
{
"callId": "3c90c3cc-0d44-4b50-8888-8dd25736052a",
"clientVersion": "<string>",
"created": "2023-11-07T05:31:56Z",
"joined": "2023-11-07T05:31:56Z",
"ended": "2023-11-07T05:31:56Z",
"endReason": "unjoined",
"firstSpeaker": "FIRST_SPEAKER_AGENT",
"firstSpeakerSettings": {
"user": {
"fallback": {
"delay": "<string>",
"text": "<string>"
}
},
"agent": {
"uninterruptible": true,
"text": "<string>",
"delay": "<string>"
}
},
"inactivityMessages": [
{
"duration": "<string>",
"message": "<string>",
"endBehavior": "END_BEHAVIOR_UNSPECIFIED"
}
],
"initialOutputMedium": "MESSAGE_MEDIUM_VOICE",
"joinTimeout": "30s",
"joinUrl": "<string>",
"languageHint": "<string>",
"maxDuration": "3600s",
"medium": {
"webRtc": {},
"twilio": {},
"serverWebSocket": {
"inputSampleRate": 123,
"outputSampleRate": 123,
"clientBufferSizeMs": 123
},
"telnyx": {},
"plivo": {},
"exotel": {}
},
"model": "fixie-ai/ultravox",
"recordingEnabled": false,
"systemPrompt": "<string>",
"temperature": 0,
"timeExceededMessage": "<string>",
"voice": "<string>",
"externalVoice": {
"elevenLabs": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"useSpeakerBoost": true,
"style": 123,
"similarityBoost": 123,
"stability": 123,
"pronunciationDictionaries": [
{
"dictionaryId": "<string>",
"versionId": "<string>"
}
]
},
"cartesia": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"emotion": "<string>"
},
"playHt": {
"userId": "<string>",
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"quality": "<string>",
"temperature": 123,
"emotion": 123,
"voiceGuidance": 123,
"styleGuidance": 123,
"textGuidance": 123,
"voiceConditioningSeconds": 123
},
"lmnt": {
"voiceId": "<string>",
"model": "<string>",
"speed": 123,
"conversational": true
}
},
"transcriptOptional": true,
"errorCount": 0,
"vadSettings": {
"turnEndpointDelay": "<string>",
"minimumTurnDuration": "<string>",
"minimumInterruptionDuration": "<string>",
"frameActivationThreshold": 123
},
"shortSummary": "<string>",
"summary": "<string>",
"experimentalSettings": "<any>",
"metadata": {},
"initialState": {}
}