POST /api/calls
curl --request POST \
  --url https://api.ultravox.ai/api/calls \
  --header 'Content-Type: application/json' \
  --header 'X-API-Key: <api-key>' \
  --data '{
  "systemPrompt": "<string>",
  "temperature": 123,
  "model": "<string>",
  "voice": "<string>",
  "externalVoice": {
    "elevenLabs": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "useSpeakerBoost": true,
      "style": 123,
      "similarityBoost": 123,
      "stability": 123,
      "pronunciationDictionaries": [
        {
          "dictionaryId": "<string>",
          "versionId": "<string>"
        }
      ]
    },
    "cartesia": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "emotion": "<string>"
    },
    "playHt": {
      "userId": "<string>",
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "quality": "<string>",
      "temperature": 123,
      "emotion": 123,
      "voiceGuidance": 123,
      "styleGuidance": 123,
      "textGuidance": 123,
      "voiceConditioningSeconds": 123
    },
    "lmnt": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "conversational": true
    }
  },
  "languageHint": "<string>",
  "initialMessages": [
    {
      "role": "MESSAGE_ROLE_UNSPECIFIED",
      "text": "<string>",
      "invocationId": "<string>",
      "toolName": "<string>",
      "errorDetails": "<string>",
      "medium": "MESSAGE_MEDIUM_UNSPECIFIED",
      "callStageMessageIndex": 123,
      "callStageId": "<string>",
      "callState": {},
      "timespan": {
        "start": "<string>",
        "end": "<string>"
      }
    }
  ],
  "joinTimeout": "<string>",
  "maxDuration": "<string>",
  "timeExceededMessage": "<string>",
  "inactivityMessages": [
    {
      "duration": "<string>",
      "message": "<string>",
      "endBehavior": "END_BEHAVIOR_UNSPECIFIED"
    }
  ],
  "selectedTools": [
    {
      "toolId": "<string>",
      "toolName": "<string>",
      "temporaryTool": {
        "modelToolName": "<string>",
        "description": "<string>",
        "dynamicParameters": [
          {
            "name": "<string>",
            "location": "PARAMETER_LOCATION_UNSPECIFIED",
            "schema": {},
            "required": true
          }
        ],
        "staticParameters": [
          {
            "name": "<string>",
            "location": "PARAMETER_LOCATION_UNSPECIFIED",
            "value": "<any>"
          }
        ],
        "automaticParameters": [
          {
            "name": "<string>",
            "location": "PARAMETER_LOCATION_UNSPECIFIED",
            "knownValue": "KNOWN_PARAM_UNSPECIFIED"
          }
        ],
        "requirements": {
          "httpSecurityOptions": {
            "options": [
              {
                "requirements": {},
                "ultravoxCallTokenRequirement": {
                  "scopes": [
                    "<string>"
                  ]
                }
              }
            ]
          },
          "requiredParameterOverrides": [
            "<string>"
          ]
        },
        "timeout": "<string>",
        "precomputable": true,
        "http": {
          "baseUrlPattern": "<string>",
          "httpMethod": "<string>"
        },
        "client": {},
        "defaultReaction": "AGENT_REACTION_UNSPECIFIED"
      },
      "nameOverride": "<string>",
      "authTokens": {},
      "parameterOverrides": {}
    }
  ],
  "medium": {
    "webRtc": {},
    "twilio": {},
    "serverWebSocket": {
      "inputSampleRate": 123,
      "outputSampleRate": 123,
      "clientBufferSizeMs": 123
    },
    "telnyx": {},
    "plivo": {},
    "exotel": {}
  },
  "recordingEnabled": true,
  "firstSpeaker": "FIRST_SPEAKER_UNSPECIFIED",
  "transcriptOptional": true,
  "initialOutputMedium": "MESSAGE_MEDIUM_UNSPECIFIED",
  "vadSettings": {
    "turnEndpointDelay": "<string>",
    "minimumTurnDuration": "<string>",
    "minimumInterruptionDuration": "<string>",
    "frameActivationThreshold": 123
  },
  "firstSpeakerSettings": {
    "user": {
      "fallback": {
        "delay": "<string>",
        "text": "<string>"
      }
    },
    "agent": {
      "uninterruptible": true,
      "text": "<string>",
      "delay": "<string>"
    }
  },
  "experimentalSettings": {},
  "metadata": {},
  "initialState": {}
}'
{
  "callId": "3c90c3cc-0d44-4b50-8888-8dd25736052a",
  "clientVersion": "<string>",
  "created": "2023-11-07T05:31:56Z",
  "joined": "2023-11-07T05:31:56Z",
  "ended": "2023-11-07T05:31:56Z",
  "endReason": "unjoined",
  "firstSpeaker": "FIRST_SPEAKER_AGENT",
  "firstSpeakerSettings": {
    "user": {
      "fallback": {
        "delay": "<string>",
        "text": "<string>"
      }
    },
    "agent": {
      "uninterruptible": true,
      "text": "<string>",
      "delay": "<string>"
    }
  },
  "inactivityMessages": [
    {
      "duration": "<string>",
      "message": "<string>",
      "endBehavior": "END_BEHAVIOR_UNSPECIFIED"
    }
  ],
  "initialOutputMedium": "MESSAGE_MEDIUM_VOICE",
  "joinTimeout": "30s",
  "joinUrl": "<string>",
  "languageHint": "<string>",
  "maxDuration": "3600s",
  "medium": {
    "webRtc": {},
    "twilio": {},
    "serverWebSocket": {
      "inputSampleRate": 123,
      "outputSampleRate": 123,
      "clientBufferSizeMs": 123
    },
    "telnyx": {},
    "plivo": {},
    "exotel": {}
  },
  "model": "fixie-ai/ultravox",
  "recordingEnabled": false,
  "systemPrompt": "<string>",
  "temperature": 0,
  "timeExceededMessage": "<string>",
  "voice": "<string>",
  "externalVoice": {
    "elevenLabs": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "useSpeakerBoost": true,
      "style": 123,
      "similarityBoost": 123,
      "stability": 123,
      "pronunciationDictionaries": [
        {
          "dictionaryId": "<string>",
          "versionId": "<string>"
        }
      ]
    },
    "cartesia": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "emotion": "<string>"
    },
    "playHt": {
      "userId": "<string>",
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "quality": "<string>",
      "temperature": 123,
      "emotion": 123,
      "voiceGuidance": 123,
      "styleGuidance": 123,
      "textGuidance": 123,
      "voiceConditioningSeconds": 123
    },
    "lmnt": {
      "voiceId": "<string>",
      "model": "<string>",
      "speed": 123,
      "conversational": true
    }
  },
  "transcriptOptional": true,
  "errorCount": 0,
  "vadSettings": {
    "turnEndpointDelay": "<string>",
    "minimumTurnDuration": "<string>",
    "minimumInterruptionDuration": "<string>",
    "frameActivationThreshold": 123
  },
  "shortSummary": "<string>",
  "summary": "<string>",
  "experimentalSettings": "<any>",
  "metadata": {},
  "initialState": {}
}

Authorizations

X-API-Key
string
header
required

API key

Query Parameters

enableGreetingPrompt
boolean
default:true

Adds a prompt for a greeting if there is no initial message that the model would naturally respond to (a user message or tool result).

priorCallId
string

The UUID of a prior call. When specified, the new call will use the same properties as the prior call unless overridden in this request's body. The new call will also use the prior call's message history as its own initial_messages. (It is an error to also set initial_messages in the body.)

Body

application/json

A request to start a call.

systemPrompt
string

The system prompt provided to the model during generations.

temperature
number

The model temperature, between 0 and 1. Defaults to 0.

model
string

The model used for generations. Defaults to fixie-ai/ultravox.

voice
string

The ID (or name if unique) of the voice the agent should use for this call.

externalVoice
object

A voice not known to Ultravox Realtime that can nonetheless be used for this call. Your account must have an API key set for the provider of the voice. Either this or voice may be set, but not both.

languageHint
string

A BCP47 language code that may be used to guide speech recognition and synthesis.

initialMessages
object[]

The conversation history to start from for this call.

A message exchanged during a call.

joinTimeout
string

A timeout for joining the call. Defaults to 30 seconds.

maxDuration
string

The maximum duration of the call. Defaults to 1 hour.

timeExceededMessage
string

What the agent should say immediately before hanging up if the call's time limit is reached.

inactivityMessages
object[]

Messages spoken by the agent when the user is inactive for the specified duration. Durations are cumulative, so a message m > 1 with duration 30s will be spoken 30 seconds after message m-1.

A message the agent should say after some duration. The duration's meaning varies depending on the context.

selectedTools
object[]

The tools available to the agent for (the first stage of) this call.

A tool selected for a particular call. Exactly one of tool_id, tool_name, or temporary_tool should be set.

medium
object

The medium used for this call.

recordingEnabled
boolean

Whether the call should be recorded.

firstSpeaker
enum<string>

Who should talk first when the call starts. Typically set to FIRST_SPEAKER_USER for outgoing calls and left as the default (FIRST_SPEAKER_AGENT) otherwise. Deprecated. Prefer firstSpeakerSettings. If both are set, they must match.

Available options:
FIRST_SPEAKER_UNSPECIFIED,
FIRST_SPEAKER_AGENT,
FIRST_SPEAKER_USER
transcriptOptional
boolean

Indicates whether a transcript is optional for the call.

initialOutputMedium
enum<string>

The medium to use for the call initially. May be altered by the client later. Defaults to voice.

Available options:
MESSAGE_MEDIUM_UNSPECIFIED,
MESSAGE_MEDIUM_VOICE,
MESSAGE_MEDIUM_TEXT
vadSettings
object

VAD settings for the call.

firstSpeakerSettings
object

The settings for the initial message to get a conversation started. Defaults to agent: {} which means the agent will start the conversation with an (interruptible) greeting generated based on the system prompt and any initial messages. (If first_speaker is set and this is not, first_speaker will be used instead.)

experimentalSettings
object

Experimental settings for the call.

metadata
object

Optional metadata key-value pairs to associate with the call. All values must be strings. Keys may not start with "ultravox.", which is reserved for system-provided metadata.

initialState
object

The initial state of the call stage which is readable/writable by tools.

Response

201 - application/json
callId
string
required
clientVersion
string | null
required

The version of the client that joined this call.

created
string
required
joined
string | null
required
ended
string | null
required
endReason
required

The reason the call ended.

  • unjoined - Client never joined
  • hangup - Client hung up
  • agent_hangup - Agent hung up
  • timeout - Call timed out
  • connection_error - Connection error
  • system_error - System error
Available options:
unjoined,
hangup,
agent_hangup,
timeout,
connection_error,
system_error
firstSpeaker
enum<string>
required
deprecated

Who was supposed to talk first when the call started. Typically set to FIRST_SPEAKER_USER for outgoing calls and left as the default (FIRST_SPEAKER_AGENT) otherwise.

Available options:
FIRST_SPEAKER_AGENT,
FIRST_SPEAKER_USER
firstSpeakerSettings
object
required

Settings for the initial message to get the call started.

initialOutputMedium
enum<string>
required

The medium used initially by the agent. May later be changed by the client.

Available options:
MESSAGE_MEDIUM_VOICE,
MESSAGE_MEDIUM_TEXT
joinUrl
string | null
required
errorCount
integer
default:0
required

The number of errors in this call.

shortSummary
string | null
required

A short summary of the call.

summary
string | null
required

A summary of the call.

experimentalSettings
any
required

Experimental settings for the call.

metadata
object
required

Optional metadata key-value pairs to associate with the call. All values must be strings.

initialState
object
required

The initial state of the call which is readable/writable by tools.

inactivityMessages
object[]

Messages spoken by the agent when the user is inactive for the specified duration. Durations are cumulative, so a message m > 1 with duration 30s will be spoken 30 seconds after message m-1.

A message the agent should say after some duration. The duration's meaning varies depending on the context.

joinTimeout
string
default:30s
languageHint
string | null

BCP47 language code that may be used to guide speech recognition.

Maximum length: 16
maxDuration
string
default:3600s
medium
object

Details about a call's protocol. By default, calls occur over WebRTC using the Ultravox client SDK. Setting a different call medium will prepare the server for a call using a different protocol. At most one call medium may be set.

model
string
default:fixie-ai/ultravox
recordingEnabled
boolean
default:false
systemPrompt
string | null
temperature
number
default:0
Required range: 0 <= x <= 1
timeExceededMessage
string | null
voice
string | null
externalVoice
object

A voice not known to Ultravox Realtime that can nonetheless be used for a call. Such voices are significantly less validated than normal voices and you'll be responsible for your own TTS-related errors. Exactly one field must be set.

transcriptOptional
boolean
default:true
deprecated

Indicates whether a transcript is optional for the call.

vadSettings
object

VAD settings for the call.