1. Fetch Session Token
Call the token endpoint with your API key to get a session token and WebSocket URL.
// 1. Exchange the long-lived API key for a short-lived session token and the
// WebSocket URL to connect to.
const response = await fetch(`${API_URL}/api/v1/sdk/token`, {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${API_KEY}`,
    'Content-Type': 'application/json',
  },
});
// Surface HTTP failures (bad key, rate limit) explicitly — otherwise the
// json() call below fails with a confusing parse error on the error page.
if (!response.ok) {
  throw new Error(`Token request failed: ${response.status} ${response.statusText}`);
}
const { token, ws_url } = await response.json();
2. Connect WebSocket
Open the WebSocket connection and send the token as the first message. Handle both text (control) and binary (audio) messages.
// 2. Open the WebSocket, authenticate with the session token, and route
// incoming messages: text frames carry JSON control events, binary frames
// carry raw PCM audio from the agent.
const ws = new WebSocket(ws_url);
ws.binaryType = 'arraybuffer';

// Authenticate: the token must be the very first message on the socket.
ws.onopen = () => {
  ws.send(JSON.stringify({ token }));
};

// Control-event handlers, keyed by message type. Unknown types are ignored.
const controlHandlers = {
  // Session is ready — start sending mic audio.
  connected: () => startMicCapture(),
  // AI agent is listening and responding.
  agent_ready: () => {},
  // msg.reason: 'agent_disconnected' | 'session_closed' | 'server_shutdown'
  session_ended: () => cleanup(),
  // msg.code + msg.message
  error: (msg) => console.error(msg.code, msg.message),
};

ws.onmessage = (event) => {
  if (typeof event.data !== 'string') {
    // Binary = PCM audio from the AI agent
    playAudioFrame(event.data);
    return;
  }
  const msg = JSON.parse(event.data);
  controlHandlers[msg.type]?.(msg);
};
3. Microphone Capture
Use the Web Audio API to capture microphone input, downsample to 16kHz if needed, convert to Int16, and send 640-byte frames over the WebSocket.
const SAMPLE_RATE = 16_000;
const FRAME_SAMPLES = 320; // 20ms at 16kHz → 640 bytes as Int16

/**
 * 3. Capture microphone audio, downsample to 16 kHz if the browser ignored
 * the requested sample rate, convert to Int16 PCM, and stream fixed
 * 320-sample (640-byte) frames over the WebSocket.
 */
async function startMicCapture() {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  });
  const ctx = new AudioContext({ sampleRate: SAMPLE_RATE });
  // Resume if browser auto-suspends (autoplay policy)
  if (ctx.state === 'suspended') await ctx.resume();
  const source = ctx.createMediaStreamSource(stream);
  // NOTE: ScriptProcessorNode is deprecated; AudioWorklet is the modern
  // replacement, but this keeps the example dependency-free.
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let residual = new Float32Array(0);
  processor.onaudioprocess = (e) => {
    // The processor keeps firing after the socket closes; ws.send() on a
    // non-OPEN socket throws, so drop audio instead.
    if (ws.readyState !== WebSocket.OPEN) return;
    const input = e.inputBuffer.getChannelData(0);
    const nativeRate = ctx.sampleRate;
    // Downsample (nearest-neighbor) if browser ignored our requested sampleRate
    let samples;
    if (Math.abs(nativeRate - SAMPLE_RATE) > 1) {
      const ratio = nativeRate / SAMPLE_RATE;
      const len = Math.floor(input.length / ratio);
      samples = new Float32Array(len);
      for (let i = 0; i < len; i++) {
        samples[i] = input[Math.floor(i * ratio)];
      }
    } else {
      samples = new Float32Array(input); // copy — input buffer is reused
    }
    // Combine with leftover from previous callback
    const combined = new Float32Array(residual.length + samples.length);
    combined.set(residual);
    combined.set(samples, residual.length);
    // Emit complete 640-byte frames
    let offset = 0;
    while (offset + FRAME_SAMPLES <= combined.length) {
      const frame = combined.subarray(offset, offset + FRAME_SAMPLES);
      const int16 = float32ToInt16(frame);
      ws.send(int16.buffer.slice(0)); // independent buffer copy
      offset += FRAME_SAMPLES;
    }
    residual = combined.slice(offset);
  };
  source.connect(processor);
  processor.connect(ctx.destination); // required for ScriptProcessor to fire
}
/**
 * Convert Float32 PCM samples in [-1, 1] to signed 16-bit integers.
 * Negative values scale by 0x8000 and non-negative by 0x7fff so that
 * both -1.0 and +1.0 map to the full Int16 range without overflow.
 * Out-of-range inputs are clamped first.
 * @param {Float32Array} f32 - input samples
 * @returns {Int16Array} converted samples
 */
function float32ToInt16(f32) {
  const out = new Int16Array(f32.length);
  let idx = 0;
  for (const sample of f32) {
    const clamped = Math.min(1, Math.max(-1, sample));
    out[idx++] = clamped * (clamped < 0 ? 0x8000 : 0x7fff);
  }
  return out;
}
4. Audio Playback
Convert incoming PCM frames to Float32 and schedule them for gapless playback using `AudioBufferSourceNode`.
// 4. Playback: one shared output context and a rolling schedule pointer so
// consecutive PCM frames play back-to-back without gaps.
const playbackCtx = new AudioContext({ sampleRate: 16000 });
let nextStartTime = 0;

/**
 * Decode one Int16 PCM frame from the agent and schedule it for gapless
 * playback via an AudioBufferSourceNode.
 * @param {ArrayBuffer} pcmData - raw 16-bit mono PCM at 16 kHz
 */
function playAudioFrame(pcmData) {
  // A context created at page load may be stuck 'suspended' by the browser's
  // autoplay policy; resume lazily so early frames aren't silently dropped.
  if (playbackCtx.state === 'suspended') playbackCtx.resume();
  const int16 = new Int16Array(pcmData);
  const float32 = new Float32Array(int16.length);
  // Mirror of the capture-side conversion: negatives divide by 0x8000,
  // non-negatives by 0x7fff.
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }
  const buffer = playbackCtx.createBuffer(1, float32.length, 16000);
  buffer.getChannelData(0).set(float32);
  const source = playbackCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackCtx.destination);
  // Schedule at the end of the previously queued audio (or now, if behind)
  const now = playbackCtx.currentTime;
  if (nextStartTime < now) nextStartTime = now;
  source.start(nextStartTime);
  nextStartTime += buffer.duration;
}
To visualize agent audio, insert an `AnalyserNode` between the source and destination, then read frequency data with `getByteFrequencyData()`.

5. Complete Example
A minimal standalone HTML page that connects to the Voice API and enables bidirectional voice conversation.
<!DOCTYPE html>
<html>
<body>
<button id="start">Start</button>
<button id="stop" disabled>Stop</button>
<script>
const API_URL = 'https://your-api-url.com';
const API_KEY = 'your_api_key';
let ws, micStream, audioCtx, processor, playbackCtx, nextStart = 0;

document.getElementById('start').onclick = async () => {
  // 1. Get token
  const res = await fetch(`${API_URL}/api/v1/sdk/token`, {
    method: 'POST',
    headers: { 'Authorization': `Bearer ${API_KEY}` },
  });
  // Fail loudly on bad credentials / rate limits instead of trying to
  // destructure an error body.
  if (!res.ok) {
    console.error('Token request failed:', res.status, res.statusText);
    return;
  }
  const { token, ws_url } = await res.json();
  // 2. Connect WebSocket
  ws = new WebSocket(ws_url);
  ws.binaryType = 'arraybuffer';
  ws.onopen = () => ws.send(JSON.stringify({ token }));
  // Tear down if the server closes the socket from its side.
  ws.onclose = () => cleanup();
  ws.onmessage = (e) => {
    if (typeof e.data === 'string') {
      const msg = JSON.parse(e.data);
      if (msg.type === 'connected') startMic();
      if (msg.type === 'session_ended') cleanup();
    } else {
      playPCM(e.data);
    }
  };
  document.getElementById('start').disabled = true;
  document.getElementById('stop').disabled = false;
};

document.getElementById('stop').onclick = () => cleanup();

// 3. Capture mic audio at 16 kHz, convert to Int16 PCM, send 320-sample frames.
async function startMic() {
  micStream = await navigator.mediaDevices.getUserMedia({
    audio: { channelCount: 1, echoCancellation: true, noiseSuppression: true },
  });
  audioCtx = new AudioContext({ sampleRate: 16000 });
  if (audioCtx.state === 'suspended') await audioCtx.resume();
  const src = audioCtx.createMediaStreamSource(micStream);
  processor = audioCtx.createScriptProcessor(4096, 1, 1);
  let residual = new Float32Array(0);
  processor.onaudioprocess = (ev) => {
    // Sending on a closed/closing socket throws — drop audio instead.
    if (!ws || ws.readyState !== WebSocket.OPEN) return;
    const inp = new Float32Array(ev.inputBuffer.getChannelData(0));
    const combined = new Float32Array(residual.length + inp.length);
    combined.set(residual);
    combined.set(inp, residual.length);
    let off = 0;
    while (off + 320 <= combined.length) {
      const f = combined.subarray(off, off + 320);
      const i16 = new Int16Array(320);
      for (let i = 0; i < 320; i++) {
        const s = Math.max(-1, Math.min(1, f[i]));
        i16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
      }
      ws.send(i16.buffer.slice(0));
      off += 320;
    }
    residual = combined.slice(off);
  };
  src.connect(processor);
  processor.connect(audioCtx.destination);
}

// 4. Decode incoming Int16 PCM and schedule gapless playback.
function playPCM(buf) {
  if (!playbackCtx) playbackCtx = new AudioContext({ sampleRate: 16000 });
  const i16 = new Int16Array(buf);
  const f32 = new Float32Array(i16.length);
  for (let i = 0; i < i16.length; i++)
    f32[i] = i16[i] / (i16[i] < 0 ? 0x8000 : 0x7fff);
  const ab = playbackCtx.createBuffer(1, f32.length, 16000);
  ab.getChannelData(0).set(f32);
  const s = playbackCtx.createBufferSource();
  s.buffer = ab;
  s.connect(playbackCtx.destination);
  const now = playbackCtx.currentTime;
  if (nextStart < now) nextStart = now;
  s.start(nextStart);
  nextStart += ab.duration;
}

// Release the mic, audio graph, and socket; restore button state so a new
// session can be started cleanly.
function cleanup() {
  processor?.disconnect();
  processor = undefined;
  micStream?.getTracks().forEach(t => t.stop());
  micStream = undefined;
  audioCtx?.close();
  audioCtx = undefined;
  if (ws) {
    ws.onclose = null; // avoid re-entrant cleanup from our own close()
    ws.close();
    ws = undefined;
  }
  nextStart = 0; // restart playback scheduling fresh next session
  document.getElementById('start').disabled = false;
  document.getElementById('stop').disabled = true;
}
</script>
</body>
</html>