Voice API Integration Guide

1. Fetch Session Token

Call the token endpoint with your API key to get a session token and WebSocket URL.
// 1. Exchange the API key for a short-lived session token + WebSocket URL.
const response = await fetch(`${API_URL}/api/v1/sdk/token`, {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${API_KEY}`,
    'Content-Type': 'application/json',
  },
});

// Fail loudly on auth/config errors — otherwise a 401 surfaces much later
// as an `undefined` token at the WebSocket handshake.
if (!response.ok) {
  throw new Error(`Token request failed: ${response.status} ${response.statusText}`);
}

const { token, ws_url } = await response.json();

2. Connect WebSocket

Open the WebSocket connection and send the token as the first message. Handle both text (control) and binary (audio) messages.
const ws = new WebSocket(ws_url);
ws.binaryType = 'arraybuffer'; // binary frames arrive as ArrayBuffer, not Blob

ws.onopen = () => {
  // The token must be the first message — the server authenticates the
  // socket before it will accept any audio.
  ws.send(JSON.stringify({ token }));
};

ws.onmessage = (event) => {
  if (typeof event.data === 'string') {
    // Text frames are JSON control messages.
    const msg = JSON.parse(event.data);

    switch (msg.type) {
      case 'connected':
        // Session is ready — start sending mic audio
        startMicCapture();
        break;

      case 'agent_ready':
        // AI agent is listening and responding
        break;

      case 'session_ended':
        // msg.reason: 'agent_disconnected' | 'session_closed' | 'server_shutdown'
        cleanup();
        break;

      case 'error':
        // msg.code + msg.message
        console.error(msg.code, msg.message);
        break;
    }
  } else {
    // Binary = PCM audio from the AI agent
    playAudioFrame(event.data);
  }
};

// Without these, a dropped connection is silent: the mic keeps capturing
// and nothing tells you why audio stopped flowing.
ws.onerror = (event) => console.error('WebSocket error:', event);
ws.onclose = (event) => console.warn('WebSocket closed:', event.code, event.reason);

3. Microphone Capture

Use the Web Audio API to capture microphone input, downsample to 16kHz if needed, convert to Int16, and send 640-byte frames (320 samples × 2 bytes) over the WebSocket.
const SAMPLE_RATE = 16_000;
const FRAME_SAMPLES = 320; // 20ms at 16kHz — 640 bytes once converted to Int16

/**
 * Capture microphone input, normalize it to 16kHz mono Int16 PCM, and
 * stream it to the server as fixed 640-byte WebSocket frames.
 * Requires `ws` (the session WebSocket) to be in scope.
 */
async function startMicCapture() {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  });

  const ctx = new AudioContext({ sampleRate: SAMPLE_RATE });

  // Resume if browser auto-suspends (autoplay policy)
  if (ctx.state === 'suspended') await ctx.resume();

  const source = ctx.createMediaStreamSource(stream);
  // NOTE(review): ScriptProcessorNode is deprecated; an AudioWorkletNode is
  // preferred for production. Kept here for tutorial simplicity.
  const processor = ctx.createScriptProcessor(4096, 1, 1);
  let residual = new Float32Array(0);

  processor.onaudioprocess = (e) => {
    // Don't push audio at a socket that can't take it — send() throws while
    // the socket is still CONNECTING, and sending after close is wasted work.
    if (ws.readyState !== WebSocket.OPEN) return;

    const input = e.inputBuffer.getChannelData(0);
    const nativeRate = ctx.sampleRate;

    // Downsample if browser ignored our requested sampleRate
    let samples;
    if (Math.abs(nativeRate - SAMPLE_RATE) > 1) {
      // Nearest-neighbor decimation: adequate for speech, no filtering.
      const ratio = nativeRate / SAMPLE_RATE;
      const len = Math.floor(input.length / ratio);
      samples = new Float32Array(len);
      for (let i = 0; i < len; i++) {
        samples[i] = input[Math.floor(i * ratio)];
      }
    } else {
      samples = new Float32Array(input); // copy — input buffer is reused
    }

    // Combine with leftover from previous callback
    const combined = new Float32Array(residual.length + samples.length);
    combined.set(residual);
    combined.set(samples, residual.length);

    // Emit complete 640-byte frames
    let offset = 0;
    while (offset + FRAME_SAMPLES <= combined.length) {
      const frame = combined.subarray(offset, offset + FRAME_SAMPLES);
      const int16 = float32ToInt16(frame);
      // float32ToInt16 allocates a fresh Int16Array, so its buffer is
      // already independent — no defensive slice() copy needed.
      ws.send(int16.buffer);
      offset += FRAME_SAMPLES;
    }
    residual = combined.slice(offset);
  };

  source.connect(processor);
  processor.connect(ctx.destination); // required for ScriptProcessor to fire
}

/**
 * Convert Float32 audio samples in [-1, 1] to 16-bit signed PCM.
 * Negative samples scale by 0x8000 and non-negative ones by 0x7fff so
 * -1.0 maps to -32768 and +1.0 maps to +32767 without overflow.
 * Out-of-range input is clamped first.
 * @param {Float32Array} f32 - audio samples
 * @returns {Int16Array} PCM samples (freshly allocated)
 */
function float32ToInt16(f32) {
  const out = new Int16Array(f32.length);
  let idx = 0;
  for (const sample of f32) {
    const clamped = Math.min(1, Math.max(-1, sample));
    out[idx++] = clamped * (clamped < 0 ? 0x8000 : 0x7fff);
  }
  return out;
}

4. Audio Playback

Convert incoming PCM frames to Float32 and schedule them for gapless playback using AudioBufferSourceNode.
const playbackCtx = new AudioContext({ sampleRate: 16000 });
let nextStartTime = 0; // absolute context time where the next frame begins

/**
 * Decode a 16-bit mono 16kHz PCM frame from the agent and schedule it so
 * consecutive frames play back-to-back without gaps.
 * @param {ArrayBuffer} pcmData - Int16 PCM audio
 */
function playAudioFrame(pcmData) {
  // An AudioContext created outside a user gesture may start suspended
  // (autoplay policy); resume() is a no-op when already running.
  if (playbackCtx.state === 'suspended') playbackCtx.resume();

  const int16 = new Int16Array(pcmData);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    // Mirror the capture-side scaling: negatives by 0x8000, positives by 0x7fff.
    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
  }

  const buffer = playbackCtx.createBuffer(1, float32.length, 16000);
  buffer.getChannelData(0).set(float32);

  const source = playbackCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(playbackCtx.destination);

  // Schedule for gapless playback: never start in the past, otherwise
  // chain each frame onto the end of the previous one.
  const now = playbackCtx.currentTime;
  if (nextStartTime < now) nextStartTime = now;
  source.start(nextStartTime);
  nextStartTime += buffer.duration;
}
To visualize agent audio, insert an AnalyserNode between the source and destination, then read frequency data with getByteFrequencyData().

5. Complete Example

A minimal standalone HTML page that connects to the Voice API and enables bidirectional voice conversation.
<!DOCTYPE html>
<html>
<body>
  <button id="start">Start</button>
  <button id="stop" disabled>Stop</button>
  <script>
    // Replace with your deployment's base URL and API key before running.
    const API_URL = 'https://your-api-url.com';
    const API_KEY = 'your_api_key';

    // Shared session state: socket, mic capture pipeline, and playback scheduler.
    let ws, micStream, audioCtx, processor, playbackCtx, nextStart = 0;

    // Start: fetch a session token, open the WebSocket, and wire up handlers.
    document.getElementById('start').onclick = async () => {
      // 1. Get token
      const res = await fetch(`${API_URL}/api/v1/sdk/token`, {
        method: 'POST',
        headers: { 'Authorization': `Bearer ${API_KEY}` },
      });
      // Bail early on auth/config errors instead of handing an undefined
      // token to the WebSocket handshake.
      if (!res.ok) {
        console.error('Token request failed:', res.status);
        return;
      }
      const { token, ws_url } = await res.json();

      // 2. Connect WebSocket
      ws = new WebSocket(ws_url);
      ws.binaryType = 'arraybuffer';
      ws.onopen = () => ws.send(JSON.stringify({ token })); // token must be the first message

      ws.onmessage = (e) => {
        if (typeof e.data === 'string') {
          const msg = JSON.parse(e.data);
          if (msg.type === 'connected') startMic();
          if (msg.type === 'session_ended') cleanup();
        } else {
          playPCM(e.data);
        }
      };
      // Tear down if the connection drops unexpectedly. The button-state
      // guard prevents re-entrant cleanup when cleanup() itself closed the socket.
      ws.onclose = () => {
        if (!document.getElementById('stop').disabled) cleanup();
      };

      document.getElementById('start').disabled = true;
      document.getElementById('stop').disabled = false;
    };

    document.getElementById('stop').onclick = () => cleanup();

    // Capture mic audio at 16kHz mono and stream 640-byte Int16 PCM frames.
    // NOTE(review): unlike section 3, this example assumes the browser honors
    // the requested 16kHz sampleRate — add downsampling for production use.
    async function startMic() {
      micStream = await navigator.mediaDevices.getUserMedia({
        audio: { channelCount: 1, echoCancellation: true, noiseSuppression: true },
      });
      audioCtx = new AudioContext({ sampleRate: 16000 });
      if (audioCtx.state === 'suspended') await audioCtx.resume(); // autoplay policy

      const src = audioCtx.createMediaStreamSource(micStream);
      processor = audioCtx.createScriptProcessor(4096, 1, 1);
      let residual = new Float32Array(0);

      processor.onaudioprocess = (ev) => {
        // Skip frames once the socket is no longer open (e.g. mid-teardown);
        // send() throws on a CONNECTING socket.
        if (!ws || ws.readyState !== WebSocket.OPEN) return;

        const inp = new Float32Array(ev.inputBuffer.getChannelData(0)); // copy — buffer is reused
        const combined = new Float32Array(residual.length + inp.length);
        combined.set(residual);
        combined.set(inp, residual.length);

        // Emit complete 320-sample (640-byte) frames; carry the remainder over.
        let off = 0;
        while (off + 320 <= combined.length) {
          const f = combined.subarray(off, off + 320);
          const i16 = new Int16Array(320);
          for (let i = 0; i < 320; i++) {
            const s = Math.max(-1, Math.min(1, f[i]));
            i16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
          }
          // i16 was freshly allocated above, so its buffer is already independent.
          ws.send(i16.buffer);
          off += 320;
        }
        residual = combined.slice(off);
      };
      src.connect(processor);
      processor.connect(audioCtx.destination); // ScriptProcessor only fires when connected
    }

    // Decode an Int16 PCM frame from the agent and schedule it for gapless
    // playback on a lazily-created 16kHz context.
    function playPCM(buf) {
      if (!playbackCtx) playbackCtx = new AudioContext({ sampleRate: 16000 });

      const samples = new Int16Array(buf);
      // Mirror the capture-side scaling: negatives by 0x8000, positives by 0x7fff.
      const floats = Float32Array.from(samples, (v) => v / (v < 0 ? 0x8000 : 0x7fff));

      const audioBuf = playbackCtx.createBuffer(1, floats.length, 16000);
      audioBuf.getChannelData(0).set(floats);

      const node = playbackCtx.createBufferSource();
      node.buffer = audioBuf;
      node.connect(playbackCtx.destination);

      // Never schedule in the past; otherwise chain onto the previous frame.
      nextStart = Math.max(nextStart, playbackCtx.currentTime);
      node.start(nextStart);
      nextStart += audioBuf.duration;
    }

    // Tear down the session: stop capture, close audio and socket, reset UI.
    // Idempotent — guards every step so a second call (e.g. from a close
    // event racing a manual stop) is harmless.
    function cleanup() {
      processor?.disconnect();
      processor = undefined;
      micStream?.getTracks().forEach(t => t.stop());
      micStream = undefined;
      // close() on an already-closed AudioContext rejects — check state first.
      if (audioCtx && audioCtx.state !== 'closed') audioCtx.close();
      audioCtx = undefined;
      if (ws) {
        ws.onclose = null; // avoid re-entrant cleanup from the close event
        ws.close();
        ws = undefined;
      }
      nextStart = 0; // next session schedules playback from "now"
      document.getElementById('start').disabled = false;
      document.getElementById('stop').disabled = true;
    }
  </script>
</body>
</html>