Create text-to-speech audio stream over the WebSocket

Table of contents

January 7, 2025

  1. Query Parameters
  2. Responses
  3. Examples
  4. Model
  5. Try It

Use POST audio/create-stream to obtain token and payload.
To see the code provided below in action, use Try It.

wss://api.useapi.net/v1/minimax/audio/wss?token=token

Query Parameters
Responses
Examples
// Shared playback/connection state, reused across streamAudio() calls.
var player = null; // DynamicAudioPlayer instance, created lazily on first call
var ws = null; // active WebSocket connection, if any

// useapi.net minimax endpoints: create-stream (token/payload), WSS stream, and stored audio listing.
const urlCreateStream = 'https://api.useapi.net/v1/minimax/audio/create-stream';
const wssCreateStream = 'wss://api.useapi.net/v1/minimax/audio/wss';
const urlAudio = 'https://api.useapi.net/v1/minimax/audio';

class DynamicAudioPlayer {
    /**
     * Plays a sequence of MP3 audio chunks (delivered as hex strings)
     * through the Web Audio API, decoding and scheduling them one at a time.
     */
    constructor() {
        // webkitAudioContext covers older Safari releases.
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        this.audioQueue = [];                // FIFO of Uint8Array MP3 chunks awaiting playback
        this.isPlaying = false;              // true while a chunk is being decoded or played
        this.currentSource = null;           // active AudioBufferSourceNode, if any
        this.onAudioFinishedCallback = null; // fired once the queue fully drains
    }

    /**
     * Queues one audio chunk and starts playback if the player is idle.
     *
     * NOTE: despite its name, `base64Chunk` is a HEX-encoded string — it is
     * decoded by hexStringToByteArray below. The parameter name is kept for
     * backward compatibility with existing callers.
     *
     * @param {string} base64Chunk hex-encoded MP3 payload
     * @param {Function} finishCallback invoked when all queued audio has played
     */
    async loadAudioData(base64Chunk, finishCallback) {
        try {
            // 'fffbe8c4' is an MP3 frame header prepended so decodeAudioData
            // recognizes the chunk as a valid MP3 stream.
            const byteArray = this.hexStringToByteArray('fffbe8c4' + base64Chunk);
            this.audioQueue.push(byteArray);
            this.onAudioFinishedCallback = finishCallback;

            if (!this.isPlaying) {
                this.isPlaying = true;
                await this.playNextChunk();
            }
        } catch (e) {
            console.error("Error decoding audio data:", e);
            // Reset so a subsequent chunk can restart playback after a failure
            // (otherwise isPlaying could remain stuck at true).
            this.isPlaying = false;
        }
    }

    /**
     * Decodes and schedules the next queued chunk; when the queue is empty,
     * marks playback finished and fires the completion callback once.
     */
    async playNextChunk() {
        if (this.audioQueue.length > 0) {
            const byteArray = this.audioQueue.shift();
            try {
                const audioBuffer = await this.audioContext.decodeAudioData(byteArray.buffer);
                this.scheduleAudioBuffer(audioBuffer);
            } catch (e) {
                // This method is also invoked from the onended handler with no
                // surrounding try/catch; handle decode failures here to avoid
                // an unhandled promise rejection, and skip to the next chunk.
                console.error("Error decoding audio chunk:", e);
                await this.playNextChunk();
            }
        } else {
            this.isPlaying = false;
            if (this.onAudioFinishedCallback) {
                this.onAudioFinishedCallback();
                this.onAudioFinishedCallback = null;
            }
        }
    }

    // Starts playback of a decoded buffer and chains to the next queued
    // chunk when this one ends.
    scheduleAudioBuffer(audioBuffer) {
        const source = this.audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(this.audioContext.destination);
        source.start();
        this.currentSource = source;

        source.onended = () => {
            this.playNextChunk();
        };
    }

    /**
     * Stops playback immediately and discards all queued audio.
     * Detaches onended before stopping so a manual stop does not chain into
     * playNextChunk() and spuriously fire the finish callback.
     */
    stop() {
        this.audioQueue = [];
        this.onAudioFinishedCallback = null;
        if (this.currentSource) {
            this.currentSource.onended = null; // prevent the chained playNextChunk
            this.currentSource.stop();
            this.currentSource = null;
        }
        this.isPlaying = false;
    }

    // Converts a hex string (two characters per byte) into a Uint8Array.
    hexStringToByteArray(hexString) {
        const bytes = new Uint8Array(hexString.length / 2);
        for (let i = 0; i < hexString.length; i += 2) {
            bytes[i / 2] = parseInt(hexString.substring(i, i + 2), 16);
        }
        return bytes;
    }
}

/**
 * Requests a TTS WebSocket token/payload, opens the WebSocket, plays the
 * streamed audio chunks via the shared `player`, and — once generation
 * completes — resolves the generated mp3 URL from the GET audio endpoint.
 *
 * @param {Object} data fetch options (method/headers/body) for the
 *   POST audio/create-stream request; `body` must be a JSON string.
 * @param {Function} [callback] optional progress callback; invoked with a
 *   single object of the shape { status?, json?, text? }.
 * @param {Function} [finishCallback] optional callback fired once audio
 *   playback has fully drained.
 */
async function streamAudio(data, callback, finishCallback) {
    const parseData = async (wssData) => {
        try {
            const json = JSON.parse(wssData);

            let audio;

            if (json.data?.audio) {
                audio = json.data.audio;
                // Replace the raw hex audio so progress logs stay readable.
                json.data.audio = `…omitted ${audio.length} bytes of raw audio…`;
            }

            // status 1: generation in progress — queue the chunk for playback.
            if (json.data?.status === 1 && audio)
                player.loadAudioData(audio, finishCallback);

            callback?.({ status: 200, json });

            // status 2: generation completed — look up the stored mp3.
            if (json.data?.status === 2 && json.trace_id) {
                const { trace_id } = json;
                const { headers, body } = data;
                const { account } = JSON.parse(body);

                callback?.({ text: `⌛ GET ${urlAudio} ⁝ looking for generated mp3 using matched trace_id (${trace_id})…` });

                const response = await fetch(`${urlAudio}${account ? '/?account=' + account : ''}`, { headers });

                const text = await response.text();

                if (!response.ok) {
                    callback?.({ status: response.status, text });
                    return;
                }

                const { audio_list } = JSON.parse(text);

                // The generated file's title ends with `_<trace_id>`.
                const item = audio_list?.find(d => d.audio_title?.endsWith(`_${trace_id}`));

                const { audio_url } = item ?? {};

                callback?.({ status: response.status, json: item, text: '👉🏻 ' + audio_url });
            }
        } catch (error) {
            console.error(`Failed to parse JSON: ${error}`, wssData);
        }
    };

    // Reuse the shared player across calls; stop any in-flight playback.
    if (player)
        player.stop();
    else
        player = new DynamicAudioPlayer();

    // Drop a previous connection before opening a new one.
    if (ws) {
        ws.close();
        ws = null;
    }

    callback?.({ text: `⏳ Requesting WebSocket token and payload from ${urlCreateStream}…` });

    const response = await fetch(urlCreateStream, data);

    const text = await response.text();

    callback?.({ status: response.status, text });

    if (!response.ok)
        return;

    const { token, payload } = JSON.parse(text);

    callback?.({ text: `⌛ Establishing WebSocket connection to ${wssCreateStream}…` });

    ws = new WebSocket(`${wssCreateStream}/?token=${token}`);

    ws.addEventListener('open', () => {
        callback?.({ text: `🚀 Sending payload over WebSocket connection` });
        ws.send(JSON.stringify({ payload }));
    });

    ws.addEventListener('message', async event => {
        await parseData(event.data);
    });

    ws.addEventListener('error', event => {
        const text = `🛑 WebSocket error: ${JSON.stringify(event)}`;
        console.error(text);
        callback?.({ text });
    });

    ws.addEventListener('close', event => {
        console.log('WebSocket close', event);
    });
}

// Here's how you call above functions

const data = {
    method: 'POST',
    headers: {
        // api_token_value is a placeholder — substitute your useapi.net API token.
        'Authorization': `Bearer ${api_token_value}`,
        'Content-Type': 'application/json'
    },
    body: JSON.stringify({
        text: 'your text goes here',
        voice_id: 'desired voice'
    })
};

await streamAudio(
    data,
    // optional progress callback — streamAudio invokes it with a SINGLE
    // { status, json, text } object, so destructure it here (the previous
    // (status, json, text) signature left json/text always undefined)
    ({ status, json, text }) => {
        console.log(`callback`, { status, json, text });
    },
    // optional playback completed callback
    () => {
        console.log(`playback completed`);
    }
);
Model

The model below represents the WebSocket message payload object.
The value of data.status can be either 1 (progress) or 2 (completed). Once generation is completed you can locate the generated mp3 by matching the trace_id to the end of audio_list[].audio_title returned by the GET audio endpoint.

{ // TypeScript, all fields are optional
  data: {
    audio: string
    status: number
    ced: string
  }
  extra_info?: {
    audio_length: number
    audio_sample_rate: number
    audio_size: number
    bitrate: number
    word_count: number
    invisible_character_ratio: number
    usage_characters: number
    audio_format: string
    audio_channel: number
  }
  input_sensitive: boolean
  trace_id: string
  base_resp: {
    status_code: number
    status_msg: string
  }
}
Try It

See the above code in action at Try It POST audio/create-stream.