Create text-to-speech audio stream

Table of contents

December 23, 2024

  1. Request Headers
  2. Request Body
  3. Responses
  4. Examples
  5. Model
  6. Try It

Please configure at least one www.hailuo.ai account for this endpoint, see Setup MiniMax for details.

This endpoint creates a near real-time audio stream from the provided text.

  • Average time to response is 3 seconds.
  • Up to 20 parallel jobs per account are supported.
  • Currently, this service is offered free of charge.

Over 300 pre-built voices are available via GET audio/voices, supporting the following:

  • Languages: English, Chinese (Mandarin), Spanish, French, Russian, Portuguese, Indonesian, German, Japanese, Korean, Italian, Cantonese
  • Emotions: happy, sad, angry, fearful, disgusted, surprised, neutral
  • Accents: US (General), English, Indian
  • Ages: Young Adult, Adult, Middle-Aged, Senior
  • Genders: Male, Female

https://api.useapi.net/v1/minimax/audio/create-stream

Request Headers
Authorization: Bearer {API token}
Content-Type: application/json
# Alternatively you can use multipart/form-data
# Content-Type: multipart/form-data
Request Body
{
    "account": "Optional MiniMax www.hailuo.ai API account",
    "text": "Required text",
    "voice_id": "Required voice id"
}
  • account is optional when only one www.hailuo.ai account is configured. However, if you have multiple accounts configured, this parameter becomes required.

  • text is required. Insert <#0.5#> to add a 0.5s pause between sentences. Adjust the duration as needed.
    Maximum length: 3000 characters.

  • voice_id is required. Use GET audio/voices to get a list of all available voices.

  • language_boost is optional. Use tag_name from array voice_tag_language of GET audio/config.
    Default value Auto.

  • emotion is optional. Use value from array t2a_emotion of GET audio/config.
    Default value Auto.

  • vol is optional.
    Default 1.

  • speed is optional.
    Valid range: 0.5…2, default 1.

  • pitch is optional.
    Valid range: -12…12, default 0.

  • deepen_lighten is optional.
    Valid range: -100…100, default 0.

  • stronger_softer is optional.
    Valid range: -100…100, default 0.

  • nasal_crisp is optional.
    Valid range: -100…100, default 0.

  • spacious_echo is optional.
    Supported values: true, false (default).

  • lofi_telephone is optional.
    Supported values: true, false (default).

  • robotic is optional.
    Supported values: true, false (default).

  • auditorium_echo is optional.
    Supported values: true, false (default).

Responses
  • 200 OK

    Response headers:

    Content-Type: text/event-stream
    Transfer-Encoding: chunked

    id:1
    event:audio_chunk
    data:{"data":{"audio":"<raw audio stream>","status":1,"ced":""},"input_sensitive":false,"trace_id":"d72c1c7952db30f1d4a023bede7d15f6","base_resp":{"status_code":0,"status_msg":""}}
    
    id:2
    event:audio_chunk
    data:{"data":{"audio":"<raw audio stream>","status":2,"ced":""},"extra_info":{"audio_length":0,"audio_sample_rate":32000,"audio_size":15597,"bitrate":128000,"word_count":5,"invisible_character_ratio":0,"usage_characters":5,"audio_format":"mp3","audio_channel":1},"input_sensitive":false,"trace_id":"d72c1c7952db30f1d4a023bede7d15f6","base_resp":{"status_code":0,"status_msg":"success"}}
    
  • 400 Bad Request

    {
      "error": "<Error message>"
    }
    
  • 401 Unauthorized

    {
      "error": "Unauthorized"
    }
    
  • 429 Too Many Requests

    Wait at least 5 to 10 seconds, then retry; repeat until the request succeeds. If you receive this message frequently, you may want to consider adding more accounts to spread the load.

Examples

The code below is what this page is using. Feel free to use your preferred LLM to adapt it to your desired language/environment.

JavaScript code
/**
 * Plays a sequence of audio chunks back-to-back through the Web Audio API.
 *
 * Chunks are queued as they arrive and decoded/played one at a time; when a
 * chunk finishes, the next queued chunk is started. Once the queue is empty
 * the player goes idle and the most recently registered finish callback is
 * invoked once.
 */
class DynamicAudioPlayer {
    constructor() {
        // Fall back to the prefixed constructor when the standard one is missing.
        this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        this.audioQueue = [];
        this.isPlaying = false;
        this.currentSource = null;
        this.onAudioFinishedCallback = null;
    }

    /**
     * Queues one audio chunk and starts playback if the player is idle.
     *
     * @param {string} base64Chunk - Chunk payload. Despite the parameter name,
     *   it is decoded as a hex string (two characters per byte).
     * @param {Function} [finishCallback] - Called once when the queue drains.
     *   Replaces any previously registered callback.
     * @returns {Promise<void>} Settles once the chunk is queued and, if the
     *   player was idle, the first playable chunk has been scheduled.
     */
    async loadAudioData(base64Chunk, finishCallback) {
        try {
            // Every chunk is prefixed with the bytes ff fb e8 c4 before decoding
            // (presumably an MP3 frame header so each chunk decodes on its own — TODO confirm).
            const byteArray = this.hexStringToByteArray(`fffbe8c4${base64Chunk}`);
            this.audioQueue.push(byteArray);
            this.onAudioFinishedCallback = finishCallback;

            if (!this.isPlaying) {
                this.isPlaying = true;
                await this.playNextChunk();
            }
        } catch (e) {
            console.error("Error decoding audio data:", e);
        }
    }

    /**
     * Decodes and starts the next playable chunk from the queue.
     *
     * A chunk that fails to decode is logged and skipped rather than aborting
     * playback; otherwise `isPlaying` would stay `true` forever and every later
     * chunk would be queued but never played. When no chunk is left, the player
     * is marked idle and the finish callback (if any) is invoked and cleared.
     *
     * @returns {Promise<void>}
     */
    async playNextChunk() {
        while (this.audioQueue.length > 0) {
            const byteArray = this.audioQueue.shift();
            let audioBuffer;
            try {
                audioBuffer = await this.audioContext.decodeAudioData(byteArray.buffer);
            } catch (e) {
                console.error("Error decoding audio data:", e);
                continue;
            }
            this.scheduleAudioBuffer(audioBuffer);
            return;
        }

        this.isPlaying = false;
        if (this.onAudioFinishedCallback) {
            this.onAudioFinishedCallback();
            this.onAudioFinishedCallback = null;
        }
    }

    /**
     * Starts playing a decoded buffer immediately and chains the next chunk
     * to its `onended` handler.
     *
     * @param {AudioBuffer} audioBuffer - Decoded audio to play.
     */
    scheduleAudioBuffer(audioBuffer) {
        const source = this.audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(this.audioContext.destination);
        source.start();
        this.currentSource = source;

        source.onended = () => {
            // Decode failures are handled inside playNextChunk, so the promise
            // is intentionally not awaited here.
            void this.playNextChunk();
        };
    }

    /**
     * Drops all queued chunks, stops the current source (if any) and marks
     * the player idle.
     */
    stop() {
        this.audioQueue = [];
        if (this.currentSource) {
            this.currentSource.stop();
            this.currentSource = null;
        }
        this.isPlaying = false;
    }

    /**
     * Converts a hex string into bytes, two characters per byte.
     *
     * A trailing unpaired character is ignored, and a pair that is not valid
     * hex is stored as 0.
     *
     * @param {string} hexString - Hex-encoded data.
     * @returns {Uint8Array} Decoded bytes.
     */
    hexStringToByteArray(hexString) {
        const bytes = new Uint8Array(Math.floor(hexString.length / 2));
        for (let i = 0; i < bytes.length; i += 1) {
            bytes[i] = parseInt(hexString.substring(i * 2, i * 2 + 2), 16);
        }
        return bytes;
    }
}

// Single shared player: starting a new stream stops whatever is currently playing.
var player = null;

/**
 * Requests a text-to-speech stream and plays the audio chunks as they arrive.
 *
 * The response is read as a line-oriented event stream; every line starting
 * with `data:` is parsed as JSON, reported through `callback`, and, when it
 * is a progress event (`data.status === 1`) carrying audio, handed to the
 * shared player.
 *
 * @param {object} data - Options passed as-is to `fetch` (method, headers, body).
 * @param {Function} [callback] - Called as `callback(status, json)`: with 200
 *   for every parsed event, or with the HTTP status and error body on failure.
 * @param {Function} [finishCallback] - Passed to the player with every audio
 *   chunk; the player invokes it when its queue drains.
 * @returns {Promise<void>} Resolves when the response stream has been fully read.
 * @throws {Error} `Failed with <status>` when the response is not OK.
 */
async function streamAudio(data, callback, finishCallback) {
    if (player) {
        player.stop();
    } else {
        player = new DynamicAudioPlayer();
    }

    const response = await fetch('https://api.useapi.net/v1/minimax/audio/create-stream', data);

    if (!response.ok) {
        let errorBody;
        try {
            errorBody = await response.json();
        } catch {
            // The error body may not be JSON. Fall back to the documented
            // { error } shape so the callback still fires and the status-bearing
            // error below is the one that propagates.
            errorBody = { error: response.statusText || `HTTP ${response.status}` };
        }
        if (callback) {
            callback(response.status, errorBody);
        }
        throw new Error(`Failed with ${response.status}`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
        return;
    }

    const decoder = new TextDecoder('utf-8');
    let buffer = '';

    // Parses one `data:` line, reports it, and forwards progress audio to the player.
    const handleEvent = (line) => {
        try {
            const jsonData = JSON.parse(line.startsWith('data:') ? line.slice(5) : line);

            if (callback) {
                callback(200, jsonData);
            }

            if (jsonData.data?.status === 1 && jsonData.data.audio) {
                // Not awaited on purpose: reading the stream must not wait for decoding.
                player.loadAudioData(jsonData.data.audio, finishCallback);
            }
        } catch (error) {
            console.error(`Failed to parse JSON: ${error}`, line);
        }
    };

    const dispatchLines = (lines) => {
        for (const line of lines) {
            if (line.startsWith('data:')) {
                handleEvent(line);
            }
        }
    };

    for (;;) {
        const { done, value } = await reader.read();
        if (done) {
            break;
        }

        buffer += decoder.decode(value, { stream: true });

        // The last element may be an incomplete line; keep it for the next read.
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';
        dispatchLines(lines);
    }

    // Flush the decoder and process a final line that was not newline-terminated;
    // without this the last event of such a stream would be dropped.
    buffer += decoder.decode();
    dispatchLines([buffer]);
}

// Here's how you call above functions

// Headers for the request; api_token_value must hold your API token.
const requestHeaders = {
    'Authorization': `Bearer ${api_token_value}`,
    'Content-Type': 'application/json'
};

// JSON payload: the text to speak and the voice to use.
const requestBody = {
    text: 'your text goes here',
    voice_id: 'desired voice'
};

// Options object that streamAudio forwards to fetch().
const data = {
    method: 'POST',
    headers: requestHeaders,
    body: JSON.stringify(requestBody)
};

// Optional progress callback: receives the HTTP status and the parsed event.
const onProgress = (status, json) => {
    console.log(`callback`, { status, json });
};

// Optional callback fired once playback has completed.
const onPlaybackCompleted = () => {
    console.log(`playback completed`);
};

await streamAudio(data, onProgress, onPlaybackCompleted);

Model

The below model is what follows after data:.
The value of data.status can be either 1 (progress) or 2 (completed). Once generation is completed you can locate the generated mp3 by matching the trace_id to the end of audio_list[].audio_url returned by the GET audio endpoint.

{ // TypeScript, all fields are optional
  data: {
    audio: string
    status: number
    ced: string
  }
  extra_info?: {
    audio_length: number
    audio_sample_rate: number
    audio_size: number
    bitrate: number
    word_count: number
    invisible_character_ratio: number
    usage_characters: number
    audio_format: string
    audio_channel: number
  }
  input_sensitive: boolean
  trace_id: string
  base_resp: {
    status_code: number
    status_msg: string
  }
}
Try It