Create text-to-speech audio stream
December 23, 2024
Please configure at least one www.hailuo.ai account for this endpoint, see Setup MiniMax for details.
This endpoint creates a near real-time audio stream from the provided text.
- Average time to response is 3 seconds.
- Up to 20 parallel jobs per account are supported.
- Currently, this service is offered free of charge.
Over 300 pre-built voices are available via GET audio/voices, supporting the following:
- Languages: English, Chinese (Mandarin), Spanish, French, Russian, Portuguese, Indonesian, German, Japanese, Korean, Italian, Cantonese
- Emotions: happy, sad, angry, fearful, disgusted, surprised, neutral
- Accents: US (General), English, Indian
- Ages: Young Adult, Adult, Middle-Aged, Senior
- Genders: Male, Female
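A minimal sketch of retrieving the voice list, assuming the voices endpoint follows the same URL pattern as create-stream; verify the actual URL and response shape against GET audio/voices.

JavaScript code
// Assumed URL, inferred from the create-stream endpoint below.
const voicesResponse = await fetch('https://api.useapi.net/v1/minimax/audio/voices', {
  headers: { 'Authorization': `Bearer ${api_token_value}` }
});
const voices = await voicesResponse.json();
console.log(voices); // browse the voice descriptors to pick a voice_id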
https://api.useapi.net/v1/minimax/audio/create-stream
Request Headers
Authorization: Bearer {API token}
Content-Type: application/json
# Alternatively you can use multipart/form-data
# Content-Type: multipart/form-data
API token is required, see Setup useapi.net for details.
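If you opt for the multipart/form-data alternative noted above, a minimal sketch using the browser's FormData; the field names are assumed to mirror the JSON body parameters described below.

JavaScript code
// Sketch only: field names assumed to match the JSON body parameters.
const form = new FormData();
form.append('text', 'your text goes here');
form.append('voice_id', 'desired voice');
const response = await fetch('https://api.useapi.net/v1/minimax/audio/create-stream', {
  method: 'POST',
  // Do not set Content-Type manually; fetch adds the multipart boundary itself.
  headers: { 'Authorization': `Bearer ${api_token_value}` },
  body: form
});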
Request Body
{
"account": "Optional MiniMax www.hailuo.ai API account",
"text": "Required text",
"voice_id": "Required voice id"
}
- account is optional when only one www.hailuo.ai account is configured. However, if you have multiple accounts configured, this parameter becomes required.
- text is required. Insert <#0.5#> to add a 0.5s pause between sentences; adjust the duration as needed. Maximum length: 3000 characters.
- voice_id is required. Use GET audio/voices to get the list of all available voices.
- language_boost is optional. Use tag_name from array voice_tag_language of GET audio/config. Default value Auto.
- emotion is optional. Use a value from array t2a_emotion of GET audio/config. Default value Auto.
- vol is optional. Default 1.
- speed is optional. Valid range: 0.5…2, default 1.
- pitch is optional. Valid range: -12…12, default 0.
- deepen_lighten is optional. Valid range: -100…100, default 0.
- stronger_softer is optional. Valid range: -100…100, default 0.
- nasal_crisp is optional. Valid range: -100…100, default 0.
- spacious_echo is optional. Supported values: true, false (default).
- lofi_telephone is optional. Supported values: true, false (default).
- robotic is optional. Supported values: true, false (default).
- auditorium_echo is optional. Supported values: true, false (default).
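For reference, a request body combining several of the optional parameters above might look like the sketch below. The voice_id and emotion values are placeholders; consult GET audio/voices and GET audio/config for valid values.

{
  "text": "Hello there. <#0.5#> Welcome back!",
  "voice_id": "desired voice",
  "language_boost": "Auto",
  "emotion": "happy",
  "speed": 1.1,
  "vol": 1,
  "spacious_echo": true
}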
Responses
- 200 OK

Response headers:
Content-Type: text/event-stream
Transfer-Encoding: chunked

Example event stream:

id:1
event:audio_chunk
data:{"data":{"audio":"<raw audio stream>","status":1,"ced":""},"input_sensitive":false,"trace_id":"d72c1c7952db30f1d4a023bede7d15f6","base_resp":{"status_code":0,"status_msg":""}}

id:2
event:audio_chunk
data:{"data":{"audio":"<raw audio stream>","status":2,"ced":""},"extra_info":{"audio_length":0,"audio_sample_rate":32000,"audio_size":15597,"bitrate":128000,"word_count":5,"invisible_character_ratio":0,"usage_characters":5,"audio_format":"mp3","audio_channel":1},"input_sensitive":false,"trace_id":"d72c1c7952db30f1d4a023bede7d15f6","base_resp":{"status_code":0,"status_msg":"success"}}
- 400 Bad Request

{ "error": "<Error message>" }
- 401 Unauthorized

{ "error": "Unauthorized" }
- 429 Too Many Requests

Wait at least 5 to 10 seconds and retry. If you receive this response frequently, consider adding more accounts to spread the load.
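A minimal retry sketch for the busy response above; the 10-second delay and five-attempt cap are arbitrary choices, not API requirements.

JavaScript code
// Retry create-stream while the API reports 429 Too Many Requests.
async function createStreamWithRetry(data, maxAttempts = 5) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const response = await fetch('https://api.useapi.net/v1/minimax/audio/create-stream', data);
    if (response.status !== 429)
      return response; // success or a non-retryable error
    // Busy: wait before trying again (at least 5..10 seconds)
    await new Promise(resolve => setTimeout(resolve, 10_000));
  }
  throw new Error(`Still busy after ${maxAttempts} attempts`);
}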
Examples
The code below is what this page is using. Feel free to use your preferred LLM to adapt it to your desired language/environment.
JavaScript code
class DynamicAudioPlayer {
  constructor() {
    this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
    this.audioQueue = [];
    this.isPlaying = false;
    this.currentSource = null;
    this.onAudioFinishedCallback = null;
  }

  // Queue a hex-encoded audio chunk and start playback if idle.
  async loadAudioData(hexChunk, finishCallback) {
    try {
      // The stream delivers hex-encoded MP3 data; prepend the frame header
      // bytes 'fffbe8c4' so decodeAudioData can parse the chunk.
      const byteArray = this.hexStringToByteArray('fffbe8c4' + hexChunk);
      this.audioQueue.push(byteArray);
      this.onAudioFinishedCallback = finishCallback;
      if (!this.isPlaying) {
        this.isPlaying = true;
        await this.playNextChunk();
      }
    } catch (e) {
      console.error("Error decoding audio data:", e);
    }
  }

  async playNextChunk() {
    if (this.audioQueue.length > 0) {
      const byteArray = this.audioQueue.shift();
      const audioBuffer = await this.audioContext.decodeAudioData(byteArray.buffer);
      this.scheduleAudioBuffer(audioBuffer);
    } else {
      this.isPlaying = false;
      if (this.onAudioFinishedCallback) {
        this.onAudioFinishedCallback();
        this.onAudioFinishedCallback = null;
      }
    }
  }

  scheduleAudioBuffer(audioBuffer) {
    const source = this.audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(this.audioContext.destination);
    source.start();
    this.currentSource = source;
    // Chain playback: when this chunk ends, play the next queued chunk.
    source.onended = () => {
      this.playNextChunk();
    };
  }

  stop() {
    this.audioQueue = [];
    if (this.currentSource) {
      this.currentSource.stop();
      this.currentSource = null;
    }
    this.isPlaying = false;
  }

  // Convert a hex string like 'fffb...' into raw bytes.
  hexStringToByteArray(hexString) {
    const bytes = new Uint8Array(hexString.length / 2);
    for (let i = 0; i < hexString.length; i += 2) {
      bytes[i / 2] = parseInt(hexString.substring(i, i + 2), 16);
    }
    return bytes;
  }
}
let player = null;

async function streamAudio(data, callback, finishCallback) {
  // Reuse a single player; stop any in-progress playback first.
  if (player)
    player.stop();
  else
    player = new DynamicAudioPlayer();
  const response = await fetch('https://api.useapi.net/v1/minimax/audio/create-stream', data);
  if (!response.ok) {
    const jsonData = await response.json();
    if (callback)
      callback(response.status, jsonData);
    throw new Error(`Failed with ${response.status}`);
  }
  const reader = response.body?.getReader();
  const decoder = new TextDecoder('utf-8');
  let buffer = '';
  const parseData = async (data) => {
    try {
      const jsonData = JSON.parse(data.startsWith('data:') ? data.slice(5) : data);
      if (callback)
        callback(200, jsonData);
      // status 1 marks an in-progress chunk carrying playable audio.
      if (jsonData.data?.status == 1 && jsonData.data.audio) {
        player.loadAudioData(jsonData.data?.audio, finishCallback);
      }
    } catch (error) {
      console.error(`Failed to parse JSON: ${error}`, data);
    }
  };
  // Read the event stream line by line; a partial trailing line stays in the buffer.
  while (reader) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    let lines = buffer.split('\n');
    buffer = lines.pop() ?? '';
    for (const line of lines)
      if (line.startsWith('data:'))
        parseData(line);
  }
}
// Here's how to call the functions above
const data = {
  method: 'POST',
  headers: {
    'Authorization': `Bearer ${api_token_value}`,
    'Content-Type': 'application/json'
  },
  body: JSON.stringify({
    text: 'your text goes here',
    voice_id: 'desired voice'
  })
};

await streamAudio(
  data,
  // optional progress callback
  (status, json) => {
    console.log(`callback`, { status, json });
  },
  // optional playback completed callback
  () => {
    console.log(`playback completed`);
  });
Model
The model below describes the JSON payload that follows each data: prefix. The value of data.status can be either 1 (in progress) or 2 (completed). Once generation is completed, you can locate the generated mp3 by matching the trace_id to the end of audio_list[].audio_url returned by the GET audio endpoint.
{ // TypeScript, all fields are optional
  data: {
    audio: string
    status: number
    ced: string
  }
  extra_info?: {
    audio_length: number
    audio_sample_rate: number
    audio_size: number
    bitrate: number
    word_count: number
    invisible_character_ratio: number
    usage_characters: number
    audio_format: string
    audio_channel: number
  }
  input_sensitive: boolean
  trace_id: string
  base_resp: {
    status_code: number
    status_msg: string
  }
}
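To locate the finished file as described above, a minimal sketch assuming GET audio returns an audio_list array whose items carry an audio_url:

JavaScript code
// Hypothetical helper, not part of this API: match a completed stream's
// trace_id against the audio_url values returned by GET audio.
function findGeneratedAudio(audioList, traceId) {
  // Per the note above, the trace_id appears at the end of the matching audio_url.
  return audioList.find(item => item.audio_url?.includes(traceId));
}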