Processing Audio from the OpenAI Text-to-Speech API

Hey! I am using the OpenAI text-to-speech API to convert text read from the serial input into audio. The request is sent to the API and the audio stream data is received, but while processing the data I get a "Response processing timed out." error. I initially had the timeout at 30 seconds and the audio buffer size at 1046 bytes; now, with a 3-minute timeout and a 4096-byte buffer, I’m still getting the same error. I am confused as to why I keep getting the same error even with that much processing time.

For your information, I’m using the ESP-WROOM-32 module and OpenAI’s text-to-speech API.
The goal is to process the audio file and output it through the Bluetooth device that I have.

#include <WiFi.h>
#include <WiFiClientSecure.h>
#include <HTTPClient.h>
#include <ArduinoJson.h>
#include <BluetoothA2DPSink.h>

// Replace with your network credentials
const char* ssid = "ssid";
const char* password = "password";
const char* bluetoothDeviceName = "bluetooth_speaker"; 

// Replace with your OpenAI API key
const char* serverName = "https://api.openai.com/v1/audio/speech";
const char* apiKey = "API KEY";

// Bluetooth settings
BluetoothA2DPSink a2dp_sink;
WiFiClientSecure client;
HTTPClient http;

const char* root_ca = \
"-----BEGIN CERTIFICATE-----\n" \
"MIICCTCCAY6gAwIBAgINAgPlwGjvYxqccpBQUjAKBggqhkjOPQQDAzBHMQswCQYD\n" \
"VQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2VzIExMQzEUMBIG\n" \
"A1UEAxMLR1RTIFJvb3QgUjQwHhcNMTYwNjIyMDAwMDAwWhcNMzYwNjIyMDAwMDAw\n" \
"WjBHMQswCQYDVQQGEwJVUzEiMCAGA1UEChMZR29vZ2xlIFRydXN0IFNlcnZpY2Vz\n" \
"IExMQzEUMBIGA1UEAxMLR1RTIFJvb3QgUjQwdjAQBgcqhkjOPQIBBgUrgQQAIgNi\n" \
"AATzdHOnaItgrkO4NcWBMHtLSZ37wWHO5t5GvWvVYRg1rkDdc/eJkTBa6zzuhXyi\n" \
"QHY7qca4R9gq55KRanPpsXI5nymfopjTX15YhmUPoYRlBtHci8nHc8iMai/lxKvR\n" \
"HYqjQjBAMA4GA1UdDwEB/wQEAwIBhjAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQW\n" \
"BBSATNbrdP9JNqPV2Py1PsVq8JQdjDAKBggqhkjOPQQDAwNpADBmAjEA6ED/g94D\n" \
"9J+uHXqnLrmvT/aDHQ4thQEd0dlq7A/Cr8deVl5c1RxYIigL9zC2L7F8AjEA8GE8\n" \
"p/SgguMh1YQdc4acLa/KNJvxn7kjNuK8YAOdgLOaVsjh4rsUecrNIdSUtUlD\n" \
"-----END CERTIFICATE-----\n" \
"";


std::vector<uint8_t> audioData;
String audioUrl;

bool getTTS(String text);
void streamAudio();
void audio_data_callback(const uint8_t *data, uint32_t length);


bool getTTS(String text) {
    if (WiFi.status() == WL_CONNECTED) {
        Serial.println("WiFi is connected. Sending request to OpenAI TTS...");

        http.begin(client, serverName);
        http.addHeader("Content-Type", "application/json");
        http.addHeader("Authorization", "Bearer " + String(apiKey));

        String jsonRequest = "{\"model\":\"tts-1\",\"input\":\"" + text + "\",\"voice\":\"alloy\"}";
        Serial.println("Request JSON: " + jsonRequest);

        int httpResponseCode = http.POST(jsonRequest);

        if (httpResponseCode > 0) {
            Serial.print("HTTP Response code: ");
            Serial.println(httpResponseCode);

            if (httpResponseCode == 200) {
                Serial.println("Request sent. Processing response...");

                WiFiClient *stream = http.getStreamPtr();
                unsigned long startTime = millis();
                const unsigned long timeout = 180000; // 60 seconds timeout

                const int bufferSize = 4096; // Adjust buffer size as needed
                std::vector<uint8_t> buffer(bufferSize);
                
                while (stream->connected() || stream->available()) {
                    int available = stream->available();
                    if (available > 0) {
                        int bytesToRead = min(available, bufferSize);
                        int bytesRead = stream->readBytes(reinterpret_cast<char*>(buffer.data()), bytesToRead);
                        audioData.insert(audioData.end(), buffer.begin(), buffer.begin() + bytesRead);

                        // Reset the timeout timer
                        startTime = millis();
                        Serial.print("Bytes read: ");
                        Serial.println(bytesRead);
                    }

                    // Check for timeout
                    if (millis() - startTime > timeout) {
                        Serial.println("Response processing timed out.");
                        http.end();
                        return false;
                    }
                }

                Serial.print("Total audio data received: ");
                Serial.println(audioData.size());
                http.end();
                return true;
            } else {
                Serial.println("Request failed. Check the response body.");
                String response = http.getString();
                Serial.println("Response body:");
                Serial.println(response);
            }
        } else {
            Serial.print("Error on sending POST: ");
            Serial.println(httpResponseCode);

            if (httpResponseCode == -1) {
                Serial.println("Connection failed - Please check SSL setup or connection.");
            } else {
                Serial.println("Unexpected error.");
            }
        }

        http.end();
    } else {
        Serial.println("WiFi Disconnected");
    }
    return false;
}

void audio_data_callback(const uint8_t *data, uint32_t length) {
    if (!audioData.empty()) {
        uint32_t bytesToWrite = min(length, (uint32_t)audioData.size());
        memcpy((void *)data, audioData.data(), bytesToWrite);
        audioData.erase(audioData.begin(), audioData.begin() + bytesToWrite);
    }
}

void streamAudio() {
    if (!audioData.empty()) {
        a2dp_sink.set_stream_reader(audio_data_callback);
        a2dp_sink.start("Bluetooth Speaker");
        Serial.println("Streaming audio to Bluetooth speaker...");
    } else {
        Serial.println("No audio data available to stream.");
    }
}

void setup() {
    Serial.begin(115200);
    delay(1000); // Wait for the serial monitor to open

    // Connect to WiFi
    Serial.print("Connecting to WiFi....");
    WiFi.begin(ssid, password);
    while (WiFi.status() != WL_CONNECTED) {
        delay(1000);
        Serial.print(".");
    }
    Serial.println("Connected to WiFi");
     // Test HTTPS connection
  client.setCACert(root_ca);
  Serial.println("Testing HTTPS connection...");
  if (!client.connect("api.openai.com", 443)) {
    Serial.println("Connection to API failed!");
  } else {
    Serial.println("Connected to api.openai.com!");
    
  }

    // Test the TTS function
    String prompt = "Hello";
    if (getTTS(prompt)) {
        streamAudio();
    } else {
        Serial.println("Failed to get TTS");
    }
}

void loop() {
    // Add any additional logic for your application here
}

Maybe the text-to-speech server is overloaded and timing out.

Some review of the OpenAI contract and docs might be in order. Do they guarantee a data rate?


Looking at the output, I have the audio data from the API in the serial output. I guess there’s some problem in my code while processing that data, which is why it’s timing out.

I think you need to ask the AI support thing.

That might solve the problem; I might as well give it a try. However, do you think I should allow more space for the response audio? Will the ESP chip have enough memory? It already uses 118k of the flash.

There are newer ESP boards with more flash and PSRAM.

Consider following the online example to produce an MP3 file for the given text input, then save that and tell us how large that file is.

Using .wav (raw) audio file format, 16 bit encoding at 16,000 SPS, 60 seconds of speech occupies about 2 megabytes.

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.