Issue playing full text-to-speech through Google API

Hello, and happy new year. I have been working on this issue for a couple of weeks and I can't solve it. I'm hoping one of you can help. Thank you in advance.

Expected result: speak full text-to-speech

Actual result: It will only speak a few words, e.g. "hello good morning". If additional words are added, then only a blip of static is played.

Below I have hardware listed with pinouts, arduino code, and output from the serial monitor. Thank you in advance for taking any time to troubleshoot this for me.

Hardware:

ESP32-S3-DevKitC-1-N8R2
Partition Scheme: "Default 8M w/ spiffs (3MB APP/1.5MB spiffs)"
QSPI PSRAM
Flash: QIO 80MHz

Max98357 - ESP32

  • VIN - 5v
  • GND - GND
  • SD - 21
  • GAIN - (floating)
  • DIN - 9
  • BCLK - 14
  • LRC - 40

INMP441 - ESP32

  • VDD - 3v3
  • GND - GND
  • L/R - GND
  • SD - 17
  • WS - 40
  • SCK - 14
//Arduino v2.3.4

#include <Arduino.h>
#include <WiFi.h>
#include <HTTPClient.h>
#include "driver/i2s.h"
#include <ArduinoJson.h>
#include <mbedtls/base64.h>

// I2S Pins
// Bit clock and word-select (LRCLK) lines used for the I2S transmit path.
#define I2S_BCLK          GPIO_NUM_14
#define I2S_LRCLK         GPIO_NUM_40
// Serial data out to the amplifier's DIN pin.
#define I2S_DOUT          GPIO_NUM_9
// Data-in pin (GPIO 17 is the microphone SD line in the wiring list);
// not referenced by the code visible in this sketch.
#define I2S_DIN           GPIO_NUM_17
// Amplifier SD (shutdown) pin; the TTSPlayer constructor drives it HIGH.
#define I2S_AMP_SD        GPIO_NUM_21

// Sample rate in Hz, used both for the I2S configuration and for the
// sampleRateHertz field of the TTS request.
#define SAMPLE_RATE       16000
// Size of the playback buffer in BYTES (it is passed as a byte count to
// heap_caps_malloc). At 16 kHz / 16-bit mono this is roughly 3.1 s of audio.
#define MAX_AUDIO_DATA    100000 

class TTSPlayer {
private:
    HTTPClient http;
    const char* apiKey;
    int16_t* audioBuffer = nullptr;
    size_t audioDataSize = 0;
    float gain = 2.0f;

    void initI2S() {
        i2s_config_t i2s_config = {
            .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
            .sample_rate = SAMPLE_RATE,
            .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
            .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
            .communication_format = I2S_COMM_FORMAT_I2S,
            .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
            .dma_buf_count = 8,
            .dma_buf_len = 64,
            .use_apll = false,
            .tx_desc_auto_clear = true,
            .fixed_mclk = 0
        };

        i2s_pin_config_t pin_config = {
            .bck_io_num = I2S_BCLK,
            .ws_io_num = I2S_LRCLK,
            .data_out_num = I2S_DOUT,
            .data_in_num = I2S_PIN_NO_CHANGE
        };

        i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);
        i2s_set_pin(I2S_NUM_0, &pin_config);
        i2s_zero_dma_buffer(I2S_NUM_0);
    }

    void playAudio() {
        Serial.println("Starting playback...");
        i2s_zero_dma_buffer(I2S_NUM_0);
        size_t totalBytesWritten = 0;
        int16_t* bufferPtr = audioBuffer;
        size_t samplesToWrite = audioDataSize / 2;

        while (totalBytesWritten < samplesToWrite) {

            size_t chunkSize = 1028;
            if (samplesToWrite - totalBytesWritten < chunkSize) {
                chunkSize = samplesToWrite - totalBytesWritten;
            }

            int16_t tempBuffer[1028];


            for (size_t i = 0; i < chunkSize; i++) {
                int32_t sample = (int32_t)(bufferPtr[i] * gain);


                if (sample > INT16_MAX) {
                    sample = INT16_MAX;
                } else if (sample < INT16_MIN) {
                    sample = INT16_MIN;
                }

                tempBuffer[i] = (int16_t)sample;
            }

            size_t bytesToWrite = chunkSize * sizeof(int16_t);
            size_t numBytesWritten = 0;
            esp_err_t result = i2s_write(I2S_NUM_0, tempBuffer, bytesToWrite, &numBytesWritten, portMAX_DELAY);
            
            if (result != ESP_OK) {
                Serial.println("I2S Write Error");
                break;
            }

            bufferPtr += chunkSize;
            totalBytesWritten += chunkSize;
        }
        i2s_zero_dma_buffer(I2S_NUM_0);
        Serial.println("Playback complete!");
    }

public:
    TTSPlayer(const char* apiKeyParam) : apiKey(apiKeyParam) {
        pinMode(I2S_AMP_SD, OUTPUT);
        digitalWrite(I2S_AMP_SD, HIGH);
        initI2S();


        audioBuffer = (int16_t*)heap_caps_malloc(MAX_AUDIO_DATA, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
        if (audioBuffer == NULL) {
            Serial.println("Failed to allocate audio buffer");
        }
    }

    ~TTSPlayer() {
        if (audioBuffer) {
            free(audioBuffer);
        }
    }

    void playTTS(const char* text) {
        const char* tts_endpoint = "https://texttospeech.googleapis.com/v1/text:synthesize";
        String url = String(tts_endpoint) + "?key=" + apiKey;
        
        http.begin(url);
        http.addHeader("Content-Type", "application/json");

        DynamicJsonDocument doc(1024);
        doc["input"]["text"] = text;
        doc["voice"]["languageCode"] = "en-US";
        doc["audioConfig"]["audioEncoding"] = "LINEAR16";
        doc["audioConfig"]["sampleRateHertz"] = SAMPLE_RATE;
        doc["audioConfig"]["volumeGainDb"] = 0.0;
        
        String requestBody;
        serializeJson(doc, requestBody);

        Serial.println("Sending request to Google Cloud TTS...");
        int httpCode = http.POST(requestBody);
        
        if (httpCode > 0) {
            Serial.printf("HTTP Response code: %d\n", httpCode);
            
            if (httpCode == HTTP_CODE_OK) {
                String response = http.getString();
                DynamicJsonDocument responseDoc(32768);
                DeserializationError error = deserializeJson(responseDoc, response);
                
                if (!error) {
                    const char* audioContent = responseDoc["audioContent"];
                    if (audioContent && strlen(audioContent) > 0) {
                        Serial.println("Decoding audio data...");
                        
                        size_t decodedLength = strlen(audioContent) * 3 / 4;
                        uint8_t* decodedAudio = (uint8_t*)malloc(decodedLength);
                        
                        if (decodedAudio) {
                            size_t outputLength;
                            int decodeResult = mbedtls_base64_decode(
                                decodedAudio,
                                decodedLength,
                                &outputLength,
                                (const unsigned char*)audioContent,
                                strlen(audioContent)
                            );
                            
                            if (decodeResult == 0) {
                                Serial.printf("Decoded %d bytes of audio data\n", outputLength);
                                

                                memset(audioBuffer, 0, MAX_AUDIO_DATA);
                                

                                if (outputLength <= MAX_AUDIO_DATA) {
                                    memcpy(audioBuffer, decodedAudio, outputLength);
                                    audioDataSize = outputLength;
                                    
                                    playAudio();
                                } else {
                                    Serial.println("Audio data too large for buffer");
                                }
                            }
                            free(decodedAudio);
                        }
                    }
                }
            }
        }
        http.end();
    }
};

// Wi-Fi credentials and the Google Cloud API key used by the sketch
// (the values shown here look like placeholders - replace before flashing).
const char* ssid     = "ssid";
const char* password = "password";
const char* apiKey   = "apiKey";

// Created in setup() once Wi-Fi is up; never deleted in this sketch.
TTSPlayer* player = nullptr;

/**
 * Arduino entry point: opens the serial port, waits for Wi-Fi to connect,
 * then creates the TTS player and speaks one test phrase.
 */
void setup() {
    Serial.begin(115200);

    // Poll the connection state every 500 ms, printing a dot per attempt.
    WiFi.begin(ssid, password);
    for (;;) {
        if (WiFi.status() == WL_CONNECTED) {
            break;
        }
        delay(500);
        Serial.print(".");
    }
    Serial.println("\nWiFi connected");

    player = new TTSPlayer(apiKey);
    player->playTTS("Hello good morning");
}

/** Arduino main loop: nothing to do after setup(), so just idle. */
void loop() {
    const uint32_t idleMs = 1000;
    delay(idleMs);
}

//Serial Monitor output

"hello good morning"

[SETUP] Starting initialization...
[WIFI] Connecting to WiFi.
[WIFI] Connected!
[WIFI] IP address: 192.168.1.42
[INIT] Initializing TTSPlayer...
[I2S] Initializing I2S...
[I2S] Initialization complete
[INIT] Allocated 100000 bytes for audio buffer
[INIT] TTSPlayer initialization complete
[SETUP] Playing test message...

[TTS] Preparing TTS request...
[TTS] Text to synthesize: "Hello good morning"
[TTS] Sending request to Google Cloud TTS...
[TTS] HTTP Response code: 200
[TTS] Request took 1103 ms
[TTS] Received response length: 55297 bytes
[TTS] Received base64 audio data length: 55272
[TTS] Decoding base64 audio data...
[TTS] Base64 decode complete in 81 ms
[TTS] Decoded audio size: 41454 bytes
[TTS] Starting audio playback...
[PLAY] Starting audio playback...
[PLAY] Total audio data size: 41454 bytes
[PLAY] Total samples to write: 20727
[PLAY] Progress: 16384/20727 samples (79.0%)
[PLAY] Playback complete! Duration: 1266 ms
[PLAY] Average playback rate: 31.98 KB/s

"hello good morning sir"

[SETUP] Starting initialization...
[WIFI] Connecting to WiFi..
[WIFI] Connected!
[WIFI] IP address: 192.168.1.42
[INIT] Initializing TTSPlayer...
[I2S] Initializing I2S...
[I2S] Initialization complete
[INIT] Allocated 100000 bytes for audio buffer
[INIT] TTSPlayer initialization complete
[SETUP] Playing test message...

[TTS] Preparing TTS request...
[TTS] Text to synthesize: "Hello good morning sir"
[TTS] Sending request to Google Cloud TTS...
[TTS] HTTP Response code: 200
[TTS] Request took 791 ms
[TTS] Received response length: 66913 bytes
[TTS] JSON parsing failed: NoMemory

Did you not notice the out-of-memory error? I see one explicit malloc with no free, and a second allocation (if the name is accurate) in
audioBuffer = (int16_t*)heap_caps_malloc(MAX_AUDIO_DATA, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
The free for that one is only in the destructor.

Thank you for noticing that. I am new to Arduino and trying to teach myself. I added free(decodedAudio);
however, the same issue persists.

                             if (decodeResult == 0) {
                                Serial.printf("Decoded %d bytes of audio data\n", outputLength);
                                
                                memset(audioBuffer, 0, MAX_AUDIO_DATA);
                                
                                if (outputLength <= MAX_AUDIO_DATA) {
                                    memcpy(audioBuffer, decodedAudio, outputLength);
                                    audioDataSize = outputLength;
                                    
                                    playAudio();
                                } else {
                                    Serial.println("Audio data too large for buffer");
                                }
                                free(decodedAudio);
                            }

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.