Hello, and happy new year. I have been working on this issue for a couple of weeks and I can't solve it. I'm hoping one of you can help. Thank you in advance.
Expected result: speak full text-to-speech
Actual result: It only speaks a few words, e.g. "hello good morning". If any more words are added, only a blip of static is played.
Below I have listed the hardware with pinouts, the Arduino code, and the output from the serial monitor. Thank you in advance for taking the time to troubleshoot this with me.
Hardware:
ESP32-S3-DevKitC-1-N8R2
Partition Scheme: "Default 8M w/ spiffs (3MB APP/1.5MB spiffs)"
QSPI PSRAM
Flash: QIO 80MHz
Max98357 - ESP32
- VIN - 5v
- GND - GND
- SD - 21
- GAIN - (floating)
- DIN - 9
- BCLK - 14
- LRC - 40
INMP441 - ESP32
- VDD - 3v3
- GND - GND
- L/R - GND
- SD - 17
- WS - 40
- SCK - 14
//Arduino v2.3.4
#include <Arduino.h>
#include <WiFi.h>
#include <HTTPClient.h>
#include "driver/i2s.h"
#include <ArduinoJson.h>
#include <mbedtls/base64.h>
// I2S Pins
#define I2S_BCLK GPIO_NUM_14
#define I2S_LRCLK GPIO_NUM_40
#define I2S_DOUT GPIO_NUM_9
#define I2S_DIN GPIO_NUM_17
#define I2S_AMP_SD GPIO_NUM_21
#define SAMPLE_RATE 16000
#define MAX_AUDIO_DATA 100000
class TTSPlayer {
private:
HTTPClient http;
const char* apiKey;
int16_t* audioBuffer = nullptr;
size_t audioDataSize = 0;
float gain = 2.0f;
void initI2S() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 64,
.use_apll = false,
.tx_desc_auto_clear = true,
.fixed_mclk = 0
};
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_BCLK,
.ws_io_num = I2S_LRCLK,
.data_out_num = I2S_DOUT,
.data_in_num = I2S_PIN_NO_CHANGE
};
i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);
i2s_set_pin(I2S_NUM_0, &pin_config);
i2s_zero_dma_buffer(I2S_NUM_0);
}
void playAudio() {
Serial.println("Starting playback...");
i2s_zero_dma_buffer(I2S_NUM_0);
size_t totalBytesWritten = 0;
int16_t* bufferPtr = audioBuffer;
size_t samplesToWrite = audioDataSize / 2;
while (totalBytesWritten < samplesToWrite) {
size_t chunkSize = 1028;
if (samplesToWrite - totalBytesWritten < chunkSize) {
chunkSize = samplesToWrite - totalBytesWritten;
}
int16_t tempBuffer[1028];
for (size_t i = 0; i < chunkSize; i++) {
int32_t sample = (int32_t)(bufferPtr[i] * gain);
if (sample > INT16_MAX) {
sample = INT16_MAX;
} else if (sample < INT16_MIN) {
sample = INT16_MIN;
}
tempBuffer[i] = (int16_t)sample;
}
size_t bytesToWrite = chunkSize * sizeof(int16_t);
size_t numBytesWritten = 0;
esp_err_t result = i2s_write(I2S_NUM_0, tempBuffer, bytesToWrite, &numBytesWritten, portMAX_DELAY);
if (result != ESP_OK) {
Serial.println("I2S Write Error");
break;
}
bufferPtr += chunkSize;
totalBytesWritten += chunkSize;
}
i2s_zero_dma_buffer(I2S_NUM_0);
Serial.println("Playback complete!");
}
public:
TTSPlayer(const char* apiKeyParam) : apiKey(apiKeyParam) {
pinMode(I2S_AMP_SD, OUTPUT);
digitalWrite(I2S_AMP_SD, HIGH);
initI2S();
audioBuffer = (int16_t*)heap_caps_malloc(MAX_AUDIO_DATA, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (audioBuffer == NULL) {
Serial.println("Failed to allocate audio buffer");
}
}
~TTSPlayer() {
if (audioBuffer) {
free(audioBuffer);
}
}
void playTTS(const char* text) {
const char* tts_endpoint = "https://texttospeech.googleapis.com/v1/text:synthesize";
String url = String(tts_endpoint) + "?key=" + apiKey;
http.begin(url);
http.addHeader("Content-Type", "application/json");
DynamicJsonDocument doc(1024);
doc["input"]["text"] = text;
doc["voice"]["languageCode"] = "en-US";
doc["audioConfig"]["audioEncoding"] = "LINEAR16";
doc["audioConfig"]["sampleRateHertz"] = SAMPLE_RATE;
doc["audioConfig"]["volumeGainDb"] = 0.0;
String requestBody;
serializeJson(doc, requestBody);
Serial.println("Sending request to Google Cloud TTS...");
int httpCode = http.POST(requestBody);
if (httpCode > 0) {
Serial.printf("HTTP Response code: %d\n", httpCode);
if (httpCode == HTTP_CODE_OK) {
String response = http.getString();
DynamicJsonDocument responseDoc(32768);
DeserializationError error = deserializeJson(responseDoc, response);
if (!error) {
const char* audioContent = responseDoc["audioContent"];
if (audioContent && strlen(audioContent) > 0) {
Serial.println("Decoding audio data...");
size_t decodedLength = strlen(audioContent) * 3 / 4;
uint8_t* decodedAudio = (uint8_t*)malloc(decodedLength);
if (decodedAudio) {
size_t outputLength;
int decodeResult = mbedtls_base64_decode(
decodedAudio,
decodedLength,
&outputLength,
(const unsigned char*)audioContent,
strlen(audioContent)
);
if (decodeResult == 0) {
Serial.printf("Decoded %d bytes of audio data\n", outputLength);
memset(audioBuffer, 0, MAX_AUDIO_DATA);
if (outputLength <= MAX_AUDIO_DATA) {
memcpy(audioBuffer, decodedAudio, outputLength);
audioDataSize = outputLength;
playAudio();
} else {
Serial.println("Audio data too large for buffer");
}
}
free(decodedAudio);
}
}
}
}
}
http.end();
}
};
// Wi-Fi credentials and the Google Cloud API key (placeholders here).
const char* ssid     = "ssid";
const char* password = "password";
const char* apiKey   = "apiKey";

// The single player instance; created in setup() once Wi-Fi is connected.
TTSPlayer* player = nullptr;
// One-time start-up: open the serial console, join Wi-Fi, then build the
// player and speak a test phrase.
void setup() {
  Serial.begin(115200);

  // Block until the station is associated, printing a dot every 500 ms.
  WiFi.begin(ssid, password);
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    delay(500);
    Serial.print(".");
  }
  Serial.println("\nWiFi connected");

  // The player is created only now, after Serial and Wi-Fi are up.
  player = new TTSPlayer(apiKey);
  player->playTTS("Hello good morning");
}
// Nothing is done repeatedly; the sketch just idles in one-second sleeps.
void loop() {
  const unsigned long idleMs = 1000;
  delay(idleMs);
}
//Serial Monitor output
"hello good morning"
[SETUP] Starting initialization...
[WIFI] Connecting to WiFi.
[WIFI] Connected!
[WIFI] IP address: 192.168.1.42
[INIT] Initializing TTSPlayer...
[I2S] Initializing I2S...
[I2S] Initialization complete
[INIT] Allocated 100000 bytes for audio buffer
[INIT] TTSPlayer initialization complete
[SETUP] Playing test message...
[TTS] Preparing TTS request...
[TTS] Text to synthesize: "Hello good morning"
[TTS] Sending request to Google Cloud TTS...
[TTS] HTTP Response code: 200
[TTS] Request took 1103 ms
[TTS] Received response length: 55297 bytes
[TTS] Received base64 audio data length: 55272
[TTS] Decoding base64 audio data...
[TTS] Base64 decode complete in 81 ms
[TTS] Decoded audio size: 41454 bytes
[TTS] Starting audio playback...
[PLAY] Starting audio playback...
[PLAY] Total audio data size: 41454 bytes
[PLAY] Total samples to write: 20727
[PLAY] Progress: 16384/20727 samples (79.0%)
[PLAY] Playback complete! Duration: 1266 ms
[PLAY] Average playback rate: 31.98 KB/s
"hello good morning sir"
[SETUP] Starting initialization...
[WIFI] Connecting to WiFi..
[WIFI] Connected!
[WIFI] IP address: 192.168.1.42
[INIT] Initializing TTSPlayer...
[I2S] Initializing I2S...
[I2S] Initialization complete
[INIT] Allocated 100000 bytes for audio buffer
[INIT] TTSPlayer initialization complete
[SETUP] Playing test message...
[TTS] Preparing TTS request...
[TTS] Text to synthesize: "Hello good morning sir"
[TTS] Sending request to Google Cloud TTS...
[TTS] HTTP Response code: 200
[TTS] Request took 791 ms
[TTS] Received response length: 66913 bytes
[TTS] JSON parsing failed: NoMemory