Subject: Help Needed with ESP32 Audio Transcription to Deepgram
Hello Arduino Community,
I'm working on a voice recognition project using an ESP32 and an INMP441 microphone, where I intend to record audio, store it on an SD card, and then send it to Deepgram for transcription.
Project Overview:
- Microcontroller: ESP32
- Microphone: INMP441 (I2S)
- Audio Storage: SD card
- Transcription Service: Deepgram
- Sample Rate: 16 kHz
Issues:
I have verified the Wi-Fi connection, and the ESP32 connects to Deepgram successfully. However, no transcription comes back: after sending the audio I see no HTTP status line (e.g. 200 or 400) and no client-side error code (e.g. -1) on the serial monitor.
Complete Code:
cpp
#include <driver/i2s.h>
#include <SPI.h>
#include <SD.h>
#include <WiFi.h>
#include <WiFiClientSecure.h>
// Wi-Fi credentials passed to WiFi.begin() in setup().
// NOTE(review): credentials and the API key are hard-coded in the sketch —
// keep real values out of anything that gets posted or committed.
const char* wifi_name = "iot";
const char* wifi_password = "12345678";
// Chip-select pin handed to SPI.begin() and SD.begin() in setup().
const int CS_PIN = 5;
// Size in bytes of the scratch buffer below.
const int chunk_size = 1024;
// Scratch buffer http_request() uses to copy the recording from the SD card
// to the network client.
uint8_t buffer[chunk_size];
// Host name and port that http_request() connects to.
const char* server_url = "api.deepgram.com";
// Deepgram API key (placeholder value here).
const char* apikey = "my key";
const int server_port = 443;
// Response body captured by deepgram_transcription().
String transcription;
// Most recent response line read by deepgram_transcription().
String response;
// TLS client shared by http_request() (sends) and deepgram_transcription()
// (reads and closes).
WiFiClientSecure client;
// I2S driver and pin settings; filled in field by field inside setup().
i2s_config_t i2s_config;
i2s_pin_config_t i2s_pin_config;
// File handle http_request() uses to read the recording back from the SD card.
File audio_file;
// millis() timestamp taken by record() when a recording starts.
long start_time;
void setup() {
Serial.begin(115200);
// Initializing the WiFi
WiFi.begin(wifi_name, wifi_password);
Serial.println("Connecting to WiFi...");
while (WiFi.status() != WL_CONNECTED) {
Serial.print(".");
delay(500);
}
Serial.println("\nConnected to WiFi");
Serial.print("IP ADDRESS IS: ");
Serial.println(WiFi.localIP());
// Initializing the SPI bus and SD card module
SPI.begin(18, 19, 23, CS_PIN);
if (!SD.begin(CS_PIN)) {
Serial.println("Failed to initialize SD card");
return;
}
Serial.println("SD card initialization completed");
// Initializing the I2S mic (INMP441)
i2s_config.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX);
i2s_config.sample_rate = 16000;
i2s_config.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT;
i2s_config.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT;
i2s_config.communication_format = (i2s_comm_format_t)(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB);
i2s_config.fixed_mclk = 0;
i2s_config.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1;
i2s_config.dma_buf_count = 8;
i2s_config.dma_buf_len = 64;
i2s_config.use_apll = false;
i2s_config.tx_desc_auto_clear = true;
// Initializing the pin configuration of INMP441
i2s_pin_config.bck_io_num = 33;
i2s_pin_config.ws_io_num = 22;
i2s_pin_config.data_in_num = 35;
i2s_pin_config.data_out_num = I2S_PIN_NO_CHANGE;
// Installing the drivers and setting up I2S
i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);
i2s_set_pin(I2S_NUM_0, &i2s_pin_config);
i2s_zero_dma_buffer(I2S_NUM_0);
}
// Main cycle: capture a clip, upload it, print whatever the server sends
// back, then pause before the next round.
void loop() {
  const unsigned long pause_between_cycles_ms = 5000;

  record();                  // microphone -> SD card
  http_request();            // SD card -> HTTPS request
  deepgram_transcription();  // HTTPS response -> serial console

  Serial.println("Finally success..............");
  delay(pause_between_cycles_ms);
  Serial.println("Before sending next data");
}
void record() {
Serial.println("Starting to record...");
start_time = millis();
size_t byte_read;
int16_t audio_data[1024];
File audio_file = SD.open("/test.wav", FILE_WRITE);
if (!audio_file) {
Serial.println("Failed to open file for writing");
return;
}
while ((millis() - start_time) <= 2500) {
i2s_read(I2S_NUM_0, (void*)audio_data, sizeof(audio_data), &byte_read, portMAX_DELAY);
audio_file.write((uint8_t*)audio_data, byte_read);
}
audio_file.close();
Serial.println("Audio recording COMPLETED.");
}
void http_request() {
client.setInsecure();
if (client.connect(server_url, server_port)) {
Serial.println("Connected to Deepgram ");
client.println("POST /v1/listen?language=en&model=nova-2 HTTP/1.1");
client.println("Host: api.deepgram.com");
client.print("Authorization: Token apikey ");
client.println("Content-Type: audio/raw");
client.println("Transfer-Encoding: chunked");
client.println("Connection: close");
client.println();
audio_file = SD.open("/test.wav");
if (!audio_file) {
Serial.println("Failed to open the file");
return;
}
while (audio_file.available()) {
size_t bytes_read = audio_file.read(buffer, sizeof(buffer));
client.write(buffer, bytes_read);
}
client.println("0\r\n\r\n"); // End of chunked encoding
audio_file.close();
} else {
Serial.println("Failed to connect to Deepgram");
}
}
// Reads the HTTP reply on the shared client: prints the status line and each
// header, then stores the body in `transcription` and prints it. Always
// closes the connection before returning.
//
// Fixes relative to the earlier version:
//  - The loop also runs while buffered data remains, so a reply is not lost
//    when the server has already closed the connection.
//  - Waiting for the reply is bounded instead of looping for as long as the
//    socket stays open.
//  - The body is read unconditionally after the headers; previously it was
//    skipped whenever it had not arrived at the instant of the check.
//  - A message is printed when no reply arrives at all.
void deepgram_transcription() {
  // readStringUntil() returns an empty String when it times out (one Stream
  // timeout per call, 1 s by default in the Arduino Stream API), so this cap
  // bounds the total wait — presumably about 30 s; adjust if the client
  // timeout has been changed.
  const int max_idle_reads = 30;
  int idle_reads = 0;
  bool got_any_line = false;
  bool headers_done = false;

  while (client.connected() || client.available()) {
    response = client.readStringUntil('\n');

    // A lone CR is the blank line that ends the header section.
    if (response == "\r") {
      headers_done = true;
      break;
    }

    if (response.length() == 0) {
      idle_reads++;
      if (idle_reads >= max_idle_reads) {
        Serial.println("Timed out waiting for Deepgram response");
        break;
      }
      continue;
    }

    idle_reads = 0;
    got_any_line = true;
    response.trim();  // drop the trailing CR so each header prints on one line
    Serial.print("Server response: ");
    Serial.println(response);
  }

  if (headers_done) {
    // readString() itself waits up to the stream timeout for data, so no
    // available() pre-check is needed here.
    transcription = client.readString();
    Serial.println("TRANSCRIPTION: ");
    Serial.println(transcription);
  } else if (!got_any_line) {
    Serial.println("No response received from Deepgram");
  }

  client.stop();
}
Additional Context:
- I have verified that the audio file is being recorded correctly.
- I am not getting any response or error codes from Deepgram when sending the audio.
- I would appreciate any guidance on how to troubleshoot this issue or any suggestions for what might be going wrong in the code.
Thank you for your help!