I am trying to transcribe a WAV file recorded by an ESP32 connected to an INMP441 microphone: the ESP32 uploads the file to a Python Flask server, which transcribes it with a local Whisper model and returns the text to the ESP32 to show it on the serial printer. My problem is that I am unable to upload the WAV file; the request fails with HTTP error 400. I tested the Whisper server with a curl command and a saved WAV file, and that works correctly.
#include <driver/i2s.h>
#include <SPIFFS.h>
#include <WiFi.h>
#include <HTTPClient.h>
// GPIO numbers handed to i2s_set_pin() in i2sInit(): word select, serial data in, bit clock.
#define I2S_WS 10
#define I2S_SD 11
#define I2S_SCK 12
// I2S peripheral used for every i2s_* call in this sketch.
#define I2S_PORT I2S_NUM_0
// Capture format passed to the I2S driver: sample rate (Hz) and bits per sample.
#define I2S_SAMPLE_RATE (16000)
#define I2S_SAMPLE_BITS (16)
// Number of bytes requested from i2s_read() on each call in i2s_adc().
#define I2S_READ_LEN (16 * 1024)
#define RECORD_TIME (5) //Seconds
#define I2S_CHANNEL_NUM (1)
// Audio payload size in bytes: channels * sample rate * bytes per sample * seconds.
// This value is also written into the WAV header by SPIFFSInit().
#define FLASH_RECORD_SIZE (I2S_CHANNEL_NUM * I2S_SAMPLE_RATE * I2S_SAMPLE_BITS / 8 * RECORD_TIME)
// Handle of the recording file; shared by SPIFFSInit(), i2s_adc() and uploadFile().
File file;
// SPIFFS path of the recording.
const char filename[] = "/recording.wav";
// Length in bytes of the header produced by wavHeader().
const int headerSize = 44;
// Set to true by wifiConnect() once WiFi.status() reports WL_CONNECTED; read by i2s_adc().
bool isWIFIConnected;
void setup() {
Serial.begin(115200);
SPIFFSInit();
i2sInit();
xTaskCreate(i2s_adc, "i2s_adc", 1024 * 3, NULL, 1, NULL);
delay(500);
xTaskCreate(wifiConnect, "wifi_Connect", 4096, NULL, 0, NULL);
}
// Intentionally empty: all work happens in the tasks created by setup().
void loop() {
}
void SPIFFSInit(){
if(!SPIFFS.begin(true)){
Serial.println("SPIFFS initialisation failed!");
while(1) yield();
}
//SPIFFS.format();
SPIFFS.remove(filename);
file = SPIFFS.open(filename, FILE_WRITE);
if(!file){
Serial.println("File is not available!");
}
byte header[headerSize];
wavHeader(header, FLASH_RECORD_SIZE);
file.write(header, headerSize);
listSPIFFS();
}
void i2sInit(){
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = I2S_SAMPLE_RATE,
.bits_per_sample = i2s_bits_per_sample_t(I2S_SAMPLE_BITS),
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB),
.intr_alloc_flags = 0,
.dma_buf_count = 64,
.dma_buf_len = 1024,
.use_apll = 1
};
i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
const i2s_pin_config_t pin_config = {
.bck_io_num = I2S_SCK,
.ws_io_num = I2S_WS,
.data_out_num = -1,
.data_in_num = I2S_SD
};
i2s_set_pin(I2S_PORT, &pin_config);
}
/**
 * Rescales raw I2S bytes into the byte pairs that are stored in the WAV file.
 *
 * The source is consumed two bytes at a time (low byte first). From each pair
 * a 12-bit value is formed from the low nibble of the second byte and the
 * whole first byte. The destination receives a zero byte followed by that
 * value divided by 8, truncated to 8 bits.
 *
 * @param d_buff destination buffer; must hold at least len bytes
 * @param s_buff source buffer; read in pairs
 * @param len    number of source bytes to process (expected to be even)
 */
void i2s_adc_data_scale(uint8_t * d_buff, uint8_t* s_buff, uint32_t len)
{
  uint8_t* out = d_buff;
  for (uint32_t pos = 0; pos < len; pos += 2) {
    const uint32_t lowByte = s_buff[pos];
    const uint32_t highNibble = s_buff[pos + 1] & 0x0f;
    const uint32_t sample12 = (highNibble << 8) | lowByte;

    *out++ = 0;
    // sample12 * 256 / 2048 is the same as dropping the three lowest bits.
    *out++ = static_cast<uint8_t>(sample12 >> 3);
  }
}
void i2s_adc(void *arg)
{
int i2s_read_len = I2S_READ_LEN;
int flash_wr_size = 0;
size_t bytes_read;
char* i2s_read_buff = (char*) calloc(i2s_read_len, sizeof(char));
uint8_t* flash_write_buff = (uint8_t*) calloc(i2s_read_len, sizeof(char));
i2s_read(I2S_PORT, (void*) i2s_read_buff, i2s_read_len, &bytes_read, portMAX_DELAY);
i2s_read(I2S_PORT, (void*) i2s_read_buff, i2s_read_len, &bytes_read, portMAX_DELAY);
Serial.println(" *** Recording Start *** ");
while (flash_wr_size < FLASH_RECORD_SIZE) {
//read data from I2S bus, in this case, from ADC.
i2s_read(I2S_PORT, (void*) i2s_read_buff, i2s_read_len, &bytes_read, portMAX_DELAY);
//example_disp_buf((uint8_t*) i2s_read_buff, 64);
//save original data from I2S(ADC) into flash.
i2s_adc_data_scale(flash_write_buff, (uint8_t*)i2s_read_buff, i2s_read_len);
file.write((const byte*) flash_write_buff, i2s_read_len);
flash_wr_size += i2s_read_len;
ets_printf("Sound recording %u%%\n", flash_wr_size * 100 / FLASH_RECORD_SIZE);
ets_printf("Never Used Stack Size: %u\n", uxTaskGetStackHighWaterMark(NULL));
}
file.close();
free(i2s_read_buff);
i2s_read_buff = NULL;
free(flash_write_buff);
flash_write_buff = NULL;
listSPIFFS();
if(isWIFIConnected){
uploadFile();
}
vTaskDelete(NULL);
}
/**
 * Debug helper: dumps the first `length` bytes of `buf` as two-digit hex
 * values, eight per line, framed by "======" lines.
 *
 * @param buf    bytes to print
 * @param length number of bytes to print
 */
void example_disp_buf(uint8_t* buf, int length)
{
  printf("======\n");

  int printed = 0;
  while (printed < length) {
    printf("%02x ", buf[printed]);
    ++printed;
    // Break the line after every eighth value.
    if (printed % 8 == 0) {
      printf("\n");
    }
  }

  printf("======\n");
}
/**
 * Fills `header` with a 44-byte canonical PCM WAV header.
 *
 * Fix over the original: the byte-rate field (offsets 28..31) was written as
 * 00 7D 01 00, i.e. 97536, although 16 kHz * 1 channel * 2 bytes is 32000.
 * The format fields are now derived from named constants so they cannot
 * drift apart.
 *
 * @param header  destination buffer of at least headerSize (44) bytes
 * @param wavSize size in bytes of the audio data that follows the header
 */
void wavHeader(byte* header, int wavSize){
  // Recording format; keep in sync with I2S_SAMPLE_RATE, I2S_CHANNEL_NUM and
  // I2S_SAMPLE_BITS at the top of the sketch.
  const uint32_t sampleRate = 16000;
  const uint16_t channels = 1;
  const uint16_t bitsPerSample = 16;
  const uint16_t blockAlign = channels * bitsPerSample / 8;   // bytes per sample frame
  const uint32_t byteRate = sampleRate * blockAlign;          // bytes per second

  const uint32_t dataSize = (uint32_t) wavSize;
  const uint32_t fileSize = dataSize + headerSize - 8;        // RIFF chunk size

  // Small writers for the three field kinds used by the header.
  auto tag = [header](int at, const char (&id)[5]) {
    for (int i = 0; i < 4; ++i) {
      header[at + i] = (byte) id[i];
    }
  };
  auto le16 = [header](int at, uint16_t value) {
    header[at] = (byte) value;
    header[at + 1] = (byte) (value >> 8);
  };
  auto le32 = [header](int at, uint32_t value) {
    header[at] = (byte) value;
    header[at + 1] = (byte) (value >> 8);
    header[at + 2] = (byte) (value >> 16);
    header[at + 3] = (byte) (value >> 24);
  };

  tag(0, "RIFF");
  le32(4, fileSize);
  tag(8, "WAVE");
  tag(12, "fmt ");
  le32(16, 16);              // size of the fmt chunk
  le16(20, 1);               // audio format 1 = PCM
  le16(22, channels);
  le32(24, sampleRate);
  le32(28, byteRate);
  le16(32, blockAlign);
  le16(34, bitsPerSample);
  tag(36, "data");
  le32(40, dataSize);
}
void listSPIFFS(void) {
Serial.println(F("\r\nListing SPIFFS files:"));
static const char line[] PROGMEM = "=================================================";
Serial.println(FPSTR(line));
Serial.println(F(" File name Size"));
Serial.println(FPSTR(line));
fs::File root = SPIFFS.open("/");
if (!root) {
Serial.println(F("Failed to open directory"));
return;
}
if (!root.isDirectory()) {
Serial.println(F("Not a directory"));
return;
}
fs::File file = root.openNextFile();
while (file) {
if (file.isDirectory()) {
Serial.print("DIR : ");
String fileName = file.name();
Serial.print(fileName);
} else {
String fileName = file.name();
Serial.print(" " + fileName);
// File path can be 31 characters maximum in SPIFFS
int spaces = 33 - fileName.length(); // Tabulate nicely
if (spaces < 1) spaces = 1;
while (spaces--) Serial.print(" ");
String fileSize = (String) file.size();
spaces = 10 - fileSize.length(); // Tabulate nicely
if (spaces < 1) spaces = 1;
while (spaces--) Serial.print(" ");
Serial.println(fileSize + " bytes");
}
file = root.openNextFile();
}
Serial.println(FPSTR(line));
Serial.println();
delay(1000);
}
void wifiConnect(void *pvParameters){
isWIFIConnected = false;
char* ssid = "*****";
char* password = "*****";
WiFi.begin(ssid, password);
WiFi.setTxPower(WIFI_POWER_8_5dBm);
while(WiFi.status() != WL_CONNECTED){
vTaskDelay(500);
Serial.print(".");
}
isWIFIConnected = true;
while(true){
vTaskDelay(1000);
}
}
void uploadFile(){
file = SPIFFS.open(filename, FILE_READ);
if(!file){
Serial.println("FILE IS NOT AVAILABLE!");
return;
}
Serial.println("===> Upload FILE to python flask Server");
Serial.println(file.name());
Serial.println(file.size());
HTTPClient client;
client.begin("http://192.168.1.112:5000/upload");
client.addHeader("Content-Type", "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW");
String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW";
String body = "--" + boundary + "\r\n";
body += "Content-Disposition: form-data; name=\"file\"; filename=\"recording.wav\"\r\n";
body += "Content-Type: audio/wav\r\n\r\n";
int fileSize = file.size();
uint8_t *fileBuffer = new uint8_t[fileSize];
file.read(fileBuffer, fileSize);
file.close();
// Send the file data
int httpResponseCode = client.sendRequest("POST", body + String((char*)fileBuffer, fileSize) + "\r\n--" + boundary + "--");
Serial.print("httpResponseCode : ");
Serial.println(httpResponseCode);
if(httpResponseCode == 200){
String response = client.getString();
Serial.println("==================== Transcription ====================");
Serial.println(response);
Serial.println("==================== End ====================");
}else{
Serial.println("Error");
}
delete[] fileBuffer;
client.end();
}
this is the python code
from flask import Flask, abort, request
from flask_cors import CORS
from tempfile import NamedTemporaryFile
import whisper
import torch
import logging
logging.basicConfig(level=logging.DEBUG)
# Run Whisper on the NVIDIA GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Whisper model onto the selected device (DEVICE was previously
# computed but never used).
model = whisper.load_model("base", device=DEVICE)
# Flask application with CORS enabled for all routes.
app = Flask(__name__)
CORS(app)
@app.route("/")
def hello():
    """Health-check endpoint: returns a fixed text showing the service is up."""
    greeting = "Whisper service active!!"
    return greeting
@app.route('/upload', methods=['POST'])
def handler():
    """Transcribe every file uploaded in a multipart/form-data POST.

    Returns a dict (converted to JSON by Flask) of the form
    ``{'results': [{'filename': <form field name>, 'transcript': <text>}, ...]}``.
    Responds with 400 (Bad Request) and an explanatory message when the
    request carries no file parts.

    Changes over the original: the temporary file is closed (and therefore
    flushed) before Whisper opens it by name, and it is deleted afterwards.
    The original left one temp file behind per upload, contrary to its comment.
    """
    import os  # local import: only needed for temp-file cleanup

    if not request.files:
        # If the user didn't submit any files, return a 400 (Bad Request) error.
        abort(400, description="No file found in the request; "
                               "send it as multipart/form-data.")

    # For each file, let's store the results in a list of dictionaries.
    results = []

    # Loop over every file that the user submitted.
    for filename, handle in request.files.items():
        # Create a temporary file.
        # The location of the temporary file is available in `temp.name`.
        # delete=False keeps it on disk after close() so it can be reopened
        # by name; it is removed explicitly below.
        temp = NamedTemporaryFile(delete=False)
        try:
            # Write the user's uploaded file to the temporary file and close
            # it so that all buffered data is on disk.
            handle.save(temp)
            temp.close()
            # Let's get the transcript of the temporary file.
            result = model.transcribe(temp.name, language="spanish", fp16=False)
        finally:
            temp.close()
            os.remove(temp.name)

        # Now we can store the result object for this file.
        results.append({
            'filename': filename,
            'transcript': result['text'],
        })

    # This will be automatically converted to JSON.
    return {'results': results}
# Start Flask's built-in server on the given LAN address.
# NOTE(review): debug=True turns on the interactive debugger and the reloader;
# confirm this is only used on a trusted network.
app.run(host='192.168.1.112', debug = True)
the arduino IDE debug
File name Size
=================================================
recording.wav 163884 bytes
=================================================
===> Upload FILE to python flask Server
recording.wav
163884
[ 9396][D][HTTPClient.cpp:293] beginInternal(): protocol: http, host: 192.168.1.112 port: 5000 url: /upload
[ 9558][D][HTTPClient.cpp:574] sendRequest(): request type: 'POST' redirCount: 0
[ 9781][D][HTTPClient.cpp:1112] connect(): connected to 192.168.1.112:5000
[ 11065][D][HTTPClient.cpp:1257] handleHeaderResponse(): code: 400
[ 11071][D][HTTPClient.cpp:1260] handleHeaderResponse(): size: 167
[ 11077][D][HTTPClient.cpp:618] sendRequest(): sendRequest code=400
httpResponseCode : 400
Error
[ 11084][D][HTTPClient.cpp:373] disconnect(): still data in buffer (167), clean up.
[ 11094][D][HTTPClient.cpp:380] disconnect(): tcp stop
Thanks for posting all the code, and the output as code. Before getting to your stated issue: it's excessive to start a separate task to connect to WiFi. It could just as easily be called from `setup`, since it just runs briefly and then the task ends with an infinite loop.
The wavHeader
code is long and tediously lists 44 byte offsets. Computers are really good at counting. There's also no need to mask with 0xFF
if you're assigning to byte
-- any "extra" bits get lopped off anyway. You do need a cast to avoid a narrowing warning, but something like the following is more readable and less error-prone:
constexpr int headerSize = 44;

// Expands to the four bytes of x, least-significant byte first.
// x is parenthesised so that expressions such as `a + b` shift correctly;
// note that x is evaluated four times.
#define U32_BYTES_LE(x) static_cast<byte>(x), \
  static_cast<byte>((x) >> 8), static_cast<byte>((x) >> 16), static_cast<byte>((x) >> 24)

/**
 * Writes a 44-byte PCM WAV header (mono, 16 kHz, 16-bit) to `dest`.
 *
 * The byte-rate field is 32000 (16000 samples/s * 2 bytes). The bytes
 * copied from the original sketch encoded 97536 there.
 *
 * @param dest    destination buffer of at least headerSize bytes
 * @param wavSize size in bytes of the audio data that follows the header
 */
void wavHeader(byte *dest, int wavSize) {
  unsigned chunkSize = wavSize + headerSize - 8;
  byte header[] = {
    'R', 'I', 'F', 'F',
    U32_BYTES_LE(chunkSize),
    'W', 'A', 'V', 'E',
    'f', 'm', 't', ' ',
    U32_BYTES_LE(0x10),       // size of the fmt chunk
    0x01, 0x00,               // audio format: PCM
    0x01, 0x00,               // channels: 1
    U32_BYTES_LE(16000),      // sample rate
    U32_BYTES_LE(32000),      // byte rate = sample rate * block align
    0x02, 0x00,               // block align: bytes per sample frame
    0x10, 0x00,               // bits per sample
    'd', 'a', 't', 'a',
    U32_BYTES_LE(wavSize),
  };
  // The message keeps this valid before C++17 as well.
  static_assert(sizeof(header) == headerSize, "WAV header must be exactly headerSize bytes");
  memcpy(dest, header, sizeof(header));
}
The static_assert
catches mistakes like forgetting the space at the end of the "fmt "
tag
This topic's title mentions web sockets, but I don't see any of those. You might as well print the response body when the response is not 200
-- the error message can be helpful. If you have a random boundary assigned to a variable, you should use that variable when possible; the order of these two statements can be flipped to be:
String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW";
client.addHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
Does your ESP32 board have PSRAM? Is it Enabled (near the bottom of the board-specific options in the bottom half of the Tools menu)? If not, the maximum String
size is 64KB. If you exceed the max, the alloc/concat has no effect and the object is marked invalid. Your .wav
is 100KB bigger.
You could try a .wav
that is under the limit, which would have to be short and perhaps low-res, to see if all the other code works.
It would be more efficient overall to implement a Stream
that can do the HTTP multipart stuff; that wraps the File
, which is also a subclass of Stream
. HTTPClient::sendRequest
has an overload that takes a Stream
.
Has the server code been tested with something like curl
, which can do the upload?
curl -v -F file=@recording.wav 'http://192.168.1.112:5000/upload'
Hi kenB, thanks for the response. The curl command works fine. I have an ESP32-S3-N16R8, PSRAM is enabled in the Arduino IDE, and I flipped the boundary string as you suggested.
I'm a relative newbie with the ESP32; could you explain how to implement a more efficient multipart stream?
I tried to send the audio file directly as follows, but I get the same HTTP 400 error:
// Attempt: send the open file as the raw request body (no multipart framing),
// using the Stream overload of sendRequest with the file size as length.
// The Flask handler shown in this thread aborts with 400 when request.files is
// empty, so it rejects a body sent this way.
HTTPClient client;
client.begin("http://192.168.1.112:5000/upload");
client.addHeader("Content-Type", "audio/wav");
int httpResponseCode = client.sendRequest("POST", &file, file.size());
Serial.print("httpResponseCode : ");
Serial.println(httpResponseCode);
I recorded just 2 seconds; the file shrank to 65 KB, but nothing changed at all.
With this modification to the code I am able to copy the file to the server, but it is corrupted: the file is 33 KB and contains the WAV header, but I can't play it.
// Attempt: build the whole multipart body in one String, reading the file
// content with readString().
// NOTE(review): String is a text container; binary WAV bytes may not survive
// readString()/concatenation intact, and the String size limit discussed in
// this thread applies - verify body.length() against the expected size.
HTTPClient client;
client.begin("http://192.168.1.112:5000/upload");
String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW";
client.addHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
String body = "--" + boundary + "\r\n";
body += "Content-Disposition: form-data; name=\"file\"; filename=\"recording.wav\"\r\n";
body += "Content-Type: audio/wav\r\n\r\n";
body += file.readString();
body += "\r\n--" + boundary + "--\r\n";
// Calculate the content length
int contentLength = body.length();
// Set the Content-Length header
// (a reply in this thread notes that HTTPClient sets it itself for a String payload)
client.addHeader("Content-Length", String(contentLength));
int httpResponseCode = client.sendRequest("POST", body );
Serial.print("httpResponseCode : ");
Serial.println(httpResponseCode);
At any point, you can test the maximum possible String
length -- considering both the hard-coded limit and available unfragmented heap -- with a single loop like this
// Probe for the largest String that can currently be reserved: start at 6000
// bytes and grow in steps of 10000 until reserve() fails, printing each size.
for (size_t z = 6000; ; z += 10000) {
Serial.print(z);
// A fresh String per iteration, so each reserve() starts from an empty object.
String s;
if (s.reserve(z)) {
Serial.println(" is OK");
delay(10);
} else {
Serial.println(" is too big");
delay(2500); // if you need to pause to see the result
break;
}
}
It's the same status code, `400 Bad Request`, but the response body may have details about what is wrong.
In this case, the Python server code is expecting request.files
; a direct upload that is not multipart will do abort(400)
, which won't say anything useful. (You could add a message to that.) But maybe it will when trying multipart, and there's an error with it. So definitely do the
String response = client.getString();
Serial.println(response);
regardless of the httpResponseCode
. You could alter the server to handle both direct or multipart. The direct upload of a File
as a Stream
is -- as you have discovered -- much simpler on the client/Arduino side. But you should also be able to make the multipart work as well.
You mean 33KB out of 65KB? Is any data that is present actually correct, but some of it is missing (which parts); or is it all garbled with the wrong bytes? Does the header have the correct lengths after the "RIFF
" and "data
"? What is the Content-Length
that is sent? Is that the complete file length plus the several dozen other bytes that are added?
It will take a little time to put together a multipart stream.
I was recording 5 seconds (160884 bytes); the file created on the server has only 32812 bytes, and some of the saved data is correct.
first lines of the file uploaded>
RIFF$q WAVEfmt €> }ÿ data q ÿ û ý þ þ þ þ ý ý þ þ þ ý ý ÿ ÿ þ þ ý ý ú ü ü û û û û ü ü ü ü ü ú ú ù ú ú ø ø ó ó ñ ÷ ÷ ÷ ÷ ù ù þ ü ü ü ü ÿ ÿ ý ù ù õ õ ö ö
ü ú ú ÷ ÷ ý ý ÿ ý ý ü ü ü ü û ù ù ô ô ö ö û ø ø ÷ ÷ ú ú ø ÷ ÷ ù ù ø ø þ ô ö ö ø ø ù ù ý ù ÷ ÷ ø ø ù ù ù ø ù ù ù ù ÷ ÷ ÷ õ ö ö ó ó ÷ ÷ õ ÷ ô ô õ õ ö ö õ ÷ ÷ ÷ ö ö ú ú ô þ ý ý ü ü ñ û û û í í
the content length is calculated with the total body and send it as addheader
An aside about examining binary data
When dealing with binary data, use something like hexdump
, which should be installed on Linux or Mac
$ hexdump -C test_8u_16.wav | head -n 4
00000000 52 49 46 46 84 64 00 00 57 41 56 45 66 6d 74 20 |RIFF.d..WAVEfmt |
00000010 12 00 00 00 01 00 01 00 80 3e 00 00 80 3e 00 00 |.........>...>..|
00000020 01 00 08 00 00 00 64 61 74 61 cd 3f 00 00 80 80 |......data.?....|
00000030 80 80 80 80 80 7f 7e 80 7e 80 7e 7e 7e 7f 7d 7f |......~.~.~~~.}.|
I found that file in the ESP8266Audio library I happen to have installed. Looking at the (little-endian) size after "RIFF
", 0x6484
is 8 less than the size of the file, 25740 bytes, as expected. The size of the "fmt
" chunk happens to be 0x12
bytes instead of your 0x10
; the "data
" starts in the expected place accordingly, that many bytes after the four-byte size.
The file fragment you posted has some binary-to-text conversion: for one thing looks like all the 00
are presented as spaces (hex 20
)
00000000 52 49 46 46 24 71 02 20 57 41 56 45 66 6d 74 20 |RIFF$q. WAVEfmt |
00000010 10 20 20 20 01 20 01 20 e2 82 ac 3e 20 20 20 7d |. . . ...> }|
00000020 01 c3 bf 02 20 10 20 64 61 74 61 20 71 02 20 20 |.... . data q. |
And looking at the text here in the forum, there is the €
sign at offset 24, which has 0x80
in the wavHeader
function. But that's the appropriate character only on Windows; in any case, the UTF-8 encoding for it is that "e2 82 ac
", as shown just above. So brute-force reversing this -- unfortunately changing any legitimate 20
to 00
(like at the end of "fmt
")
$ pbpaste | tr ' ' '\000' | iconv -f utf-8 -t windows-1252 | hexdump -C
00000000 52 49 46 46 24 71 02 00 57 41 56 45 66 6d 74 00 |RIFF$q..WAVEfmt.|
00000010 10 00 00 00 01 00 01 00 80 3e 00 00 00 7d 01 ff |.........>...}..|
00000020 02 00 10 00 64 61 74 61 00 71 02 00 00 03 00 ff |....data.q......|
For whatever reason, after "7d 01
", at offset 31 should be "00
" but is instead 0xFF
. I don't know enough about the .wav
format to say whether that is significant.
Anyway, the particulars of how the binary data got mangled don't matter if you can avoid it in the first place. The "RIFF" size is 0x027124, or 160036. That's a lot closer to 160884, but still off by eight hundred bytes, more than all the extra stuff that is added. But not nearly as bad as 32812.
Back to the main problem
To troubleshoot this, verify the size/length at each step; e.g. did concatenating the body
actually work? And you don't actually have to set the Content-Length
manually when sending a String
, HTTPClient
will do it for you.
So for example
int contentLength = body.length();
Serial.println(contentLength); // did the concat work?
int httpResponseCode = client.sendRequest("POST", body );
Unfortunately, that doesn't tell you about the next step: how many bytes were actually sent. If you use a Stream
, HTTPClient
will print a debug message with that info. ESP32 actually has a StreamString
that combines the two (HTTPClient
uses it). You'll need
#include <StreamString.h>
and then change the body
initialization slightly
StreamString body;
body += "--" + boundary + "\r\n";
Unlike with a String
, for a Stream
, the sendRequest
takes a pointer; and because it's a stream, you need to pass the length, otherwise it won't set the Content-Length
int contentLength = body.length();
Serial.println(contentLength); // did the concat work?
int httpResponseCode = client.sendRequest("POST", &body, contentLength);
With Debug messages enabled, the output will be something like
[ 2134][D][HTTPClient.cpp:812] sendRequest(): Stream payload written: 161000
[ 3063][D][HTTPClient.cpp:1257] handleHeaderResponse(): code: 200
(I made up that 161000
.) Does it match the contentLength
, or the 33KB?
i will try your sugestions and get back.