Modifying the Micro_speech example in the TensorFlowLite library

System information

  • Windows 10
  • TensorFlow installed from Arduino library
  • 1.15.0-ALPHA precompiled
  • ESP32, Arduino

Following the micro_speech example in the Arduino_TensorFlowLite library, I tried to modify the code so that it would run with my I2S microphone, which produces 32-bit samples.
This proved to be a challenge, because the audio_provider files expect a specific number of samples of a specific size (16 bits).
In the end, the code got stuck in a loop while gathering audio data.

I’m not sure where to even begin debugging this, but my guess is that the capture buffer needs to be adjusted to accommodate the larger sample size.

So far I’ve modified the arduino_audio_provider.cpp file as follows:

#include "audio_provider.h"
#include <Arduino.h>
#include <driver/i2s.h>
#include "micro_features_micro_model_settings.h"
// Base size shared by the audio capture code in this file.
#define BUFFER_SIZE 1024
// I2S peripheral that the microphone is attached to.
const i2s_port_t I2S_PORT = I2S_NUM_0;
// Handle of the capture task; filled in when the task is created.
TaskHandle_t audioInputTaskHandle;
namespace {
// Set once InitAudioRecording() has completed successfully.
bool g_is_audio_initialized = false;
// Capacity of the capture ring buffer: 16x the base buffer size.
constexpr int kAudioCaptureBufferSize = BUFFER_SIZE * 16;
// Ring buffer of raw 32-bit I2S samples. It is sized with
// kAudioCaptureBufferSize because positions in it are computed modulo that
// constant; a smaller array would be indexed out of bounds.
// At 16x this occupies 64 KB of static RAM.
int32_t g_audio_capture_buffer[kAudioCaptureBufferSize];
// A buffer that holds the 16-bit samples handed back to the caller.
int16_t g_audio_output_buffer[kMaxAudioSampleSize];
// Timestamp (in milliseconds) of the most recently captured audio.
// Marked volatile so it can be polled in a while loop to see whether any
// samples have arrived yet.
volatile int32_t g_latest_audio_timestamp = 0;
}  // namespace
// The capture task only reads from I2S while this flag is true.
volatile bool audioCaptureFlag = false;
// Debug counter, incremented once per captured chunk.
volatile uint32_t testerino = 0;
void CaptureSamples( void *) {
  while(1)
  {
    if(audioCaptureFlag)
    {
      // Determine the index, in the history of all samples, of the last sample
      const int32_t start_sample_offset =
          g_latest_audio_timestamp * (kAudioSampleFrequency / 1000);//0
      // Determine the index of this sample in our ring buffer
      const int capture_index = start_sample_offset % kAudioCaptureBufferSize;//0
      size_t bytesRead;
      i2s_read(I2S_PORT, (void*) g_audio_capture_buffer + capture_index, BUFFER_SIZE, &bytesRead, portMAX_DELAY);
      // This is how many bytes of new data we have each time this is called
      const int number_of_samples = bytesRead / 8;
      // Calculate what timestamp the last audio sample represents
      const int32_t time_in_ms =
          g_latest_audio_timestamp +
          (number_of_samples / (kAudioSampleFrequency / 1000)); //6,12
      
      // Read the data to the correct place in our buffer
      // Serial.println(g_latest_audio_timestamp);
      // Serial.println(start_sample_offset);
      Serial.println(capture_index);
      delay(1);
      // for(uint16_t i = 0; i < bytesRead; i++)
        // Serial.printf("%d, %ld\n", i, g_audio_capture_buffer[i]);
        // Serial.println(g_audio_capture_buffer[i]);
      
      // PDM.read(g_audio_capture_buffer + capture_index, DEFAULT_PDM_BUFFER_SIZE);
      // This is how we let the outside world know that new audio data has arrived.
      // Serial.print("latest timestamp: ");
      // Serial.println(bytesRead);
      // Serial.flush();
      // delay(1);
      testerino++;

      g_latest_audio_timestamp = time_in_ms;
    }
    vTaskDelay(1);
  }
}

TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
    Serial.println("init audio recording");
    delay(10);

  // Hook up the callback that will be called with each sample
  // PDM.onReceive(CaptureSamples);
  // Start listening for audio: MONO @ 16KHz with gain at 20
  // PDM.begin(1, kAudioSampleFrequency);
  // PDM.setGain(20);
   // The I2S config as per the example
  const i2s_config_t i2s_config = {
      .mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX), // Receive, not transfer
      .sample_rate = 16000,                         // 16KHz
      .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT, // could only get it to work with 32bits
      .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT, // although the SEL config should be left, it seems to transmit on right
      .communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_I2S | I2S_COMM_FORMAT_I2S_MSB),
      .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,     // Interrupt level 1
      .dma_buf_count = 8,                           // number of buffers
      .dma_buf_len = BUFFER_SIZE                              // 32 samples per buffer (minimum)
  };

  // The pin config as per the setup
  const i2s_pin_config_t pin_config = {
      .bck_io_num = 13,   // BCKL
      .ws_io_num = 15,    // LRCL
      .data_out_num = -1, // not used (only for speakers)
      .data_in_num = 32   // DOUT
  };
  esp_err_t err;
  // Configuring the I2S driver and pins.
  // This function must be called before any I2S driver read/write operations.
  err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);
  if (err != ESP_OK) {
    Serial.printf("Failed installing driver: %d\n", err);
    while (true);
  }
  err = i2s_set_pin(I2S_PORT, &pin_config);
  if (err != ESP_OK) {
    Serial.printf("Failed setting pin: %d\n", err);
    while (true);
  }
  Serial.println("I2S driver installed.");
  delay(1);
  // uint32_t samples[8];
  // int bytes_read = i2s_pop_sample(I2S_PORT, (char *)&sample, portMAX_DELAY); // no timeout
  // size_t bytes_read;
  // esp_err_t error = i2s_read(I2S_PORT, (char *)samples, 8, &bytes_read, portMAX_DELAY);
  // Serial.println(error == ESP_OK);
  // delay(1);
  // if (bytes_read > 0) {
  //   float mean = 0;
  //   for (int i = 0; i < bytes_read; ++i) {
  //     mean += samples[i];
  //   }
  //   Serial.println(mean);
  // }
  // delay(10);
  // Block until we have our first audio sample
  xTaskCreate(CaptureSamples, "adc_read_task", kAudioCaptureBufferSize, NULL, 5, &audioInputTaskHandle);
  audioCaptureFlag = 1;

  while (!g_latest_audio_timestamp) {
  }

  return kTfLiteOk;
}

TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
                             int start_ms, int duration_ms,
                             int* audio_samples_size, int16_t** audio_samples) {
  // Set everything up to start receiving audio
  if (!g_is_audio_initialized) {
    TfLiteStatus init_status = InitAudioRecording(error_reporter);
    if (init_status != kTfLiteOk) {
      return init_status;
    }
    g_is_audio_initialized = true;
  }
  else
  {
    // Serial.println("task resumed");
    // Serial.flush();
    // delay(10);
    // xTaskCreate(CaptureSamples, "adc_read_task", kAudioCaptureBufferSize, NULL, 1, &audioInputTaskHandle);
    audioCaptureFlag = 1;
    // vTaskResume(audioInputTaskHandle);
  }


  // This next part should only be called when the main thread notices that the
  // latest audio sample data timestamp has changed, so that there's new data
  // in the capture ring buffer. The ring buffer will eventually wrap around and
  // overwrite the data, but the assumption is that the main thread is checking
  // often enough and the buffer is large enough that this call will be made
  // before that happens.

  // Determine the index, in the history of all samples, of the first
  // sample we want
  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
  // Determine how many samples we want in total
  const int duration_sample_count =
      duration_ms * (kAudioSampleFrequency / 1000);
  // Serial.println(duration_sample_count);
  // delay(10);
  for (int i = 0; i < duration_sample_count; ++i) {
    
    // delay(1);
    // For each sample, transform its index in the history of all samples into
    // its index in g_audio_capture_buffer
    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
    
    // Write the sample to the output buffer
    g_audio_output_buffer[i] = ((int16_t*)g_audio_capture_buffer)[capture_index];
    // Serial.printf("inside loop %d\n", i);
    // delay(1);
  }
  
  // Serial.printf("got samples\n");
  audioCaptureFlag = 0;
  // Serial.println(testerino);
  // delay(10);
  // testerino = 0;
  // vTaskDelete(audioInputTaskHandle);
  
  // Serial.println("task deleted");
  // delay(10);


  // Set pointers to provide access to the audio
  *audio_samples_size = kMaxAudioSampleSize;
  *audio_samples = g_audio_output_buffer;

  return kTfLiteOk;
}

// Returns the timestamp, in milliseconds, of the most recently captured audio.
int32_t LatestAudioTimestamp() {
  const int32_t latest_ms = g_latest_audio_timestamp;
  return latest_ms;
}

I’m very new to the TFLite Arduino library and I’d appreciate some help on how to modify the example.