ESPHome  2024.12.2
micro_wake_word.cpp
Go to the documentation of this file.
1 #include "micro_wake_word.h"
2 #include "streaming_model.h"
3 
4 #ifdef USE_ESP_IDF
5 
6 #include "esphome/core/hal.h"
7 #include "esphome/core/helpers.h"
8 #include "esphome/core/log.h"
9 
10 #include <frontend.h>
11 #include <frontend_util.h>
12 
13 #include <tensorflow/lite/core/c/common.h>
14 #include <tensorflow/lite/micro/micro_interpreter.h>
15 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
16 
17 #include <cmath>
18 
19 namespace esphome {
20 namespace micro_wake_word {
21 
// Tag used for every log message emitted by this component.
static const char *const TAG = "micro_wake_word";

// Audio sample rate in samples per second (16 kHz).
static constexpr size_t SAMPLE_RATE_HZ = 16000;
// Duration of audio the ring buffer is sized for, in milliseconds (0.064 seconds).
static constexpr size_t BUFFER_LENGTH = 64;
// Ring buffer capacity in samples; callers multiply by sizeof(int16_t) to get bytes.
static constexpr size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
// Number of samples requested per microphone read: 16 ms of audio at SAMPLE_RATE_HZ.
static constexpr size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000;
28 
30 
31 static const LogString *micro_wake_word_state_to_string(State state) {
32  switch (state) {
33  case State::IDLE:
34  return LOG_STR("IDLE");
36  return LOG_STR("START_MICROPHONE");
38  return LOG_STR("STARTING_MICROPHONE");
40  return LOG_STR("DETECTING_WAKE_WORD");
42  return LOG_STR("STOP_MICROPHONE");
44  return LOG_STR("STOPPING_MICROPHONE");
45  default:
46  return LOG_STR("UNKNOWN");
47  }
48 }
49 
51  ESP_LOGCONFIG(TAG, "microWakeWord:");
52  ESP_LOGCONFIG(TAG, " models:");
53  for (auto &model : this->wake_word_models_) {
54  model.log_model_config();
55  }
56 #ifdef USE_MICRO_WAKE_WORD_VAD
57  this->vad_model_->log_model_config();
58 #endif
59 }
60 
62  ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
63 
65  this->mark_failed();
66  return;
67  }
68 
69  ESP_LOGCONFIG(TAG, "Micro Wake Word initialized");
70 
71  this->frontend_config_.window.size_ms = FEATURE_DURATION_MS;
72  this->frontend_config_.window.step_size_ms = this->features_step_size_;
73  this->frontend_config_.filterbank.num_channels = PREPROCESSOR_FEATURE_SIZE;
74  this->frontend_config_.filterbank.lower_band_limit = 125.0;
75  this->frontend_config_.filterbank.upper_band_limit = 7500.0;
76  this->frontend_config_.noise_reduction.smoothing_bits = 10;
77  this->frontend_config_.noise_reduction.even_smoothing = 0.025;
78  this->frontend_config_.noise_reduction.odd_smoothing = 0.06;
79  this->frontend_config_.noise_reduction.min_signal_remaining = 0.05;
80  this->frontend_config_.pcan_gain_control.enable_pcan = 1;
81  this->frontend_config_.pcan_gain_control.strength = 0.95;
82  this->frontend_config_.pcan_gain_control.offset = 80.0;
83  this->frontend_config_.pcan_gain_control.gain_bits = 21;
84  this->frontend_config_.log_scale.enable_log = 1;
85  this->frontend_config_.log_scale.scale_shift = 6;
86 }
87 
88 void MicroWakeWord::add_wake_word_model(const uint8_t *model_start, float probability_cutoff,
89  size_t sliding_window_average_size, const std::string &wake_word,
90  size_t tensor_arena_size) {
91  this->wake_word_models_.emplace_back(model_start, probability_cutoff, sliding_window_average_size, wake_word,
92  tensor_arena_size);
93 }
94 
95 #ifdef USE_MICRO_WAKE_WORD_VAD
96 void MicroWakeWord::add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
97  size_t tensor_arena_size) {
98  this->vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
99 }
100 #endif
101 
103  switch (this->state_) {
104  case State::IDLE:
105  break;
107  ESP_LOGD(TAG, "Starting Microphone");
108  this->microphone_->start();
110  this->high_freq_.start();
111  break;
113  if (this->microphone_->is_running()) {
115  }
116  break;
118  while (!this->has_enough_samples_()) {
119  this->read_microphone_();
120  }
122  if (this->detect_wake_words_()) {
123  ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
124  this->detected_ = true;
126  }
127  break;
129  ESP_LOGD(TAG, "Stopping Microphone");
130  this->microphone_->stop();
132  this->high_freq_.stop();
133  this->unload_models_();
134  this->deallocate_buffers_();
135  break;
137  if (this->microphone_->is_stopped()) {
138  this->set_state_(State::IDLE);
139  if (this->detected_) {
141  this->detected_ = false;
142  this->detected_wake_word_ = "";
143  }
144  }
145  break;
146  }
147 }
148 
150  if (!this->is_ready()) {
151  ESP_LOGW(TAG, "Wake word detection can't start as the component hasn't been setup yet");
152  return;
153  }
154 
155  if (this->is_failed()) {
156  ESP_LOGW(TAG, "Wake word component is marked as failed. Please check setup logs");
157  return;
158  }
159 
160  if (!this->load_models_() || !this->allocate_buffers_()) {
161  ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
162  this->status_set_error();
163  } else {
164  this->status_clear_error();
165  }
166 
167  if (this->status_has_error()) {
168  ESP_LOGW(TAG, "Wake word component has an error. Please check logs");
169  return;
170  }
171 
172  if (this->state_ != State::IDLE) {
173  ESP_LOGW(TAG, "Wake word is already running");
174  return;
175  }
176 
177  this->reset_states_();
179 }
180 
182  if (this->state_ == State::IDLE) {
183  ESP_LOGW(TAG, "Wake word is already stopped");
184  return;
185  }
186  if (this->state_ == State::STOPPING_MICROPHONE) {
187  ESP_LOGW(TAG, "Wake word is already stopping");
188  return;
189  }
191 }
192 
194  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(micro_wake_word_state_to_string(this->state_)),
195  LOG_STR_ARG(micro_wake_word_state_to_string(state)));
196  this->state_ = state;
197 }
198 
200  size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
201  if (bytes_read == 0) {
202  return 0;
203  }
204 
205  size_t bytes_free = this->ring_buffer_->free();
206 
207  if (bytes_free < bytes_read) {
208  ESP_LOGW(TAG,
209  "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
210  "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
211  bytes_free, bytes_read);
212 
213  this->ring_buffer_->reset();
214  }
215 
216  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
217 }
218 
221 
222  if (this->input_buffer_ == nullptr) {
223  this->input_buffer_ = audio_samples_allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
224  if (this->input_buffer_ == nullptr) {
225  ESP_LOGE(TAG, "Could not allocate input buffer");
226  return false;
227  }
228  }
229 
230  if (this->preprocessor_audio_buffer_ == nullptr) {
231  this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(this->new_samples_to_get_());
232  if (this->preprocessor_audio_buffer_ == nullptr) {
233  ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
234  return false;
235  }
236  }
237 
238  if (this->ring_buffer_ == nullptr) {
239  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
240  if (this->ring_buffer_ == nullptr) {
241  ESP_LOGE(TAG, "Could not allocate ring buffer");
242  return false;
243  }
244  }
245 
246  return true;
247 }
248 
251  audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
252  this->input_buffer_ = nullptr;
253  audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
254  this->preprocessor_audio_buffer_ = nullptr;
255 }
256 
258  // Setup preprocessor feature generator
259  if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
260  ESP_LOGD(TAG, "Failed to populate frontend state");
261  FrontendFreeStateContents(&this->frontend_state_);
262  return false;
263  }
264 
265  // Setup streaming models
266  for (auto &model : this->wake_word_models_) {
267  if (!model.load_model(this->streaming_op_resolver_)) {
268  ESP_LOGE(TAG, "Failed to initialize a wake word model.");
269  return false;
270  }
271  }
272 #ifdef USE_MICRO_WAKE_WORD_VAD
273  if (!this->vad_model_->load_model(this->streaming_op_resolver_)) {
274  ESP_LOGE(TAG, "Failed to initialize VAD model.");
275  return false;
276  }
277 #endif
278 
279  return true;
280 }
281 
283  FrontendFreeStateContents(&this->frontend_state_);
284 
285  for (auto &model : this->wake_word_models_) {
286  model.unload_model();
287  }
288 #ifdef USE_MICRO_WAKE_WORD_VAD
289  this->vad_model_->unload_model();
290 #endif
291 }
292 
294  int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
295 
296  if (!this->generate_features_for_window_(audio_features)) {
297  return;
298  }
299 
300  // Increase the counter since the last positive detection
301  this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
302 
303  for (auto &model : this->wake_word_models_) {
304  // Perform inference
305  model.perform_streaming_inference(audio_features);
306  }
307 #ifdef USE_MICRO_WAKE_WORD_VAD
308  this->vad_model_->perform_streaming_inference(audio_features);
309 #endif
310 }
311 
313  // Verify we have processed samples since the last positive detection
314  if (this->ignore_windows_ < 0) {
315  return false;
316  }
317 
318 #ifdef USE_MICRO_WAKE_WORD_VAD
319  bool vad_state = this->vad_model_->determine_detected();
320 #endif
321 
322  for (auto &model : this->wake_word_models_) {
323  if (model.determine_detected()) {
324 #ifdef USE_MICRO_WAKE_WORD_VAD
325  if (vad_state) {
326 #endif
327  this->detected_wake_word_ = model.get_wake_word();
328  return true;
329 #ifdef USE_MICRO_WAKE_WORD_VAD
330  } else {
331  ESP_LOGD(TAG, "Wake word model predicts %s, but VAD model doesn't.", model.get_wake_word().c_str());
332  }
333 #endif
334  }
335  }
336 
337  return false;
338 }
339 
341  return this->ring_buffer_->available() >=
342  (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)) * sizeof(int16_t);
343 }
344 
345 bool MicroWakeWord::generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
346  // Ensure we have enough new audio samples in the ring buffer for a full window
347  if (!this->has_enough_samples_()) {
348  return false;
349  }
350 
351  size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_),
352  this->new_samples_to_get_() * sizeof(int16_t), pdMS_TO_TICKS(200));
353 
354  if (bytes_read == 0) {
355  ESP_LOGE(TAG, "Could not read data from Ring Buffer");
356  } else if (bytes_read < this->new_samples_to_get_() * sizeof(int16_t)) {
357  ESP_LOGD(TAG, "Partial Read of Data by Model");
358  ESP_LOGD(TAG, "Could only read %d bytes when required %d bytes ", bytes_read,
359  (int) (this->new_samples_to_get_() * sizeof(int16_t)));
360  return false;
361  }
362 
363  size_t num_samples_read;
364  struct FrontendOutput frontend_output = FrontendProcessSamples(
365  &this->frontend_state_, this->preprocessor_audio_buffer_, this->new_samples_to_get_(), &num_samples_read);
366 
367  for (size_t i = 0; i < frontend_output.size; ++i) {
368  // These scaling values are set to match the TFLite audio frontend int8 output.
369  // The feature pipeline outputs 16-bit signed integers in roughly a 0 to 670
370  // range. In training, these are then arbitrarily divided by 25.6 to get
371  // float values in the rough range of 0.0 to 26.0. This scaling is performed
372  // for historical reasons, to match up with the output of other feature
373  // generators.
374  // The process is then further complicated when we quantize the model. This
375  // means we have to scale the 0.0 to 26.0 real values to the -128 to 127
376  // signed integer numbers.
377  // All this means that to get matching values from our integer feature
378  // output into the tensor input, we have to perform:
379  // input = (((feature / 25.6) / 26.0) * 256) - 128
380  // To simplify this and perform it in 32-bit integer math, we rearrange to:
381  // input = (feature * 256) / (25.6 * 26.0) - 128
382  constexpr int32_t value_scale = 256;
383  constexpr int32_t value_div = 666; // 666 = 25.6 * 26.0 after rounding
384  int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
385  value -= 128;
386  if (value < -128) {
387  value = -128;
388  }
389  if (value > 127) {
390  value = 127;
391  }
392  features[i] = value;
393  }
394 
395  return true;
396 }
397 
399  ESP_LOGD(TAG, "Resetting buffers and probabilities");
400  this->ring_buffer_->reset();
401  this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
402  for (auto &model : this->wake_word_models_) {
403  model.reset_probabilities();
404  }
405 #ifdef USE_MICRO_WAKE_WORD_VAD
406  this->vad_model_->reset_probabilities();
407 #endif
408 }
409 
410 bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
411  if (op_resolver.AddCallOnce() != kTfLiteOk)
412  return false;
413  if (op_resolver.AddVarHandle() != kTfLiteOk)
414  return false;
415  if (op_resolver.AddReshape() != kTfLiteOk)
416  return false;
417  if (op_resolver.AddReadVariable() != kTfLiteOk)
418  return false;
419  if (op_resolver.AddStridedSlice() != kTfLiteOk)
420  return false;
421  if (op_resolver.AddConcatenation() != kTfLiteOk)
422  return false;
423  if (op_resolver.AddAssignVariable() != kTfLiteOk)
424  return false;
425  if (op_resolver.AddConv2D() != kTfLiteOk)
426  return false;
427  if (op_resolver.AddMul() != kTfLiteOk)
428  return false;
429  if (op_resolver.AddAdd() != kTfLiteOk)
430  return false;
431  if (op_resolver.AddMean() != kTfLiteOk)
432  return false;
433  if (op_resolver.AddFullyConnected() != kTfLiteOk)
434  return false;
435  if (op_resolver.AddLogistic() != kTfLiteOk)
436  return false;
437  if (op_resolver.AddQuantize() != kTfLiteOk)
438  return false;
439  if (op_resolver.AddDepthwiseConv2D() != kTfLiteOk)
440  return false;
441  if (op_resolver.AddAveragePool2D() != kTfLiteOk)
442  return false;
443  if (op_resolver.AddMaxPool2D() != kTfLiteOk)
444  return false;
445  if (op_resolver.AddPad() != kTfLiteOk)
446  return false;
447  if (op_resolver.AddPack() != kTfLiteOk)
448  return false;
449  if (op_resolver.AddSplitV() != kTfLiteOk)
450  return false;
451 
452  return true;
453 }
454 
455 } // namespace micro_wake_word
456 } // namespace esphome
457 
458 #endif // USE_ESP_IDF
void add_wake_word_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size)
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27
Trigger< std::string > * wake_word_detected_trigger_
bool detect_wake_words_()
Checks every model's recent probabilities to determine if the wake word has been predicted.
bool is_failed() const
Definition: component.cpp:143
std::unique_ptr< RingBuffer > ring_buffer_
T * allocate(size_t n)
Definition: helpers.h:690
HighFrequencyLoopRequester high_freq_
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95
bool is_ready() const
Definition: component.cpp:144
bool status_has_error() const
Definition: component.cpp:150
std::vector< WakeWordModel > wake_word_models_
void update_model_probabilities_()
Performs inference with each configured model.
void status_set_error(const char *message="unspecified")
Definition: component.cpp:159
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
microphone::Microphone * microphone_
void start()
Start running the loop continuously.
Definition: helpers.cpp:670
bool allocate_buffers_()
Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_. ...
bool has_enough_samples_()
Tests if there are enough samples in the ring buffer to generate new features.
tflite::MicroMutableOpResolver< 20 > streaming_op_resolver_
bool generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE])
Generates features for a window of audio samples.
void deallocate_buffers_()
Frees memory allocated for input_buffer_ and preprocessor_audio_buffer_.
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:676
void deallocate(T *p, size_t n)
Definition: helpers.h:709
void reset_states_()
Resets the ring buffer, ignore_windows_, and sliding window probabilities.
void status_clear_error()
Definition: component.cpp:172
virtual size_t read(int16_t *buf, size_t len)=0
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
void add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
An STL allocator that uses SPI or internal RAM.
Definition: helpers.h:675
bool load_models_()
Loads streaming models and prepares the feature generation frontend.
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:22
std::unique_ptr< VADModel > vad_model_
void unload_models_()
Deletes each model's TFLite interpreters and frees tensor arena memory.
bool state
Definition: fan.h:34
size_t read_microphone_()
Reads audio from microphone into the ring buffer.