ESPHome  2024.11.0
micro_wake_word.h
Go to the documentation of this file.
1 #pragma once
2 
3 #ifdef USE_ESP_IDF
4 
6 #include "streaming_model.h"
7 
11 
13 
14 #include <frontend_util.h>
15 
16 #include <tensorflow/lite/core/c/common.h>
17 #include <tensorflow/lite/micro/micro_interpreter.h>
18 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
19 
20 namespace esphome {
21 namespace micro_wake_word {
22 
// Operating state of the wake word component. MicroWakeWord::is_running()
// treats every value other than State::IDLE as "running".
// NOTE(review): the enumerators (listing lines 24-29) are absent from this
// extract - confirm the full set against the original header.
23 enum State {
30 };
31 
32 // The number of audio slices to process before accepting a positive detection
// Its negated value is the initial value of MicroWakeWord::ignore_windows_.
33 static const uint8_t MIN_SLICES_BEFORE_DETECTION = 74;
34 
// Component that holds one or more streaming wake word models (and, when
// USE_MICRO_WAKE_WORD_VAD is defined, a VAD model), a microphone reference,
// a ring buffer of audio and the audio frontend used to generate features.
// Only declarations are visible here; behavioural notes below are taken from
// the accompanying member descriptions and are hedged where not shown.
35 class MicroWakeWord : public Component {
36  public:
    // Component lifecycle overrides.
37  void setup() override;
38  void loop() override;
39  float get_setup_priority() const override;
40  void dump_config() override;
41 
    // Start / stop requests; definitions are not part of this header.
42  void start();
43  void stop();
44 
    // True whenever the current state is anything other than State::IDLE.
45  bool is_running() const { return this->state_ != State::IDLE; }
46 
    // Stores the step size later used by new_samples_to_get_().
    // NOTE(review): presumably expressed in milliseconds - confirm.
47  void set_features_step_size(uint8_t step_size) { this->features_step_size_ = step_size; }
48 
    // Stores the (non-owning) microphone pointer used as the audio source.
49  void set_microphone(microphone::Microphone *microphone) { this->microphone_ = microphone; }
50 
52 
    // Adds a wake word model described by its model data pointer, probability
    // cutoff, sliding window average size, wake word text and tensor arena size.
    // Definition is not part of this header.
53  void add_wake_word_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size,
54  const std::string &wake_word, size_t tensor_arena_size);
55 
56 #ifdef USE_MICRO_WAKE_WORD_VAD
    // Adds the VAD model; only available when USE_MICRO_WAKE_WORD_VAD is defined.
57  void add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
58  size_t tensor_arena_size);
59 #endif
60 
61  protected:
66 
    // Ring buffer of audio; read_microphone_() reads microphone audio into it.
67  std::unique_ptr<RingBuffer> ring_buffer_;
68 
    // All configured wake word models.
69  std::vector<WakeWordModel> wake_word_models_;
70 
71 #ifdef USE_MICRO_WAKE_WORD_VAD
    // Optional VAD model, owned by this component.
72  std::unique_ptr<VADModel> vad_model_;
73 #endif
74 
    // Op resolver passed to register_streaming_ops_().
    // NOTE(review): the template argument 20 is presumably the maximum number
    // of registered ops - confirm against the TFLite Micro documentation.
75  tflite::MicroMutableOpResolver<20> streaming_op_resolver_;
76 
77  // Audio frontend handles generating spectrogram features
78  struct FrontendConfig frontend_config_;
79  struct FrontendState frontend_state_;
80 
81  // When the wake word detection first starts, we ignore this many audio
82  // feature slices before accepting a positive detection
    // Starts at the negated MIN_SLICES_BEFORE_DETECTION.
83  int16_t ignore_windows_{-MIN_SLICES_BEFORE_DETECTION};
84 
86 
87  // Stores audio read from the microphone before being added to the ring buffer.
88  int16_t *input_buffer_{nullptr};
89  // Stores audio to be fed into the audio frontend for generating features.
90  int16_t *preprocessor_audio_buffer_{nullptr};
91 
    // NOTE(review): presumably record whether, and which, wake word was
    // detected - their use is not visible in this header.
92  bool detected_{false};
93  std::string detected_wake_word_{""};
94 
    // Sets the current state; definition is not part of this header.
95  void set_state_(State state);
96 
    // Tests if there are enough samples in the ring buffer to generate new features.
99  bool has_enough_samples_();
100 
    // Reads audio from microphone into the ring buffer.
    // NOTE(review): the meaning of the size_t return value is not shown here.
108  size_t read_microphone_();
109 
    // Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_.
112  bool allocate_buffers_();
113 
    // Frees memory allocated for input_buffer_ and preprocessor_audio_buffer_.
115  void deallocate_buffers_();
116 
    // Loads streaming models and prepares the feature generation frontend.
119  bool load_models_();
120 
    // Deletes each model's TFLite interpreters and frees tensor arena memory.
123  void unload_models_();
124 
131 
    // Checks every model's recent probabilities to determine if the wake word
    // has been predicted.
138  bool detect_wake_words_();
139 
    // Generates features for a window of audio samples, written into `features`.
147  bool generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE]);
148 
    // Resets the ring buffer, ignore_windows_, and sliding window probabilities.
150  void reset_states_();
151 
    // Returns true if successfully registered the streaming model's TensorFlow operations.
153  bool register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver);
154 
    // features_step_size_ multiplied by (AUDIO_SAMPLE_FREQUENCY / 1000).
    // NOTE(review): reads as "samples per step" assuming the step size is in
    // milliseconds and the frequency in Hz - confirm.
155  inline uint16_t new_samples_to_get_() { return (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)); }
156 };
157 
// Automation action whose play() calls start() on the parent MicroWakeWord.
// The arguments passed to play() are ignored.
158 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<MicroWakeWord> {
159  public:
160  void play(Ts... x) override { this->parent_->start(); }
161 };
162 
// Automation action whose play() calls stop() on the parent MicroWakeWord.
// The arguments passed to play() are ignored.
163 template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<MicroWakeWord> {
164  public:
165  void play(Ts... x) override { this->parent_->stop(); }
166 };
167 
// Automation condition whose check() returns the parent MicroWakeWord's
// is_running() result. The arguments passed to check() are ignored.
168 template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<MicroWakeWord> {
169  public:
170  bool check(Ts... x) override { return this->parent_->is_running(); }
171 };
172 
173 } // namespace micro_wake_word
174 } // namespace esphome
175 
176 #endif // USE_ESP_IDF
void add_wake_word_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_average_size, const std::string &wake_word, size_t tensor_arena_size)
void set_features_step_size(uint8_t step_size)
Trigger< std::string > * wake_word_detected_trigger_
bool detect_wake_words_()
Checks every model's recent probabilities to determine if the wake word has been predicted.
uint16_t x
Definition: tt21100.cpp:17
std::unique_ptr< RingBuffer > ring_buffer_
Helper class to request loop() to be called as fast as possible.
Definition: helpers.h:613
HighFrequencyLoopRequester high_freq_
std::vector< WakeWordModel > wake_word_models_
void update_model_probabilities_()
Performs inference with each configured model.
bool register_streaming_ops_(tflite::MicroMutableOpResolver< 20 > &op_resolver)
Returns true if successfully registered the streaming model's TensorFlow operations.
microphone::Microphone * microphone_
Base class for all automation conditions.
Definition: automation.h:74
bool allocate_buffers_()
Allocates memory for input_buffer_, preprocessor_audio_buffer_, and ring_buffer_. ...
bool has_enough_samples_()
Tests if there are enough samples in the ring buffer to generate new features.
tflite::MicroMutableOpResolver< 20 > streaming_op_resolver_
bool generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE])
Generates features for a window of audio samples.
void deallocate_buffers_()
Frees memory allocated for input_buffer_ and preprocessor_audio_buffer_.
void reset_states_()
Resets the ring buffer, ignore_windows_, and sliding window probabilities.
Trigger< std::string > * get_wake_word_detected_trigger() const
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
void add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size, size_t tensor_arena_size)
void set_microphone(microphone::Microphone *microphone)
bool load_models_()
Loads streaming models and prepares the feature generation frontend.
std::unique_ptr< VADModel > vad_model_
Helper class to easily give an object a parent of type T.
Definition: helpers.h:522
void unload_models_()
Deletes each model's TFLite interpreters and frees tensor arena memory.
bool state
Definition: fan.h:34
size_t read_microphone_()
Reads audio from microphone into the ring buffer.