ESPHome  2024.4.0
voice_assistant.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include "esphome/core/defines.h"
4 
5 #ifdef USE_VOICE_ASSISTANT
6 
9 #include "esphome/core/helpers.h"
11 
15 #ifdef USE_SPEAKER
17 #endif
18 #ifdef USE_MEDIA_PLAYER
20 #endif
22 
23 #ifdef USE_ESP_ADF
24 #include <esp_vad.h>
25 #endif
26 
27 namespace esphome {
28 namespace voice_assistant {
29 
// Legacy protocol version numbers, as returned by VoiceAssistant::get_legacy_version():
//   1 - initial version
//   2 - adds raw speaker support
static const uint32_t LEGACY_INITIAL_VERSION{1};
static const uint32_t LEGACY_SPEAKER_SUPPORT{2};
34 
// Feature bits with a uint32_t underlying type, matching the return type of
// VoiceAssistant::get_feature_flags().
// NOTE(review): listing lines 36 and 38 are elided in this capture; further
// enumerators presumably exist there - confirm against the original header.
35 enum VoiceAssistantFeature : uint32_t {
 // Bit 1 (value 2).
37  FEATURE_SPEAKER = 1 << 1,
39 };
40 
// State values for the voice assistant; VoiceAssistant::is_running() treats
// every value other than IDLE as running.
// NOTE(review): listing lines 43-54 are elided in this capture, so only IDLE is
// visible here - confirm the remaining enumerators against the original header.
41 enum class State {
42  IDLE,
55 };
56 
// Audio mode selector stored in a uint8_t.
// NOTE(review): the enumerator lines (listing lines 58-59) are elided in this
// capture. VoiceAssistant::audio_mode_ defaults to AUDIO_MODE_UDP, so that name
// is presumably declared here - confirm against the original header.
57 enum AudioMode : uint8_t {
60 };
61 
// Voice assistant component. Derives from Component and overrides setup(),
// loop() and get_setup_priority(). Holds pointers to a microphone and,
// depending on build flags, a speaker and/or media player, plus the trigger
// objects exposed through the get_*_trigger() accessors.
// NOTE(review): this listing has elided lines (gaps in the listing numbers), so
// some declarations referenced below are not visible here.
62 class VoiceAssistant : public Component {
63  public:
 // Component overrides; only declared in this header.
64  void setup() override;
65  void loop() override;
66  float get_setup_priority() const override;
67  void start_streaming();
68  void start_streaming(struct sockaddr_storage *addr, uint16_t port);
69  void failed_to_start();
70 
 // Stores the microphone pointer; no ownership transfer is visible here.
71  void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; }
72 #ifdef USE_SPEAKER
 // Stores the speaker pointer and marks local output as enabled.
73  void set_speaker(speaker::Speaker *speaker) {
74  this->speaker_ = speaker;
75  this->local_output_ = true;
76  }
77 #endif
78 #ifdef USE_MEDIA_PLAYER
 // NOTE(review): listing line 79 is elided here; it presumably holds the
 // signature of a set_media_player(...) setter whose body follows - confirm.
 // The visible body stores media_player_ and marks local output as enabled.
80  this->media_player_ = media_player;
81  this->local_output_ = true;
82  }
83 #endif
84 
 // Returns LEGACY_SPEAKER_SUPPORT when built with USE_SPEAKER and a speaker
 // pointer has been set; otherwise returns LEGACY_INITIAL_VERSION.
85  uint32_t get_legacy_version() const {
86 #ifdef USE_SPEAKER
87  if (this->speaker_ != nullptr) {
88  return LEGACY_SPEAKER_SUPPORT;
89  }
90 #endif
91  return LEGACY_INITIAL_VERSION;
92  }
93 
 // Builds and returns a uint32_t flag word starting from 0.
 // NOTE(review): listing lines 96 and 99-100 are elided, so the statements
 // that actually set bits are not visible in this capture - confirm.
94  uint32_t get_feature_flags() const {
95  uint32_t flags = 0;
97 #ifdef USE_SPEAKER
98  if (this->speaker_ != nullptr) {
101  }
102 #endif
103  return flags;
104  }
105 
106  void request_start(bool continuous, bool silence_detection);
107  void request_stop();
108 
109  void on_event(const api::VoiceAssistantEventResponse &msg);
110  void on_audio(const api::VoiceAssistantAudio &msg);
111 
 // Running means the current state is anything other than State::IDLE.
112  bool is_running() const { return this->state_ != State::IDLE; }
113  void set_continuous(bool continuous) { this->continuous_ = continuous; }
114  bool is_continuous() const { return this->continuous_; }
115 
116  void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; }
117 #ifdef USE_ESP_ADF
118  void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; }
119 #endif
120 
121  void set_noise_suppression_level(uint8_t noise_suppression_level) {
122  this->noise_suppression_level_ = noise_suppression_level;
123  }
124  void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; }
125  void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; }
126 
 // Accessors for the trigger objects allocated by the member initializers
 // further down; each returns the stored pointer unchanged.
127  Trigger<> *get_intent_end_trigger() const { return this->intent_end_trigger_; }
128  Trigger<> *get_intent_start_trigger() const { return this->intent_start_trigger_; }
129  Trigger<> *get_listening_trigger() const { return this->listening_trigger_; }
130  Trigger<> *get_end_trigger() const { return this->end_trigger_; }
131  Trigger<> *get_start_trigger() const { return this->start_trigger_; }
132  Trigger<> *get_stt_vad_end_trigger() const { return this->stt_vad_end_trigger_; }
133  Trigger<> *get_stt_vad_start_trigger() const { return this->stt_vad_start_trigger_; }
134 #ifdef USE_SPEAKER
135  Trigger<> *get_tts_stream_start_trigger() const { return this->tts_stream_start_trigger_; }
136  Trigger<> *get_tts_stream_end_trigger() const { return this->tts_stream_end_trigger_; }
137 #endif
138  Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; }
139  Trigger<std::string> *get_stt_end_trigger() const { return this->stt_end_trigger_; }
140  Trigger<std::string> *get_tts_end_trigger() const { return this->tts_end_trigger_; }
141  Trigger<std::string> *get_tts_start_trigger() const { return this->tts_start_trigger_; }
142  Trigger<std::string, std::string> *get_error_trigger() const { return this->error_trigger_; }
143  Trigger<> *get_idle_trigger() const { return this->idle_trigger_; }
144 
145  Trigger<> *get_client_connected_trigger() const { return this->client_connected_trigger_; }
146  Trigger<> *get_client_disconnected_trigger() const { return this->client_disconnected_trigger_; }
147 
148  void client_subscription(api::APIConnection *client, bool subscribe);
 // Returns the stored API client pointer; nullptr is its default value.
149  api::APIConnection *get_api_connection() const { return this->api_client_; }
150 
151  void set_wake_word(const std::string &wake_word) { this->wake_word_ = wake_word; }
152 
153  protected:
154  int read_microphone_();
155  void set_state_(State state);
156  void set_state_(State state, State desired_state);
157  void signal_stop_();
158 
159  std::unique_ptr<socket::Socket> socket_ = nullptr;
160  struct sockaddr_storage dest_addr_;
161 
 // Trigger objects are heap-allocated once per instance by these initializers.
162  Trigger<> *intent_end_trigger_ = new Trigger<>();
163  Trigger<> *intent_start_trigger_ = new Trigger<>();
164  Trigger<> *listening_trigger_ = new Trigger<>();
165  Trigger<> *end_trigger_ = new Trigger<>();
166  Trigger<> *start_trigger_ = new Trigger<>();
167  Trigger<> *stt_vad_start_trigger_ = new Trigger<>();
168  Trigger<> *stt_vad_end_trigger_ = new Trigger<>();
169 #ifdef USE_SPEAKER
170  Trigger<> *tts_stream_start_trigger_ = new Trigger<>();
171  Trigger<> *tts_stream_end_trigger_ = new Trigger<>();
172 #endif
173  Trigger<> *wake_word_detected_trigger_ = new Trigger<>();
174  Trigger<std::string> *stt_end_trigger_ = new Trigger<std::string>();
175  Trigger<std::string> *tts_end_trigger_ = new Trigger<std::string>();
176  Trigger<std::string> *tts_start_trigger_ = new Trigger<std::string>();
 // NOTE(review): listing line 177 is elided; presumably the error_trigger_
 // member returned by get_error_trigger() is declared there - confirm.
178  Trigger<> *idle_trigger_ = new Trigger<>();
179 
180  Trigger<> *client_connected_trigger_ = new Trigger<>();
181  Trigger<> *client_disconnected_trigger_ = new Trigger<>();
182 
183  api::APIConnection *api_client_{nullptr};
184 
185  microphone::Microphone *mic_{nullptr};
186 #ifdef USE_SPEAKER
187  void write_speaker_();
188  speaker::Speaker *speaker_{nullptr};
 // NOTE(review): no default initializer on this pointer, unlike its
 // neighbours; presumably assigned before first use - confirm.
189  uint8_t *speaker_buffer_;
190  size_t speaker_buffer_index_{0};
191  size_t speaker_buffer_size_{0};
192  size_t speaker_bytes_received_{0};
193  bool wait_for_stream_end_{false};
194  bool stream_ended_{false};
195 #endif
196 #ifdef USE_MEDIA_PLAYER
197  media_player::MediaPlayer *media_player_{nullptr};
198 #endif
199 
 // Set to true by the speaker setter (and by the media player setter body).
200  bool local_output_{false};
201 
202  std::string conversation_id_{""};
203 
204  std::string wake_word_{""};
205 
 // NOTE(review): listing line 206 is elided in this capture.
207 
208 #ifdef USE_ESP_ADF
209  vad_handle_t vad_instance_;
210  uint8_t vad_threshold_{5};
211  uint8_t vad_counter_{0};
212 #endif
213  std::unique_ptr<RingBuffer> ring_buffer_;
214 
 // NOTE(review): listing lines 215-216 and 218 are elided; the members written
 // by set_use_wake_word(), set_noise_suppression_level() and
 // set_volume_multiplier() are presumably declared there - confirm.
 // auto_gain_ has no default initializer here.
217  uint8_t auto_gain_;
219 
 // NOTE(review): neither buffer pointer has a default initializer; presumably
 // both are assigned before first use - confirm.
220  uint8_t *send_buffer_;
221  int16_t *input_buffer_;
222 
223  bool continuous_{false};
 // NOTE(review): listing lines 224 and 226 are elided; the state_ member read
 // by is_running() is presumably declared there - confirm.
225 
227  State desired_state_{State::IDLE};
228 
229  AudioMode audio_mode_{AUDIO_MODE_UDP};
230  bool udp_socket_running_{false};
231  bool start_udp_socket_();
232 };
233 
// Automation action that starts the parent VoiceAssistant in non-continuous mode.
// Carries a templatable std::string value named wake_word and a
// silence-detection setting.
234 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {
235  TEMPLATABLE_VALUE(std::string, wake_word);
236 
237  public:
 // Evaluates the wake word with the action arguments, passes it to the parent,
 // then requests a start with continuous = false and the stored
 // silence-detection setting.
238  void play(Ts... x) override {
239  this->parent_->set_wake_word(this->wake_word_.value(x...));
240  this->parent_->request_start(false, this->silence_detection_);
241  }
242 
243  void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; }
244 
245  protected:
 // NOTE(review): listing line 246 is elided; presumably it declares the
 // silence_detection_ member used above - confirm against the original header.
247 };
248 
249 template<typename... Ts> class StartContinuousAction : public Action<Ts...>, public Parented<VoiceAssistant> {
250  public:
251  void play(Ts... x) override { this->parent_->request_start(true, true); }
252 };
253 
254 template<typename... Ts> class StopAction : public Action<Ts...>, public Parented<VoiceAssistant> {
255  public:
256  void play(Ts... x) override { this->parent_->request_stop(); }
257 };
258 
259 template<typename... Ts> class IsRunningCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
260  public:
261  bool check(Ts... x) override { return this->parent_->is_running() || this->parent_->is_continuous(); }
262 };
263 
264 template<typename... Ts> class ConnectedCondition : public Condition<Ts...>, public Parented<VoiceAssistant> {
265  public:
266  bool check(Ts... x) override { return this->parent_->get_api_connection() != nullptr; }
267 };
268 
// Declaration of the global VoiceAssistant pointer. It is only declared in this
// header (extern), so the definition must live in another translation unit.
269 extern VoiceAssistant *global_voice_assistant; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
270 
271 } // namespace voice_assistant
272 } // namespace esphome
273 
274 #endif // USE_VOICE_ASSISTANT
void setup()
void loop()
void set_microphone(microphone::Microphone *mic)
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
uint16_t x
Definition: tt21100.cpp:17
Helper class to request loop() to be called as fast as possible.
Definition: helpers.h:603
api::APIConnection * get_api_connection() const
void set_wake_word(const std::string &wake_word)
void set_noise_suppression_level(uint8_t noise_suppression_level)
void set_volume_multiplier(float volume_multiplier)
Base class for all automation conditions.
Definition: automation.h:74
Trigger< std::string > * get_tts_start_trigger() const
const uint32_t flags
Definition: stm32flash.h:85
Trigger< std::string, std::string > * get_error_trigger() const
std::unique_ptr< RingBuffer > ring_buffer_
void set_speaker(speaker::Speaker *speaker)
void set_vad_threshold(uint8_t vad_threshold)
Trigger< std::string > * get_tts_end_trigger() const
void set_use_wake_word(bool use_wake_word)
void set_media_player(media_player::MediaPlayer *media_player)
void set_silence_detection(bool silence_detection)
This is a workaround until we can figure out a way to get the tflite-micro idf component code availab...
Definition: a01nyub.cpp:7
Trigger< std::string > * get_stt_end_trigger() const
Helper class to easily give an object a parent of type T.
Definition: helpers.h:515
bool state
Definition: fan.h:34