ESPHome  2024.12.2
voice_assistant.cpp
Go to the documentation of this file.
1 #include "voice_assistant.h"
2 
3 #ifdef USE_VOICE_ASSISTANT
4 
5 #include "esphome/core/log.h"
6 
7 #include <cinttypes>
8 #include <cstdio>
9 
10 namespace esphome {
11 namespace voice_assistant {
12 
13 static const char *const TAG = "voice_assistant";
14 
15 #ifdef SAMPLE_RATE_HZ
16 #undef SAMPLE_RATE_HZ
17 #endif
18 
19 static const size_t SAMPLE_RATE_HZ = 16000;
20 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
21 static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
22 static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
23 static const size_t RECEIVE_SIZE = 1024;
24 static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
25 
27 
29 
31  this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
32  if (this->socket_ == nullptr) {
33  ESP_LOGE(TAG, "Could not create socket");
34  this->mark_failed();
35  return false;
36  }
37  int enable = 1;
38  int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
39  if (err != 0) {
40  ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
41  // we can still continue
42  }
43  err = this->socket_->setblocking(false);
44  if (err != 0) {
45  ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
46  this->mark_failed();
47  return false;
48  }
49 
50 #ifdef USE_SPEAKER
51  if (this->speaker_ != nullptr) {
52  struct sockaddr_storage server;
53 
54  socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
55  if (sl == 0) {
56  ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
57  this->mark_failed();
58  return false;
59  }
60 
61  err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
62  if (err != 0) {
63  ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
64  this->mark_failed();
65  return false;
66  }
67  }
68 #endif
69  this->udp_socket_running_ = true;
70  return true;
71 }
72 
74  if (this->send_buffer_ != nullptr) {
75  return true; // Already allocated
76  }
77 
78 #ifdef USE_SPEAKER
79  if (this->speaker_ != nullptr) {
81  this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
82  if (this->speaker_buffer_ == nullptr) {
83  ESP_LOGW(TAG, "Could not allocate speaker buffer");
84  return false;
85  }
86  }
87 #endif
88 
90  this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
91  if (this->input_buffer_ == nullptr) {
92  ESP_LOGW(TAG, "Could not allocate input buffer");
93  return false;
94  }
95 
96 #ifdef USE_ESP_ADF
97  this->vad_instance_ = vad_create(VAD_MODE_4);
98 #endif
99 
100  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
101  if (this->ring_buffer_ == nullptr) {
102  ESP_LOGW(TAG, "Could not allocate ring buffer");
103  return false;
104  }
105 
107  this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
108  if (send_buffer_ == nullptr) {
109  ESP_LOGW(TAG, "Could not allocate send buffer");
110  return false;
111  }
112 
113  return true;
114 }
115 
117  if (this->send_buffer_ != nullptr) {
118  memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
119  }
120 
121  if (this->input_buffer_ != nullptr) {
122  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
123  }
124 
125  if (this->ring_buffer_ != nullptr) {
126  this->ring_buffer_->reset();
127  }
128 
129 #ifdef USE_SPEAKER
130  if (this->speaker_buffer_ != nullptr) {
131  memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
132 
133  this->speaker_buffer_size_ = 0;
134  this->speaker_buffer_index_ = 0;
135  this->speaker_bytes_received_ = 0;
136  }
137 #endif
138 }
139 
142  send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
143  this->send_buffer_ = nullptr;
144 
145  if (this->ring_buffer_ != nullptr) {
146  this->ring_buffer_.reset();
147  this->ring_buffer_ = nullptr;
148  }
149 
150 #ifdef USE_ESP_ADF
151  if (this->vad_instance_ != nullptr) {
152  vad_destroy(this->vad_instance_);
153  this->vad_instance_ = nullptr;
154  }
155 #endif
156 
158  input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
159  this->input_buffer_ = nullptr;
160 
161 #ifdef USE_SPEAKER
162  if (this->speaker_buffer_ != nullptr) {
164  speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
165  this->speaker_buffer_ = nullptr;
166  }
167 #endif
168 }
169 
171  this->conversation_id_ = "";
172  ESP_LOGD(TAG, "reset conversation ID");
173 }
174 
176  size_t bytes_read = 0;
177  if (this->mic_->is_running()) { // Read audio into input buffer
178  bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
179  if (bytes_read == 0) {
180  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
181  return 0;
182  }
183  // Write audio into ring buffer
184  this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
185  } else {
186  ESP_LOGD(TAG, "microphone not running");
187  }
188  return bytes_read;
189 }
190 
192  if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
194  if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
196  } else {
198  }
199  this->continuous_ = false;
200  this->signal_stop_();
201  this->clear_buffers_();
202  return;
203  }
204  switch (this->state_) {
205  case State::IDLE: {
206  if (this->continuous_ && this->desired_state_ == State::IDLE) {
207  this->idle_trigger_->trigger();
208 #ifdef USE_ESP_ADF
209  if (this->use_wake_word_) {
211  } else
212 #endif
213  {
215  }
216  } else {
217  this->high_freq_.stop();
218  }
219  break;
220  }
222  ESP_LOGD(TAG, "Starting Microphone");
223  if (!this->allocate_buffers_()) {
224  this->status_set_error("Failed to allocate buffers");
225  return;
226  }
227  if (this->status_has_error()) {
228  this->status_clear_error();
229  }
230  this->clear_buffers_();
231 
232  this->mic_->start();
233  this->high_freq_.start();
235  break;
236  }
238  if (this->mic_->is_running()) {
239  this->set_state_(this->desired_state_);
240  }
241  break;
242  }
243 #ifdef USE_ESP_ADF
244  case State::WAIT_FOR_VAD: {
245  this->read_microphone_();
246  ESP_LOGD(TAG, "Waiting for speech...");
248  break;
249  }
250  case State::WAITING_FOR_VAD: {
251  size_t bytes_read = this->read_microphone_();
252  if (bytes_read > 0) {
253  vad_state_t vad_state =
254  vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
255  if (vad_state == VAD_SPEECH) {
256  if (this->vad_counter_ < this->vad_threshold_) {
257  this->vad_counter_++;
258  } else {
259  ESP_LOGD(TAG, "VAD detected speech");
261 
262  // Reset for next time
263  this->vad_counter_ = 0;
264  }
265  } else {
266  if (this->vad_counter_ > 0) {
267  this->vad_counter_--;
268  }
269  }
270  }
271  break;
272  }
273 #endif
274  case State::START_PIPELINE: {
275  this->read_microphone_();
276  ESP_LOGD(TAG, "Requesting start...");
277  uint32_t flags = 0;
278  if (this->use_wake_word_)
280  if (this->silence_detection_)
282  api::VoiceAssistantAudioSettings audio_settings;
283  audio_settings.noise_suppression_level = this->noise_suppression_level_;
284  audio_settings.auto_gain = this->auto_gain_;
285  audio_settings.volume_multiplier = this->volume_multiplier_;
286 
288  msg.start = true;
289  msg.conversation_id = this->conversation_id_;
290  msg.flags = flags;
291  msg.audio_settings = audio_settings;
292  msg.wake_word_phrase = this->wake_word_;
293  this->wake_word_ = "";
294 
295  if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) {
296  ESP_LOGW(TAG, "Could not request start");
297  this->error_trigger_->trigger("not-connected", "Could not request start");
298  this->continuous_ = false;
300  break;
301  }
303  this->set_timeout("reset-conversation_id", this->conversation_timeout_,
304  [this]() { this->reset_conversation_id(); });
305  break;
306  }
308  this->read_microphone_();
309  break; // State changed when udp server port received
310  }
312  this->read_microphone_();
313  size_t available = this->ring_buffer_->available();
314  while (available >= SEND_BUFFER_SIZE) {
315  size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
316  if (this->audio_mode_ == AUDIO_MODE_API) {
318  msg.data.assign((char *) this->send_buffer_, read_bytes);
320  } else {
321  if (!this->udp_socket_running_) {
322  if (!this->start_udp_socket_()) {
324  break;
325  }
326  }
327  this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
328  sizeof(this->dest_addr_));
329  }
330  available = this->ring_buffer_->available();
331  }
332 
333  break;
334  }
335  case State::STOP_MICROPHONE: {
336  if (this->mic_->is_running()) {
337  this->mic_->stop();
339  } else {
340  this->set_state_(this->desired_state_);
341  }
342  break;
343  }
345  if (this->mic_->is_stopped()) {
346  this->set_state_(this->desired_state_);
347  }
348  break;
349  }
351  break; // State changed by events
352  }
354  bool playing = false;
355 #ifdef USE_SPEAKER
356  if (this->speaker_ != nullptr) {
357  ssize_t received_len = 0;
358  if (this->audio_mode_ == AUDIO_MODE_UDP) {
359  if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
360  received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
361  if (received_len > 0) {
362  this->speaker_buffer_index_ += received_len;
363  this->speaker_buffer_size_ += received_len;
364  this->speaker_bytes_received_ += received_len;
365  }
366  } else {
367  ESP_LOGD(TAG, "Receive buffer full");
368  }
369  }
370  // Build a small buffer of audio before sending to the speaker
371  bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
372  if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
373  this->write_speaker_();
374  if (this->wait_for_stream_end_) {
375  this->cancel_timeout("playing");
376  if (end_of_stream) {
377  ESP_LOGD(TAG, "End of audio stream received");
378  this->cancel_timeout("speaker-timeout");
380  }
381  break; // We dont want to timeout here as the STREAM_END event will take care of that.
382  }
383  playing = this->speaker_->is_running();
384  }
385 #endif
386 #ifdef USE_MEDIA_PLAYER
387  if (this->media_player_ != nullptr) {
389  }
390 #endif
391  if (playing) {
392  this->set_timeout("playing", 2000, [this]() {
393  this->cancel_timeout("speaker-timeout");
395 
397  msg.success = true;
399  });
400  }
401  break;
402  }
404 #ifdef USE_SPEAKER
405  if (this->speaker_ != nullptr) {
406  if (this->speaker_buffer_size_ > 0) {
407  this->write_speaker_();
408  break;
409  }
410  if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
411  break;
412  }
413  ESP_LOGD(TAG, "Speaker has finished outputting all audio");
414  this->speaker_->stop();
415  this->cancel_timeout("speaker-timeout");
416  this->cancel_timeout("playing");
417 
418  this->clear_buffers_();
419 
420  this->wait_for_stream_end_ = false;
421  this->stream_ended_ = false;
422 
424  }
425 #endif
427  break;
428  }
429  default:
430  break;
431  }
432 }
433 
434 #ifdef USE_SPEAKER
436  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
437  if (this->speaker_buffer_size_ > 0) {
438  size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
439  size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
440  if (written > 0) {
441  memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
442  this->speaker_buffer_size_ -= written;
443  this->speaker_buffer_index_ -= written;
444  this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
445  } else {
446  ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
447  }
448  }
449  }
450 }
451 #endif
452 
454  if (!subscribe) {
455  if (this->api_client_ == nullptr || client != this->api_client_) {
456  ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
457  return;
458  }
459  this->api_client_ = nullptr;
461  return;
462  }
463 
464  if (this->api_client_ != nullptr) {
465  ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
466  ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
467  ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
468  return;
469  }
470 
471  this->api_client_ = client;
473 }
474 
475 static const LogString *voice_assistant_state_to_string(State state) {
476  switch (state) {
477  case State::IDLE:
478  return LOG_STR("IDLE");
480  return LOG_STR("START_MICROPHONE");
482  return LOG_STR("STARTING_MICROPHONE");
483  case State::WAIT_FOR_VAD:
484  return LOG_STR("WAIT_FOR_VAD");
486  return LOG_STR("WAITING_FOR_VAD");
488  return LOG_STR("START_PIPELINE");
490  return LOG_STR("STARTING_PIPELINE");
492  return LOG_STR("STREAMING_MICROPHONE");
494  return LOG_STR("STOP_MICROPHONE");
496  return LOG_STR("STOPPING_MICROPHONE");
498  return LOG_STR("AWAITING_RESPONSE");
500  return LOG_STR("STREAMING_RESPONSE");
502  return LOG_STR("RESPONSE_FINISHED");
503  default:
504  return LOG_STR("UNKNOWN");
505  }
506 };
507 
509  State old_state = this->state_;
510  this->state_ = state;
511  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
512  LOG_STR_ARG(voice_assistant_state_to_string(state)));
513 }
514 
515 void VoiceAssistant::set_state_(State state, State desired_state) {
516  this->set_state_(state);
517  this->desired_state_ = desired_state;
518  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
519 }
520 
522  ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
523  this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
525 }
526 
528  if (this->state_ != State::STARTING_PIPELINE) {
529  this->signal_stop_();
530  return;
531  }
532 
533  ESP_LOGD(TAG, "Client started, streaming microphone");
534  this->audio_mode_ = AUDIO_MODE_API;
535 
536  if (this->mic_->is_running()) {
538  } else {
540  }
541 }
542 
// Select UDP audio mode and record the destination address and port for outgoing audio.
//
// Acts only while state_ is State::STARTING_PIPELINE; in any other state it calls
// signal_stop_() and returns without changing anything else.
//
// addr: socket address copied into dest_addr_ (sizeof(dest_addr_) bytes are read from it).
// port: port number written into the copied address in network byte order (htons).
543 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
544  if (this->state_ != State::STARTING_PIPELINE) {
545  this->signal_stop_();
546  return;
547  }
548 
549  ESP_LOGD(TAG, "Client started, streaming microphone");
550  this->audio_mode_ = AUDIO_MODE_UDP;
551 
 // Copy the whole address first, then patch the port field that matches its family.
552  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
553  if (this->dest_addr_.ss_family == AF_INET) {
554  ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
555  }
556 #if LWIP_IPV6
557  else if (this->dest_addr_.ss_family == AF_INET6) {
558  ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
559  }
560 #endif
561  else {
 // Unknown family: only a warning is logged; this path returns without calling signal_stop_().
562  ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
563  return;
564  }
565 
 // NOTE(review): both branches below are empty in this listing and the embedded line
 // numbers skip 567 and 569, so the statements that belong here appear to have been
 // dropped by the extraction - restore them from the original file.
566  if (this->mic_->is_running()) {
568  } else {
570  }
571 }
572 
// Record the options for a new run when an API client is connected and the state is IDLE.
//
// With no API client (api_client_ == nullptr) an error is logged, continuous_ is cleared
// and the call returns. When state_ is not State::IDLE the call changes nothing.
//
// continuous: stored in continuous_.
// silence_detection: stored in silence_detection_.
573 void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
574  if (this->api_client_ == nullptr) {
575  ESP_LOGE(TAG, "No API client connected");
 // NOTE(review): embedded line 576 is missing from this listing; a statement may have been dropped here.
577  this->continuous_ = false;
578  return;
579  }
580  if (this->state_ == State::IDLE) {
581  this->continuous_ = continuous;
582  this->silence_detection_ = silence_detection;
 // NOTE(review): the wake-word branch and the fallback block below are both empty in this
 // listing (embedded lines 585 and 589 are missing) - restore them from the original file.
583 #ifdef USE_ESP_ADF
584  if (this->use_wake_word_) {
586  } else
587 #endif
588  {
590  }
591  }
592 }
593 
595  this->continuous_ = false;
596 
597  switch (this->state_) {
598  case State::IDLE:
599  break;
602  case State::WAIT_FOR_VAD:
606  break;
609  this->signal_stop_();
611  break;
614  this->desired_state_ = State::IDLE;
615  break;
619  break; // Let the incoming audio stream finish then it will go to idle.
620  }
621 }
622 
624  memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
625  if (this->api_client_ == nullptr) {
626  return;
627  }
628  ESP_LOGD(TAG, "Signaling stop...");
630  msg.start = false;
632 }
633 
635  ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
636  switch (msg.event_type) {
638  ESP_LOGD(TAG, "Assist Pipeline running");
639  this->defer([this]() { this->start_trigger_->trigger(); });
640  break;
642  break;
644  ESP_LOGD(TAG, "Wake word detected");
645  this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
646  break;
647  }
649  ESP_LOGD(TAG, "STT started");
650  this->defer([this]() { this->listening_trigger_->trigger(); });
651  break;
653  std::string text;
654  for (auto arg : msg.data) {
655  if (arg.name == "text") {
656  text = std::move(arg.value);
657  }
658  }
659  if (text.empty()) {
660  ESP_LOGW(TAG, "No text in STT_END event");
661  return;
662  }
663  ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
664  this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
665  break;
666  }
668  ESP_LOGD(TAG, "Intent started");
669  this->defer([this]() { this->intent_start_trigger_->trigger(); });
670  break;
672  for (auto arg : msg.data) {
673  if (arg.name == "conversation_id") {
674  this->conversation_id_ = std::move(arg.value);
675  }
676  }
677  this->defer([this]() { this->intent_end_trigger_->trigger(); });
678  break;
679  }
681  std::string text;
682  for (auto arg : msg.data) {
683  if (arg.name == "text") {
684  text = std::move(arg.value);
685  }
686  }
687  if (text.empty()) {
688  ESP_LOGW(TAG, "No text in TTS_START event");
689  return;
690  }
691  ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
692  this->defer([this, text]() {
693  this->tts_start_trigger_->trigger(text);
694 #ifdef USE_SPEAKER
695  if (this->speaker_ != nullptr) {
696  this->speaker_->start();
697  }
698 #endif
699  });
700  break;
701  }
703  std::string url;
704  for (auto arg : msg.data) {
705  if (arg.name == "url") {
706  url = std::move(arg.value);
707  }
708  }
709  if (url.empty()) {
710  ESP_LOGW(TAG, "No url in TTS_END event");
711  return;
712  }
713  ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
714  this->defer([this, url]() {
715 #ifdef USE_MEDIA_PLAYER
716  if (this->media_player_ != nullptr) {
718  }
719 #endif
720  this->tts_end_trigger_->trigger(url);
721  });
723  this->set_state_(new_state, new_state);
724  break;
725  }
727  ESP_LOGD(TAG, "Assist Pipeline ended");
728  if (this->state_ == State::STREAMING_MICROPHONE) {
729  this->ring_buffer_->reset();
730 #ifdef USE_ESP_ADF
731  if (this->use_wake_word_) {
732  // No need to stop the microphone since we didn't use the speaker
734  } else
735 #endif
736  {
738  }
739  } else if (this->state_ == State::AWAITING_RESPONSE) {
740  // No TTS start event ("nevermind")
742  }
743  this->defer([this]() { this->end_trigger_->trigger(); });
744  break;
745  }
747  std::string code = "";
748  std::string message = "";
749  for (auto arg : msg.data) {
750  if (arg.name == "code") {
751  code = std::move(arg.value);
752  } else if (arg.name == "message") {
753  message = std::move(arg.value);
754  }
755  }
756  if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
757  // Don't change state here since either the "tts-end" or "run-end" events will do it.
758  return;
759  } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
760  // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
761  this->defer([this, code, message]() {
762  this->request_stop();
763  this->error_trigger_->trigger(code, message);
764  });
765  return;
766  }
767  ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
768  if (this->state_ != State::IDLE) {
769  this->signal_stop_();
771  }
772  this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
773  break;
774  }
776 #ifdef USE_SPEAKER
777  if (this->speaker_ != nullptr) {
778  this->wait_for_stream_end_ = true;
779  ESP_LOGD(TAG, "TTS stream start");
780  this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
781  }
782 #endif
783  break;
784  }
786 #ifdef USE_SPEAKER
787  if (this->speaker_ != nullptr) {
788  this->stream_ended_ = true;
789  ESP_LOGD(TAG, "TTS stream end");
790  }
791 #endif
792  break;
793  }
795  ESP_LOGD(TAG, "Starting STT by VAD");
796  this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
797  break;
799  ESP_LOGD(TAG, "STT by VAD end");
801  this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
802  break;
803  default:
804  ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
805  break;
806  }
807 }
808 
810 #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
811  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
812  if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
813  memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
814  this->speaker_buffer_index_ += msg.data.length();
815  this->speaker_buffer_size_ += msg.data.length();
816  this->speaker_bytes_received_ += msg.data.length();
817  ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
818  } else {
819  ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
820  }
821  }
822 #endif
823 }
824 
826  Timer timer = {
827  .id = msg.timer_id,
828  .name = msg.name,
829  .total_seconds = msg.total_seconds,
830  .seconds_left = msg.seconds_left,
831  .is_active = msg.is_active,
832  };
833  this->timers_[timer.id] = timer;
834  ESP_LOGD(TAG, "Timer Event");
835  ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
836  ESP_LOGD(TAG, " %s", timer.to_string().c_str());
837 
838  switch (msg.event_type) {
840  this->timer_started_trigger_->trigger(timer);
841  break;
843  this->timer_updated_trigger_->trigger(timer);
844  break;
846  this->timer_cancelled_trigger_->trigger(timer);
847  this->timers_.erase(timer.id);
848  break;
850  this->timer_finished_trigger_->trigger(timer);
851  this->timers_.erase(timer.id);
852  break;
853  }
854 
855  if (this->timers_.empty()) {
856  this->cancel_interval("timer-event");
857  this->timer_tick_running_ = false;
858  } else if (!this->timer_tick_running_) {
859  this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
860  this->timer_tick_running_ = true;
861  }
862 }
863 
865  std::vector<Timer> res;
866  res.reserve(this->timers_.size());
867  for (auto &pair : this->timers_) {
868  auto &timer = pair.second;
869  if (timer.is_active && timer.seconds_left > 0) {
870  timer.seconds_left--;
871  }
872  res.push_back(timer);
873  }
874  this->timer_tick_trigger_->trigger(res);
875 }
876 
878 #ifdef USE_MEDIA_PLAYER
879  if (this->media_player_ != nullptr) {
880  this->tts_start_trigger_->trigger(msg.text);
883  this->tts_end_trigger_->trigger(msg.media_id);
884  this->end_trigger_->trigger();
885  }
886 #endif
887 }
888 
889 VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
890 
891 } // namespace voice_assistant
892 } // namespace esphome
893 
894 #endif // USE_VOICE_ASSISTANT
bool is_running() const
Definition: speaker.h:61
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition: component.cpp:52
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27
std::unordered_map< std::string, Timer > timers_
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition: component.cpp:73
enums::VoiceAssistantTimerEvent event_type
Definition: api_pb2.h:1814
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition: socket.cpp:51
std::unique_ptr< socket::Socket > socket_
sa_family_t ss_family
Definition: headers.h:92
Trigger< std::string > * tts_start_trigger_
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition: component.cpp:69
bool cancel_interval(const std::string &name)
Cancel an interval function.
Definition: component.cpp:56
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition: component.cpp:130
Trigger< std::string > * tts_end_trigger_
T * allocate(size_t n)
Definition: helpers.h:690
uint32_t socklen_t
Definition: headers.h:97
VoiceAssistantAudioSettings audio_settings
Definition: api_pb2.h:1751
Trigger< std::vector< Timer > > * timer_tick_trigger_
enums::VoiceAssistantEvent event_type
Definition: api_pb2.h:1788
void client_subscription(api::APIConnection *client, bool subscribe)
virtual bool has_buffered_data() const =0
std::vector< VoiceAssistantEventData > data
Definition: api_pb2.h:1789
std::string get_client_combined_info() const
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95
bool status_has_error() const
Definition: component.cpp:150
void status_set_error(const char *message="unspecified")
Definition: component.cpp:159
media_player::MediaPlayer * media_player_
void start()
Start running the loop continuously.
Definition: helpers.cpp:670
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:676
const uint32_t flags
Definition: stm32flash.h:85
bool send_voice_assistant_announce_finished(const VoiceAssistantAnnounceFinished &msg)
void deallocate(T *p, size_t n)
Definition: helpers.h:709
std::unique_ptr< RingBuffer > ring_buffer_
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
void status_clear_error()
Definition: component.cpp:172
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
MediaPlayerCall & set_announcement(bool announce)
virtual size_t read(int16_t *buf, size_t len)=0
virtual void start()=0
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
virtual size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait)
Plays the provided audio data.
Definition: speaker.h:37
An STL allocator that uses SPI or internal RAM.
Definition: helpers.h:675
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:22
MediaPlayerCall & set_media_url(const std::string &url)
virtual void stop()=0
void on_event(const api::VoiceAssistantEventResponse &msg)
bool state
Definition: fan.h:34
Trigger< std::string > * stt_end_trigger_
void request_start(bool continuous, bool silence_detection)
Trigger< std::string, std::string > * error_trigger_
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.