esphome · kahrendt · Aug 5, 2024 · Aug 2, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/esphome/components/nabu/audio_decoder.cpp b/esphome/components/nabu/audio_decoder.cpp
@@ -110,21 +110,26 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
       }
     } else {
       // Try to decode more data
+      size_t bytes_available = this->input_ring_buffer_->available();
+      size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_);
+
+      if ((this->potentially_failed_count_ > 0) && (bytes_to_read == 0)) {
+        // Not enough data last time and nothing new to read (ring buffer empty or input buffer full), so just return
+        return AudioDecoderState::DECODING;
+      }
 
       // Shift unread data in input buffer to start
-      if ((this->input_buffer_length_ > 0) && (this->input_buffer_length_ < this->internal_buffer_size_)) {
+      if (this->input_buffer_length_ > 0) {
         memmove(this->input_buffer_, this->input_buffer_current_, this->input_buffer_length_);
       }
       this->input_buffer_current_ = this->input_buffer_;
 
       // read in new ring buffer data to fill the remaining input buffer
-      size_t bytes_available = this->input_ring_buffer_->available();
-      size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_);
       size_t bytes_read = 0;
 
       if (bytes_to_read > 0) {
-        uint8_t *new_mp3_data = this->input_buffer_ + this->input_buffer_length_;
-        bytes_read = this->input_ring_buffer_->read((void *) new_mp3_data, bytes_to_read);
+        uint8_t *new_audio_data = this->input_buffer_ + this->input_buffer_length_;
+        bytes_read = this->input_ring_buffer_->read((void *) new_audio_data, bytes_to_read);
 
         this->input_buffer_length_ += bytes_read;
       }
@@ -153,6 +158,8 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
       ++this->potentially_failed_count_;
     } else if (state == FileDecoderState::END_OF_FILE) {
       this->end_of_file_ = true;
+    } else if (state == FileDecoderState::FAILED) {
+      return AudioDecoderState::FAILED;
     } else {
       this->potentially_failed_count_ = 0;
     }
@@ -191,9 +198,7 @@ FileDecoderState AudioDecoder::decode_wav_() {
 
           printf("sample channels: %d\n", this->channels_.value());
           printf("sample rate: %" PRId32 "\n", this->sample_rate_.value());
-          // printf("number of samples: %d\n",
-          //        this->wav_decoder_->chunk_bytes_left() / (this->channels_.value() * (this->bits_per_sample.value()
-          //        / 8)));
+          printf("bits per sample: %d\n", this->sample_depth_.value());
           this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left();
           header_finished = true;
         } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) {
@@ -226,11 +231,10 @@ FileDecoderState AudioDecoder::decode_wav_() {
       this->wav_bytes_left_ -= bytes_to_write;
     }
 
-    return FileDecoderState::MORE_TO_PROCESS;
+    return FileDecoderState::IDLE;
   }
 
   return FileDecoderState::END_OF_FILE;
-  // return DecoderState::FINISHED;
 }
 
 FileDecoderState AudioDecoder::decode_mp3_() {
@@ -280,14 +284,14 @@ FileDecoderState AudioDecoder::decode_flac_() {
     // Header hasn't been read
     auto result = this->flac_decoder_->read_header(this->input_buffer_length_);
 
-    if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
-      return FileDecoderState::POTENTIALLY_FAILED;
-    }
-
     size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
     this->input_buffer_current_ += bytes_consumed;
     this->input_buffer_length_ = this->flac_decoder_->get_bytes_left();
 
+    if (result == flac::FLAC_DECODER_HEADER_OUT_OF_DATA) {
+      return FileDecoderState::POTENTIALLY_FAILED;
+    }
+
     if (result != flac::FLAC_DECODER_SUCCESS) {
       printf("failed to read flac header. Error: %d\n", result);
       return FileDecoderState::FAILED;
@@ -311,10 +315,11 @@ FileDecoderState AudioDecoder::decode_flac_() {
       this->flac_decoder_->decode_frame(this->input_buffer_length_, (int16_t *) this->output_buffer_, &output_samples);
 
   if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
-    // not an issue, just needs more data!
+    // Not an issue, just needs more data that we'll get next time.
     return FileDecoderState::POTENTIALLY_FAILED;
   } else if (result > flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
     // Serious error, can't recover
+    printf("FLAC Decoder Error %d\n", result);
     return FileDecoderState::FAILED;
   }
 

diff --git a/esphome/components/nabu/audio_pipeline.cpp b/esphome/components/nabu/audio_pipeline.cpp
@@ -9,7 +9,7 @@ namespace nabu {
 
 static const size_t QUEUE_COUNT = 10;
 
-static const size_t HTTP_BUFFER_SIZE = 32 * 1024;
+static const size_t HTTP_BUFFER_SIZE = 64 * 1024;
 static const size_t BUFFER_SIZE_SAMPLES = 32768;
 static const size_t BUFFER_SIZE_BYTES = BUFFER_SIZE_SAMPLES * sizeof(int16_t);
 
@@ -50,21 +50,23 @@ AudioPipeline::AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type)
   this->event_group_ = xEventGroupCreate();
 }
 
-void AudioPipeline::start(const std::string &uri, const std::string &task_name, UBaseType_t priority) {
-  this->common_start_(task_name, priority);
+void AudioPipeline::start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name,
+                          UBaseType_t priority) {
+  this->common_start_(target_sample_rate, task_name, priority);
 
   this->current_uri_ = uri;
   xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_HTTP);
 }
 
-void AudioPipeline::start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority) {
-  this->common_start_(task_name, priority);
+void AudioPipeline::start(media_player::MediaFile *media_file, uint32_t target_sample_rate,
+                          const std::string &task_name, UBaseType_t priority) {
+  this->common_start_(target_sample_rate, task_name, priority);
 
   this->current_media_file_ = media_file;
   xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_FILE);
 }
 
-void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t priority) {
+void AudioPipeline::common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority) {
   if (this->read_task_handle_ == nullptr) {
     this->read_task_handle_ =
         xTaskCreateStatic(AudioPipeline::read_task_, (task_name + "_read").c_str(), 8192, (void *) this, priority,
@@ -82,13 +84,33 @@ void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t prio
   }
 
   this->stop();
+
+  this->target_sample_rate_ = target_sample_rate;
 }
 
 AudioPipelineState AudioPipeline::get_state() {
   EventBits_t event_bits = xEventGroupGetBits(this->event_group_);
   if (!this->read_task_handle_ && !this->decode_task_handle_ && !this->resample_task_handle_) {
     return AudioPipelineState::STOPPED;
-  } else if (event_bits & (READER_MESSAGE_FINISHED | DECODER_MESSAGE_FINISHED | RESAMPLER_MESSAGE_FINISHED)) {
+  }
+
+  if ((event_bits & READER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, READER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_READING;
+  }
+
+  if ((event_bits & DECODER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, DECODER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_DECODING;
+  }
+
+  if ((event_bits & RESAMPLER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, RESAMPLER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_RESAMPLING;
+  }
+
+  if ((event_bits & READER_MESSAGE_FINISHED) && (event_bits & DECODER_MESSAGE_FINISHED) &&
+             (event_bits & RESAMPLER_MESSAGE_FINISHED)) {
     return AudioPipelineState::STOPPED;
   }
 
@@ -105,6 +127,15 @@ void AudioPipeline::stop() {
       true,                                                                               // Wait for all the bits,
       pdMS_TO_TICKS(200));  // Block temporarily before deleting each task
 
+  // Clear the ring buffer in the mixer; avoids playing incorrect audio when starting a new file while paused
+  CommandEvent command_event;
+  if (this->pipeline_type_ == AudioPipelineType::MEDIA) {
+    command_event.command = CommandEventType::CLEAR_MEDIA;
+  } else {
+    command_event.command = CommandEventType::CLEAR_ANNOUNCEMENT;
+  }
+  this->mixer_->send_command(&command_event);
+
   xEventGroupClearBits(this->event_group_, ALL_BITS);
   this->reset_ring_buffers();
 }
@@ -187,8 +218,9 @@ void AudioPipeline::decode_task_(void *params) {
     xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED);
 
     {
-      AudioDecoder decoder = AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(),
-                                          this_pipeline->decoded_ring_buffer_.get(), BUFFER_SIZE_BYTES);
+      AudioDecoder decoder =
+          AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(), this_pipeline->decoded_ring_buffer_.get(),
+                       HTTP_BUFFER_SIZE);  // previously BUFFER_SIZE_BYTES
       decoder.start(this_pipeline->current_media_file_type_);
 
       bool has_stream_info = false;
@@ -256,7 +288,11 @@ void AudioPipeline::resample_task_(void *params) {
       AudioResampler resampler =
           AudioResampler(this_pipeline->decoded_ring_buffer_.get(), output_ring_buffer, BUFFER_SIZE_SAMPLES);
 
-      resampler.start(this_pipeline->current_stream_info_);
+      if (!resampler.start(this_pipeline->current_stream_info_, this_pipeline->target_sample_rate_)) {
+        // Unsupported incoming audio stream
+        xEventGroupSetBits(this_pipeline->event_group_,
+                           EventGroupBits::RESAMPLER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+      }
 
       while (true) {
         event_bits = xEventGroupGetBits(this_pipeline->event_group_);

diff --git a/esphome/components/nabu/audio_pipeline.h b/esphome/components/nabu/audio_pipeline.h
@@ -28,20 +28,19 @@ enum class AudioPipelineType : uint8_t {
 };
 
 enum class AudioPipelineState : uint8_t {
-  STARTING,
-  STARTED,
   PLAYING,
-  PAUSED,
-  STOPPING,
   STOPPED,
+  ERROR_READING,
+  ERROR_DECODING,
+  ERROR_RESAMPLING,
 };
 
 class AudioPipeline {
  public:
   AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type);
 
-  void start(const std::string &uri, const std::string &task_name, UBaseType_t priority = 1);
-  void start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority = 1);
+  void start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority = 1);
+  void start(media_player::MediaFile *media_file, uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority = 1);
 
   void stop();
 
@@ -50,7 +49,9 @@ class AudioPipeline {
   void reset_ring_buffers();
 
  protected:
-  void common_start_(const std::string &task_name, UBaseType_t priority);
+  void common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority);
+
+  uint32_t target_sample_rate_;
 
   AudioMixer *mixer_;
 

diff --git a/esphome/components/nabu/audio_resampler.cpp b/esphome/components/nabu/audio_resampler.cpp
@@ -46,9 +46,11 @@ AudioResampler::~AudioResampler() {
     resampleFree(this->resampler_);
     this->resampler_ = nullptr;
   }
+
+  // dsps_fird_s16_aexx_free(&this->fir_filter_);
 }
 
-void AudioResampler::start(media_player::StreamInfo &stream_info) {
+bool AudioResampler::start(media_player::StreamInfo &stream_info, uint32_t target_sample_rate) {
   this->stream_info_ = stream_info;
 
   this->input_buffer_current_ = this->input_buffer_;
@@ -63,33 +65,40 @@ void AudioResampler::start(media_player::StreamInfo &stream_info) {
 
   this->needs_mono_to_stereo_ = (stream_info.channels != 2);
 
+  if ((stream_info.channels > 2) || (stream_info_.bits_per_sample != 16)) {
+    // TODO: Make these values configurable
+    return false;
+  }
+
   if (stream_info.channels > 0) {
     this->channel_factor_ = 2 / stream_info.channels;
     printf("Converting %d channels to 2 channels\n", stream_info.channels);
   }
-  constexpr float resample_rate = 16000.0f;
-  if (stream_info.sample_rate != 16000) {
-    if (stream_info.sample_rate == 48000) {
-      // Special case, we can do this a lot faster with esp-dsp code!
-      const uint8_t decimation = 48000 / 16000;
-      const float fir_out_offset = 0;  //((FIR_FILTER_LENGTH / decimation / 2) - 1);
-
-      int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH,
-                                                          (float) 0.5 / decimation);
-      // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH);
-      dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH,
-                         decimation, fir_out_offset, -shift);
-      this->decimation_filter_ = true;
-      this->needs_resampling_ = true;
-      // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t));
-    } else {
+
+  if (stream_info.sample_rate != target_sample_rate) {
+    // if (stream_info.sample_rate == 48000) {
+    //   // Special case, we can do this a lot faster with esp-dsp code!
+    //   const uint8_t decimation = 48000 / 16000;
+    //   const float fir_out_offset = 0;  //((FIR_FILTER_LENGTH / decimation / 2) - 1);
+
+    //   int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t)
+    //   FIR_FILTER_LENGTH,
+    //                                                       (float) 0.5 / decimation);
+    //   // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH);
+    //   dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH,
+    //                      decimation, fir_out_offset, -shift);
+    //   this->decimation_filter_ = true;
+    //   this->needs_resampling_ = true;
+    //   // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t));
+    // } else
+    {
       int flags = 0;
 
       this->needs_resampling_ = true;
 
-      this->sample_ratio_ = resample_rate / static_cast<float>(stream_info.sample_rate);
+      this->sample_ratio_ = static_cast<float>(target_sample_rate) / static_cast<float>(stream_info.sample_rate);
 
-      printf("Resampling from %d Hz to 16000 Hz\n", stream_info.sample_rate);
+      printf("Resampling from %d Hz to %d Hz\n", stream_info.sample_rate, target_sample_rate);
 
       if (this->sample_ratio_ < 1.0) {
         this->lowpass_ratio_ -= (10.24 / 16);
@@ -137,6 +146,8 @@ void AudioResampler::start(media_player::StreamInfo &stream_info) {
   } else {
     this->needs_resampling_ = false;
   }
+
+  return true;
 }
 
 AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
@@ -165,6 +176,19 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
   // Refill input buffer
   //////
 
+  // Depending on whether we are converting mono to stereo or upsampling, we may need to restrict how many input
+  // samples we transfer
+  size_t max_input_samples = this->internal_buffer_samples_;
+
+  // Mono input is expanded to stereo, so halve the input sample budget (the divisor is 1 for stereo input)
+  max_input_samples /= (2 / this->stream_info_.channels);
+
+  if (this->sample_ratio_ > 1.0) {
+    // Upsampling -> reduce by a factor of the ceiling of sample_ratio_
+    uint32_t upsampling_factor = std::ceil(this->sample_ratio_);
+    max_input_samples /= upsampling_factor;
+  }
+
   // Move old data to the start of the buffer
   if (this->input_buffer_length_ > 0) {
     memmove((void *) this->input_buffer_, (void *) this->input_buffer_current_, this->input_buffer_length_);
@@ -173,8 +197,7 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
 
   // Copy new data to the end of the of the buffer
   size_t bytes_available = this->input_ring_buffer_->available();
-  size_t bytes_to_read =
-      std::min(bytes_available, this->internal_buffer_samples_ * sizeof(int16_t) - this->input_buffer_length_);
+  size_t bytes_to_read = std::min(bytes_available, max_input_samples * sizeof(int16_t) - this->input_buffer_length_);
 
   if (bytes_to_read > 0) {
     int16_t *new_input_buffer_data = this->input_buffer_ + this->input_buffer_length_ / sizeof(int16_t);
@@ -207,7 +230,8 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
         }
       } else {
         // Interleaved stereo samples
-        // TODO: This doesn't sound correct! I need to use separate filters for each channel so the delay line isn't mixed
+        // TODO: This doesn't sound correct! I need to use separate filters for each channel so the delay line isn't
+        // mixed
         size_t available_samples = this->input_buffer_length_ / sizeof(int16_t);
         for (int i = 0; i < available_samples / 2; ++i) {
           // split interleaved samples into two separate streams
@@ -244,15 +268,12 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
 
         size_t samples_read = this->input_buffer_length_ / sizeof(int16_t);
 
-        // This is inefficient! It reconverts any samples that weren't used in the previous resampling run
         for (int i = 0; i < samples_read; ++i) {
           this->float_input_buffer_[i] = static_cast<float>(this->input_buffer_[i]) / 32768.0f;
         }
 
         size_t frames_read = samples_read / this->stream_info_.channels;
 
-        // The low pass filter seems to be causing glitches... probably because samples are repeated due to the above
-        // ineffeciency!
         if (this->pre_filter_) {
           for (int i = 0; i < this->stream_info_.channels; ++i) {
             biquad_apply_buffer(&this->lowpass_[i][0], this->float_input_buffer_ + i, frames_read,
@@ -363,6 +384,7 @@ int8_t AudioResampler::generate_q15_fir_coefficients_(int16_t *fir_coeffs, const
   }
 
   free(fir_window);
+  free(float_coeffs);
 
   return shift;
 }