From dd7fd38706f9c21e352735c59cb5991111b75285 Mon Sep 17 00:00:00 2001
From: Kevin Ahrendt <kevin.ahrendt@nabucasa.com>
Date: Mon, 5 Aug 2024 11:37:27 -0400
Subject: [PATCH] Bugfixes/Improvements for media player (#37)

* quickly disable new special case fir filter

* fix pipeline stopped vs playing logic

* improve robustness of flac decoding

* Increase http buffer size and allow setting output sample rate

* fix flac sync errors after running out of data

* limit http transfer size per loop

* disable task debug logging

* revert min http transfer size

* always reset input_buffer_current

* update TODOs and describe media player framework

* read mute status from dac at setup

* raise error if bits per sample is too high for our optimized version

* block unprocessable streams

* clear appropriate mixer buffer when stopping a pipeline

* log which pipeline element has an error

* never transfer more input samples than can be processed in 1 step

* correctly account for mono to stereo adjustment

* check for upsampling before scaling max input samples

* fix memory leak
---
 esphome/components/nabu/audio_decoder.cpp     |  35 +++---
 esphome/components/nabu/audio_pipeline.cpp    |  56 +++++++--
 esphome/components/nabu/audio_pipeline.h      |  15 +--
 esphome/components/nabu/audio_resampler.cpp   |  72 +++++++----
 esphome/components/nabu/audio_resampler.h     |   6 +-
 esphome/components/nabu/flac_decoder.cpp      |  73 +++++++----
 esphome/components/nabu/flac_decoder.h        |  33 ++---
 esphome/components/nabu/media_player.py       |   3 +
 esphome/components/nabu/nabu_media_player.cpp | 115 ++++++++++++++++--
 esphome/components/nabu/nabu_media_player.h   |   7 +-
 10 files changed, 302 insertions(+), 113 deletions(-)

diff --git a/esphome/components/nabu/audio_decoder.cpp b/esphome/components/nabu/audio_decoder.cpp
index 11720a5..7684fc8 100644
--- a/esphome/components/nabu/audio_decoder.cpp
+++ b/esphome/components/nabu/audio_decoder.cpp
@@ -110,21 +110,26 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
       }
     } else {
       // Try to decode more data
+      size_t bytes_available = this->input_ring_buffer_->available();
+      size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_);
+
+      if ((this->potentially_failed_count_ > 0) && (bytes_to_read == 0)) {
+        // We didn't have enough data last time, and we have no new data, so just return
+        return AudioDecoderState::DECODING;
+      }
 
       // Shift unread data in input buffer to start
-      if ((this->input_buffer_length_ > 0) && (this->input_buffer_length_ < this->internal_buffer_size_)) {
+      if (this->input_buffer_length_ > 0) {
         memmove(this->input_buffer_, this->input_buffer_current_, this->input_buffer_length_);
       }
       this->input_buffer_current_ = this->input_buffer_;
 
       // read in new ring buffer data to fill the remaining input buffer
-      size_t bytes_available = this->input_ring_buffer_->available();
-      size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_);
       size_t bytes_read = 0;
 
       if (bytes_to_read > 0) {
-        uint8_t *new_mp3_data = this->input_buffer_ + this->input_buffer_length_;
-        bytes_read = this->input_ring_buffer_->read((void *) new_mp3_data, bytes_to_read);
+        uint8_t *new_audio_data = this->input_buffer_ + this->input_buffer_length_;
+        bytes_read = this->input_ring_buffer_->read((void *) new_audio_data, bytes_to_read);
 
         this->input_buffer_length_ += bytes_read;
       }
@@ -153,6 +158,8 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) {
       ++this->potentially_failed_count_;
     } else if (state == FileDecoderState::END_OF_FILE) {
       this->end_of_file_ = true;
+    } else if (state == FileDecoderState::FAILED) {
+      return AudioDecoderState::FAILED;
     } else {
       this->potentially_failed_count_ = 0;
     }
@@ -191,9 +198,7 @@ FileDecoderState AudioDecoder::decode_wav_() {
 
           printf("sample channels: %d\n", this->channels_.value());
           printf("sample rate: %" PRId32 "\n", this->sample_rate_.value());
-          // printf("number of samples: %d\n",
-          //        this->wav_decoder_->chunk_bytes_left() / (this->channels_.value() * (this->bits_per_sample.value()
-          //        / 8)));
+          printf("bits per sample: %d\n", this->sample_depth_.value());
           this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left();
           header_finished = true;
         } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) {
@@ -226,11 +231,10 @@ FileDecoderState AudioDecoder::decode_wav_() {
       this->wav_bytes_left_ -= bytes_to_write;
     }
 
-    return FileDecoderState::MORE_TO_PROCESS;
+    return FileDecoderState::IDLE;
   }
 
   return FileDecoderState::END_OF_FILE;
-  // return DecoderState::FINISHED;
 }
 
 FileDecoderState AudioDecoder::decode_mp3_() {
@@ -280,14 +284,14 @@ FileDecoderState AudioDecoder::decode_flac_() {
     // Header hasn't been read
     auto result = this->flac_decoder_->read_header(this->input_buffer_length_);
 
-    if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
-      return FileDecoderState::POTENTIALLY_FAILED;
-    }
-
     size_t bytes_consumed = this->flac_decoder_->get_bytes_index();
     this->input_buffer_current_ += bytes_consumed;
     this->input_buffer_length_ = this->flac_decoder_->get_bytes_left();
 
+    if (result == flac::FLAC_DECODER_HEADER_OUT_OF_DATA) {
+      return FileDecoderState::POTENTIALLY_FAILED;
+    }
+
     if (result != flac::FLAC_DECODER_SUCCESS) {
       printf("failed to read flac header. Error: %d\n", result);
       return FileDecoderState::FAILED;
@@ -311,10 +315,11 @@ FileDecoderState AudioDecoder::decode_flac_() {
       this->flac_decoder_->decode_frame(this->input_buffer_length_, (int16_t *) this->output_buffer_, &output_samples);
 
   if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
-    // not an issue, just needs more data!
+    // Not an issue, just needs more data that we'll get next time.
     return FileDecoderState::POTENTIALLY_FAILED;
   } else if (result > flac::FLAC_DECODER_ERROR_OUT_OF_DATA) {
     // Serious error, can't recover
+    printf("FLAC Decoder Error %d\n", result);
     return FileDecoderState::FAILED;
   }
 
diff --git a/esphome/components/nabu/audio_pipeline.cpp b/esphome/components/nabu/audio_pipeline.cpp
index f95d270..e9520b1 100644
--- a/esphome/components/nabu/audio_pipeline.cpp
+++ b/esphome/components/nabu/audio_pipeline.cpp
@@ -9,7 +9,7 @@ namespace nabu {
 
 static const size_t QUEUE_COUNT = 10;
 
-static const size_t HTTP_BUFFER_SIZE = 32 * 1024;
+static const size_t HTTP_BUFFER_SIZE = 64 * 1024;
 static const size_t BUFFER_SIZE_SAMPLES = 32768;
 static const size_t BUFFER_SIZE_BYTES = BUFFER_SIZE_SAMPLES * sizeof(int16_t);
 
@@ -50,21 +50,23 @@ AudioPipeline::AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type)
   this->event_group_ = xEventGroupCreate();
 }
 
-void AudioPipeline::start(const std::string &uri, const std::string &task_name, UBaseType_t priority) {
-  this->common_start_(task_name, priority);
+void AudioPipeline::start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name,
+                          UBaseType_t priority) {
+  this->common_start_(target_sample_rate, task_name, priority);
 
   this->current_uri_ = uri;
   xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_HTTP);
 }
 
-void AudioPipeline::start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority) {
-  this->common_start_(task_name, priority);
+void AudioPipeline::start(media_player::MediaFile *media_file, uint32_t target_sample_rate,
+                          const std::string &task_name, UBaseType_t priority) {
+  this->common_start_(target_sample_rate, task_name, priority);
 
   this->current_media_file_ = media_file;
   xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_FILE);
 }
 
-void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t priority) {
+void AudioPipeline::common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority) {
   if (this->read_task_handle_ == nullptr) {
     this->read_task_handle_ =
         xTaskCreateStatic(AudioPipeline::read_task_, (task_name + "_read").c_str(), 8192, (void *) this, priority,
@@ -82,13 +84,33 @@ void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t prio
   }
 
   this->stop();
+
+  this->target_sample_rate_ = target_sample_rate;
 }
 
 AudioPipelineState AudioPipeline::get_state() {
   EventBits_t event_bits = xEventGroupGetBits(this->event_group_);
   if (!this->read_task_handle_ && !this->decode_task_handle_ && !this->resample_task_handle_) {
     return AudioPipelineState::STOPPED;
-  } else if (event_bits & (READER_MESSAGE_FINISHED | DECODER_MESSAGE_FINISHED | RESAMPLER_MESSAGE_FINISHED)) {
+  }
+  
+  if ((event_bits & READER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, READER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_READING;
+  }
+  
+  if ((event_bits & DECODER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, DECODER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_DECODING;
+  }
+
+  if ((event_bits & RESAMPLER_MESSAGE_ERROR)) {
+    xEventGroupClearBits(this->event_group_, RESAMPLER_MESSAGE_ERROR);
+    return AudioPipelineState::ERROR_RESAMPLING;
+  }
+
+  if ((event_bits & READER_MESSAGE_FINISHED) && (event_bits & DECODER_MESSAGE_FINISHED) &&
+             (event_bits & RESAMPLER_MESSAGE_FINISHED)) {
     return AudioPipelineState::STOPPED;
   }
 
@@ -105,6 +127,15 @@ void AudioPipeline::stop() {
       true,                                                                               // Wait for all the bits,
       pdMS_TO_TICKS(200));  // Block temporarily before deleting each task
 
+  // Clear the ring buffer in the mixer; avoids playing incorrect audio when starting a new file while paused
+  CommandEvent command_event;
+  if (this->pipeline_type_ == AudioPipelineType::MEDIA) {
+    command_event.command = CommandEventType::CLEAR_MEDIA;
+  } else {
+    command_event.command = CommandEventType::CLEAR_ANNOUNCEMENT;
+  }
+  this->mixer_->send_command(&command_event);
+
   xEventGroupClearBits(this->event_group_, ALL_BITS);
   this->reset_ring_buffers();
 }
@@ -187,8 +218,9 @@ void AudioPipeline::decode_task_(void *params) {
     xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED);
 
     {
-      AudioDecoder decoder = AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(),
-                                          this_pipeline->decoded_ring_buffer_.get(), BUFFER_SIZE_BYTES);
+      AudioDecoder decoder =
+          AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(), this_pipeline->decoded_ring_buffer_.get(),
+                       HTTP_BUFFER_SIZE);  // BUFFER_SIZE_BYTES);
       decoder.start(this_pipeline->current_media_file_type_);
 
       bool has_stream_info = false;
@@ -256,7 +288,11 @@ void AudioPipeline::resample_task_(void *params) {
       AudioResampler resampler =
           AudioResampler(this_pipeline->decoded_ring_buffer_.get(), output_ring_buffer, BUFFER_SIZE_SAMPLES);
 
-      resampler.start(this_pipeline->current_stream_info_);
+      if (!resampler.start(this_pipeline->current_stream_info_, this_pipeline->target_sample_rate_)) {
+        // Unsupported incoming audio stream
+        xEventGroupSetBits(this_pipeline->event_group_,
+                           EventGroupBits::RESAMPLER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP);
+      }
 
       while (true) {
         event_bits = xEventGroupGetBits(this_pipeline->event_group_);
diff --git a/esphome/components/nabu/audio_pipeline.h b/esphome/components/nabu/audio_pipeline.h
index 4b1996f..20a8d91 100644
--- a/esphome/components/nabu/audio_pipeline.h
+++ b/esphome/components/nabu/audio_pipeline.h
@@ -28,20 +28,19 @@ enum class AudioPipelineType : uint8_t {
 };
 
 enum class AudioPipelineState : uint8_t {
-  STARTING,
-  STARTED,
   PLAYING,
-  PAUSED,
-  STOPPING,
   STOPPED,
+  ERROR_READING,
+  ERROR_DECODING,
+  ERROR_RESAMPLING,
 };
 
 class AudioPipeline {
  public:
   AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type);
 
-  void start(const std::string &uri, const std::string &task_name, UBaseType_t priority = 1);
-  void start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority = 1);
+  void start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority = 1);
+  void start(media_player::MediaFile *media_file, uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority = 1);
 
   void stop();
 
@@ -50,7 +49,9 @@ class AudioPipeline {
   void reset_ring_buffers();
 
  protected:
-  void common_start_(const std::string &task_name, UBaseType_t priority);
+  void common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority);
+
+  uint32_t target_sample_rate_;
 
   AudioMixer *mixer_;
 
diff --git a/esphome/components/nabu/audio_resampler.cpp b/esphome/components/nabu/audio_resampler.cpp
index 4860c28..0203cc6 100644
--- a/esphome/components/nabu/audio_resampler.cpp
+++ b/esphome/components/nabu/audio_resampler.cpp
@@ -46,9 +46,11 @@ AudioResampler::~AudioResampler() {
     resampleFree(this->resampler_);
     this->resampler_ = nullptr;
   }
+
+  // dsps_fird_s16_aexx_free(&this->fir_filter_);
 }
 
-void AudioResampler::start(media_player::StreamInfo &stream_info) {
+bool AudioResampler::start(media_player::StreamInfo &stream_info, uint32_t target_sample_rate) {
   this->stream_info_ = stream_info;
 
   this->input_buffer_current_ = this->input_buffer_;
@@ -63,33 +65,40 @@ void AudioResampler::start(media_player::StreamInfo &stream_info) {
 
   this->needs_mono_to_stereo_ = (stream_info.channels != 2);
 
+  if ((stream_info.channels > 2) || (stream_info_.bits_per_sample != 16)) {
+    // TODO: Make these values configurable
+    return false;
+  }
+
   if (stream_info.channels > 0) {
     this->channel_factor_ = 2 / stream_info.channels;
     printf("Converting %d channels to 2 channels\n", stream_info.channels);
   }
-  constexpr float resample_rate = 16000.0f;
-  if (stream_info.sample_rate != 16000) {
-    if (stream_info.sample_rate == 48000) {
-      // Special case, we can do this a lot faster with esp-dsp code!
-      const uint8_t decimation = 48000 / 16000;
-      const float fir_out_offset = 0;  //((FIR_FILTER_LENGTH / decimation / 2) - 1);
-
-      int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH,
-                                                          (float) 0.5 / decimation);
-      // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH);
-      dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH,
-                         decimation, fir_out_offset, -shift);
-      this->decimation_filter_ = true;
-      this->needs_resampling_ = true;
-      // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t));
-    } else {
+
+  if (stream_info.sample_rate != target_sample_rate) {
+    // if (stream_info.sample_rate == 48000) {
+    //   // Special case, we can do this a lot faster with esp-dsp code!
+    //   const uint8_t decimation = 48000 / 16000;
+    //   const float fir_out_offset = 0;  //((FIR_FILTER_LENGTH / decimation / 2) - 1);
+
+    //   int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t)
+    //   FIR_FILTER_LENGTH,
+    //                                                       (float) 0.5 / decimation);
+    //   // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH);
+    //   dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH,
+    //                      decimation, fir_out_offset, -shift);
+    //   this->decimation_filter_ = true;
+    //   this->needs_resampling_ = true;
+    //   // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t));
+    // } else
+    {
       int flags = 0;
 
       this->needs_resampling_ = true;
 
-      this->sample_ratio_ = resample_rate / static_cast<float>(stream_info.sample_rate);
+      this->sample_ratio_ = static_cast<float>(target_sample_rate) / static_cast<float>(stream_info.sample_rate);
 
-      printf("Resampling from %d Hz to 16000 Hz\n", stream_info.sample_rate);
+      printf("Resampling from %d Hz to %d Hz\n", stream_info.sample_rate, target_sample_rate);
 
       if (this->sample_ratio_ < 1.0) {
         this->lowpass_ratio_ -= (10.24 / 16);
@@ -137,6 +146,8 @@ void AudioResampler::start(media_player::StreamInfo &stream_info) {
   } else {
     this->needs_resampling_ = false;
   }
+
+  return true;
 }
 
 AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
@@ -165,6 +176,19 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
   // Refill input buffer
   //////
 
+  // Depending on if we are converting mono to stereo or if we are upsampling, we may need to restrict how many input
+  // samples we transfer
+  size_t max_input_samples = this->internal_buffer_samples_;
+
+  // Mono to stereo -> cut in half
+  max_input_samples /= (2 / this->stream_info_.channels);
+
+  if (this->sample_ratio_ > 1.0) {
+    // Upsampling -> reduce by a factor of the ceiling of sample_ratio_
+    uint32_t upsampling_factor = std::ceil(this->sample_ratio_);
+    max_input_samples /= upsampling_factor;
+  }
+
   // Move old data to the start of the buffer
   if (this->input_buffer_length_ > 0) {
     memmove((void *) this->input_buffer_, (void *) this->input_buffer_current_, this->input_buffer_length_);
@@ -173,8 +197,7 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
 
   // Copy new data to the end of the of the buffer
   size_t bytes_available = this->input_ring_buffer_->available();
-  size_t bytes_to_read =
-      std::min(bytes_available, this->internal_buffer_samples_ * sizeof(int16_t) - this->input_buffer_length_);
+  size_t bytes_to_read = std::min(bytes_available, max_input_samples * sizeof(int16_t) - this->input_buffer_length_);
 
   if (bytes_to_read > 0) {
     int16_t *new_input_buffer_data = this->input_buffer_ + this->input_buffer_length_ / sizeof(int16_t);
@@ -207,7 +230,8 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
         }
       } else {
         // Interleaved stereo samples
-        // TODO: This doesn't sound correct! I need to use separate filters for each channel so the delay line isn't mixed
+        // TODO: This doesn't sound correct! I need to use separate filters for each channel so the delay line isn't
+        // mixed
         size_t available_samples = this->input_buffer_length_ / sizeof(int16_t);
         for (int i = 0; i < available_samples / 2; ++i) {
           // split interleaved samples into two separate streams
@@ -244,15 +268,12 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) {
 
         size_t samples_read = this->input_buffer_length_ / sizeof(int16_t);
 
-        // This is inefficient! It reconverts any samples that weren't used in the previous resampling run
         for (int i = 0; i < samples_read; ++i) {
           this->float_input_buffer_[i] = static_cast<float>(this->input_buffer_[i]) / 32768.0f;
         }
 
         size_t frames_read = samples_read / this->stream_info_.channels;
 
-        // The low pass filter seems to be causing glitches... probably because samples are repeated due to the above
-        // ineffeciency!
         if (this->pre_filter_) {
           for (int i = 0; i < this->stream_info_.channels; ++i) {
             biquad_apply_buffer(&this->lowpass_[i][0], this->float_input_buffer_ + i, frames_read,
@@ -363,6 +384,7 @@ int8_t AudioResampler::generate_q15_fir_coefficients_(int16_t *fir_coeffs, const
   }
 
   free(fir_window);
+  free(float_coeffs);
 
   return shift;
 }
diff --git a/esphome/components/nabu/audio_resampler.h b/esphome/components/nabu/audio_resampler.h
index 1176c1a..aa36086 100644
--- a/esphome/components/nabu/audio_resampler.h
+++ b/esphome/components/nabu/audio_resampler.h
@@ -30,7 +30,11 @@ class AudioResampler {
                  size_t internal_buffer_samples);
   ~AudioResampler();
 
-  void start(media_player::StreamInfo &stream_info);
+  /// @brief Sets up the various bits necessary to resample
+  /// @param stream_info the incoming sample rate, bits per sample, and number of channels
+  /// @param target_sample_rate the necessary sample rate to convert to
+  /// @return True if it can convert the incoming stream, false otherwise
+  bool start(media_player::StreamInfo &stream_info, uint32_t target_sample_rate);
 
   AudioResamplerState resample(bool stop_gracefully);
 
diff --git a/esphome/components/nabu/flac_decoder.cpp b/esphome/components/nabu/flac_decoder.cpp
index 976ab3d..8fe0bea 100644
--- a/esphome/components/nabu/flac_decoder.cpp
+++ b/esphome/components/nabu/flac_decoder.cpp
@@ -13,30 +13,32 @@ namespace flac {
 FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) {
   this->buffer_index_ = 0;
   this->bytes_left_ = buffer_length;
+  this->bit_buffer_ = 0;
+  this->bit_buffer_length_ = 0;
 
-  if (this->out_of_data_) {
-    return FLAC_DECODER_ERROR_OUT_OF_DATA;
-  }
+  this->out_of_data_ = (buffer_length == 0);
 
-  // File must start with 'fLaC'
-  if (this->read_uint(32) != FLAC_MAGIC_NUMBER) {
-    return FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER;
+  if (!this->partial_header_read_) {
+    // File must start with 'fLaC'
+    if (this->read_uint(32) != FLAC_MAGIC_NUMBER) {
+      return FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER;
+    }
   }
 
-  // Read header blocks
-  bool last = false;
-  uint32_t type = 0;
-  uint32_t length = 0;
+  while (!this->partial_header_last_ || (this->partial_header_length_ > 0)) {
+    if (this->bytes_left_ == 0) {
+      // We'll try to finish reading it once more data is loaded
+      this->partial_header_read_ = true;
+      return FLAC_DECODER_HEADER_OUT_OF_DATA;
+    }
 
-  while (!last) {
-    if (this->out_of_data_) {
-      return FLAC_DECODER_ERROR_OUT_OF_DATA;
+    if (this->partial_header_length_ == 0) {
+      this->partial_header_last_ = this->read_uint(1) != 0;
+      this->partial_header_type_ = this->read_uint(7);
+      this->partial_header_length_ = this->read_uint(24);
     }
 
-    last = this->read_uint(1) != 0;
-    type = this->read_uint(7);
-    length = this->read_uint(24);
-    if (type == 0) {
+    if (this->partial_header_type_ == 0) {
       // Stream info block
       this->min_block_size_ = this->read_uint(16);
       this->max_block_size_ = this->read_uint(16);
@@ -48,24 +50,31 @@ FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) {
       this->sample_depth_ = this->read_uint(5) + 1;
       this->num_samples_ = this->read_uint(36);
       this->read_uint(128);
+
+      this->partial_header_length_ = 0;
     } else {
       // Variable block
-      for (uint32_t i = 0; i < length; i++) {
-        this->read_uint(8);
-
-        // Exit early if we run out of data here
-        if (this->out_of_data_) {
-          return FLAC_DECODER_ERROR_OUT_OF_DATA;
+      while (this->partial_header_length_ > 0) {
+        if (this->bytes_left_ == 0) {
+          break;
         }
-      }  // for each byte in block
+        this->read_uint(8);
+        --this->partial_header_length_;
+      }
     }  // variable block
-  }  // while not last
+  }
 
   if ((this->sample_rate_ == 0) || (this->num_channels_ == 0) || (this->sample_depth_ == 0) ||
       (this->max_block_size_ == 0)) {
     return FLAC_DECODER_ERROR_BAD_HEADER;
   }
 
+  if (this->sample_depth_ > 16) {
+    // This decoder can support higher sample depths, but it would require using int32s throughout. We limit to 16 bits
+    // per sample for the sake of speed, as we can then use a quick esp-dsp function for the dot product calculation
+    return FLAC_DECODER_ERROR_UNSUPPORTED_BITS_PER_SAMPLE;
+  }
+
   // Successfully read header
   return FLAC_DECODER_SUCCESS;
 }  // read_header
@@ -73,6 +82,8 @@ FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) {
 FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *output_buffer, uint32_t *num_samples) {
   this->buffer_index_ = 0;
   this->bytes_left_ = buffer_length;
+  this->out_of_data_ = false;
+
   *num_samples = 0;
 
   if (!this->block_samples_) {
@@ -86,6 +97,9 @@ FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu
     return FLAC_DECODER_NO_MORE_FRAMES;
   }
 
+  uint64_t previous_bit_buffer = this->bit_buffer_;
+  uint32_t previous_bit_buffer_length = this->bit_buffer_length_;
+
   // sync code
   if (this->read_uint(14) != 0x3FFE) {
     return FLAC_DECODER_ERROR_SYNC_NOT_FOUND;
@@ -107,6 +121,8 @@ FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu
     next_int = (next_int << 1) & 0xFF;
 
     if (this->out_of_data_) {
+      this->bit_buffer_ = previous_bit_buffer;
+      this->bit_buffer_length_ = previous_bit_buffer_length;
       return FLAC_DECODER_ERROR_OUT_OF_DATA;
     }
   }
@@ -139,6 +155,12 @@ FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu
   this->decode_subframes(block_size, this->sample_depth_, channel_assignment);
   *num_samples = block_size * this->num_channels_;
 
+  if (this->bytes_left_ < 2) {
+    this->bit_buffer_ = previous_bit_buffer;
+    this->bit_buffer_length_ = previous_bit_buffer_length;
+    return FLAC_DECODER_ERROR_OUT_OF_DATA;
+  }
+
   // Footer
   this->align_to_byte();
   this->read_uint(16);
@@ -386,7 +408,6 @@ uint32_t FLACDecoder::read_uint(std::size_t num_bits) {
   while (this->bit_buffer_length_ < num_bits) {
     uint8_t next_byte = this->buffer_[this->buffer_index_];
     this->buffer_index_++;
-    this->buffer_total_read_++;
     this->bytes_left_--;
     if (this->bytes_left_ == 0) {
       this->out_of_data_ = true;
diff --git a/esphome/components/nabu/flac_decoder.h b/esphome/components/nabu/flac_decoder.h
index 0d050ad..4ec33d6 100644
--- a/esphome/components/nabu/flac_decoder.h
+++ b/esphome/components/nabu/flac_decoder.h
@@ -27,16 +27,18 @@ const static uint32_t FLAC_UINT_MASK[] = {
 enum FLACDecoderResult {
   FLAC_DECODER_SUCCESS = 0,
   FLAC_DECODER_NO_MORE_FRAMES = 1,
-  FLAC_DECODER_ERROR_OUT_OF_DATA = 2,
-  FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER = 3,
-  FLAC_DECODER_ERROR_SYNC_NOT_FOUND = 4,
-  FLAC_DECODER_ERROR_BAD_BLOCK_SIZE_CODE = 5,
-  FLAC_DECODER_ERROR_BAD_HEADER = 6,
-  FLAC_DECODER_ERROR_RESERVED_CHANNEL_ASSIGNMENT = 7,
-  FLAC_DECODER_ERROR_RESERVED_SUBFRAME_TYPE = 8,
-  FLAC_DECODER_ERROR_BAD_FIXED_PREDICTION_ORDER = 9,
-  FLAC_DECODER_ERROR_RESERVED_RESIDUAL_CODING_METHOD = 10,
-  FLAC_DECODER_ERROR_BLOCK_SIZE_NOT_DIVISIBLE_RICE = 11,
+  FLAC_DECODER_HEADER_OUT_OF_DATA = 2,
+  FLAC_DECODER_ERROR_OUT_OF_DATA = 3,
+  FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER = 4,
+  FLAC_DECODER_ERROR_SYNC_NOT_FOUND = 5,
+  FLAC_DECODER_ERROR_BAD_BLOCK_SIZE_CODE = 6,
+  FLAC_DECODER_ERROR_BAD_HEADER = 7,
+  FLAC_DECODER_ERROR_RESERVED_CHANNEL_ASSIGNMENT = 8,
+  FLAC_DECODER_ERROR_RESERVED_SUBFRAME_TYPE = 9,
+  FLAC_DECODER_ERROR_BAD_FIXED_PREDICTION_ORDER = 10,
+  FLAC_DECODER_ERROR_RESERVED_RESIDUAL_CODING_METHOD = 11,
+  FLAC_DECODER_ERROR_BLOCK_SIZE_NOT_DIVISIBLE_RICE = 12,
+  FLAC_DECODER_ERROR_UNSUPPORTED_BITS_PER_SAMPLE = 13,
 };
 
 // Coefficients for fixed linear prediction
@@ -52,8 +54,7 @@ class FLACDecoder {
    * buffer_size - size of the data buffer
    * min_buffer_size - min bytes in buffer before fill_buffer is called
    */
-  FLACDecoder(uint8_t *buffer)
-      : buffer_(buffer) {}
+  FLACDecoder(uint8_t *buffer) : buffer_(buffer) {}
 
   ~FLACDecoder() { this->free_buffers(); }
 
@@ -129,9 +130,6 @@ class FLACDecoder {
   /* Next index to read from the input buffer. */
   std::size_t buffer_index_ = 0;
 
-  /* Total number of bytes read across frames (debugging). */
-  std::size_t buffer_total_read_ = 0;
-
   /* Number of byte that haven't been read from the input buffer yet. */
   std::size_t bytes_left_ = 0;
 
@@ -167,6 +165,11 @@ class FLACDecoder {
 
   /* Buffer of decoded samples at full precision (single channel). */
   std::vector<int16_t, esphome::ExternalRAMAllocator<int16_t>> block_result_;
+
+  bool partial_header_read_{false};
+  bool partial_header_last_{false};
+  uint32_t partial_header_type_{0};
+  uint32_t partial_header_length_{0};
 };
 
 }  // namespace flac
diff --git a/esphome/components/nabu/media_player.py b/esphome/components/nabu/media_player.py
index fb58299..7d7fe3b 100644
--- a/esphome/components/nabu/media_player.py
+++ b/esphome/components/nabu/media_player.py
@@ -46,6 +46,7 @@
 TYPE_WEB = "web"
 
 CONF_FILES = "files"
+CONF_SAMPLE_RATE = "sample_rate"
 
 nabu_ns = cg.esphome_ns.namespace("nabu")
 NabuMediaPlayer = nabu_ns.class_("NabuMediaPlayer")
@@ -176,6 +177,7 @@ def _file_schema(value):
         cv.GenerateID(): cv.declare_id(NabuMediaPlayer),
         cv.GenerateID(CONF_I2S_AUDIO_ID): cv.use_id(I2SAudioComponent),
         cv.Required(CONF_I2S_DOUT_PIN): pins.internal_gpio_output_pin_number,
+        cv.Optional(CONF_SAMPLE_RATE, default=16000): cv.int_range(min=1),
         cv.Optional(CONF_BITS_PER_SAMPLE, default="16bit"): cv.All(
             _validate_bits, cv.enum(BITS_PER_SAMPLE)
         ),
@@ -203,6 +205,7 @@ async def to_code(config):
     await cg.register_parented(var, config[CONF_I2S_AUDIO_ID])
     cg.add(var.set_dout_pin(config[CONF_I2S_DOUT_PIN]))
     cg.add(var.set_bits_per_sample(config[CONF_BITS_PER_SAMPLE]))
+    cg.add(var.set_sample_rate(config[CONF_SAMPLE_RATE]))
 
     if files_list := config.get(CONF_FILES):
         media_files = []
diff --git a/esphome/components/nabu/nabu_media_player.cpp b/esphome/components/nabu/nabu_media_player.cpp
index f817d62..28bdec9 100644
--- a/esphome/components/nabu/nabu_media_player.cpp
+++ b/esphome/components/nabu/nabu_media_player.cpp
@@ -14,16 +14,59 @@ namespace esphome {
 namespace nabu {
 
 // TODO:
+//  - Have better logging outputs
+//    - Output file type and stream information + any resampling processes
+//    - Remove printf
+//  - Block media commands until the bluetooth stack is disabled (will run out of memory otherwise)
 //  - Tune task memory requirements and potentially buffer sizes if issues appear
-//  - Biquad filters work for downsampling without handling float buffer carefully, upsampling will require some care
 //  - Ducking improvements
 //    - Ducking ratio probably isn't the best way to specify, as volume perception is not linear
 //    - Add a YAML action for setting the ducking level instead of requiring a lambda
 //  - Clean up process around playing back local media files
 //    - Create a registry of media files in Python
+//    - What do I need to give them an ESPHome id?
 //    - Add a yaml action to play a specific media file
+//
+//
+// Framework:
+//  - Media player that can handle two streams; one for media and one for announcements
+//    - If played together, they are mixed with the announcement stream staying at full volume
+//    - The media audio can be further ducked via the ``set_ducking_ratio`` function
+//  - Each stream is handled by an ``AudioPipeline`` object with three parts/tasks
+//    - ``AudioReader`` handles reading from an HTTP source or from a PROGMEM flash set at compile time
+//    - ``AudioDecoder`` handles decoding the audio file. All formats are limited to two channels and 16 bits per sample
+//      - FLAC
+//      - WAV
+//      - MP3 (based on the libhelix decoder - a random mp3 file may be incompatible)
+//    - ``AudioResampler`` handles converting the sample rate to the configured output sample rate and converting mono
+//      to stereo
+//      - The quality is not good, and it is slow! Please send audio at the configured sample rate to avoid these issues
+//    - Each task will always run once started, but they will not do anything until they are needed
+//    - FreeRTOS Event Groups make up the inter-task communication
+//    - The ``AudioPipeline`` sets up an output ring buffer for the Reader and Decoder parts. The next part/task
+//      automatically pulls from the previous ring buffer
+//  - The streams are mixed together in the ``AudioMixer`` task
+//    - Each stream has a corresponding input buffer that the ``AudioResampler`` feeds directly
+//    - Pausing the media stream is done here
+//    - Media stream ducking is done here
+//    - The output ring buffer feeds the ``speaker_task`` directly. It is kept small intentionally to avoid latency when
+//      pausing
+//  - Audio output is handled by the ``speaker_task``. It configures the I2S bus and copies audio from the mixer's
+//    output ring buffer to the DMA buffers
+//  - Media player commands are received by the ``control`` function. The commands are added to the
+//    ``media_control_command_queue_`` to be processed in the component's loop
+//    - Starting a stream initializes the appropriate pipeline or stops it if it is already running
+//    - Volume and mute commands are achieved by the ``mute``, ``unmute``, ``set_volume`` functions. They communicate
+//      directly with the DAC over I2C.
+//      - Volume commands are ignored if the media control queue is full to avoid crashing when the track wheel is spun
+//      fast
+//    - Pausing is sent to the ``AudioMixer`` task. It only affects the media stream.
+//  - The component's main loop performs housekeeping:
+//    - It reads the media control queue and processes it directly
+//    - It watches the state of speaker and mixer tasks
+//    - It determines the overall state of the media player by considering the state of each pipeline
+//      - announcement playback takes highest priority
 
-static const size_t SAMPLE_RATE_HZ = 16000;  // 16 kHz
 static const size_t QUEUE_COUNT = 20;
 static const size_t DMA_BUFFER_COUNT = 4;
 static const size_t DMA_BUFFER_SIZE = 512;
@@ -167,7 +210,10 @@ void NabuMediaPlayer::setup() {
 
   xTaskCreate(NabuMediaPlayer::speaker_task, "speaker_task", 3072, (void *) this, 23, &this->speaker_task_handle_);
 
-  this->get_dac_volume_();
+  if (!this->get_dac_volume_().has_value() || !this->get_dac_mute_().has_value()) {
+    ESP_LOGE(TAG, "Couldn't communicate with DAC");
+    this->mark_failed();
+  }
 
   // if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, 0x01)) {
   //   ESP_LOGE(TAG, "DAC failed to switch register page");
@@ -212,7 +258,7 @@ void NabuMediaPlayer::speaker_task(void *params) {
 
   i2s_driver_config_t config = {
       .mode = (i2s_mode_t) (this_speaker->parent_->get_i2s_mode() | I2S_MODE_TX),
-      .sample_rate = 16000,
+      .sample_rate = this_speaker->sample_rate_,
       .bits_per_sample = this_speaker->bits_per_sample_,
       .channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT,
       .communication_format = I2S_COMM_FORMAT_STAND_I2S,
@@ -347,13 +393,13 @@ void NabuMediaPlayer::watch_media_commands_() {
               make_unique<AudioPipeline>(this->audio_mixer_.get(), AudioPipelineType::ANNOUNCEMENT);
         }
 
-        this->announcement_pipeline_->start(this->announcement_url_.value(), "ann", 7);
+        this->announcement_pipeline_->start(this->announcement_url_.value(), this->sample_rate_, "ann", 7);
       } else {
         if (this->media_pipeline_ == nullptr) {
           this->media_pipeline_ = make_unique<AudioPipeline>(this->audio_mixer_.get(), AudioPipelineType::MEDIA);
         }
 
-        this->media_pipeline_->start(this->media_url_.value(), "media", 2);
+        this->media_pipeline_->start(this->media_url_.value(), this->sample_rate_, "media", 2);
 
         if (this->is_paused_) {
           CommandEvent command_event;
@@ -371,13 +417,13 @@ void NabuMediaPlayer::watch_media_commands_() {
               make_unique<AudioPipeline>(this->audio_mixer_.get(), AudioPipelineType::ANNOUNCEMENT);
         }
 
-        this->announcement_pipeline_->start(this->announcement_file_.value(), "ann", 7);
+        this->announcement_pipeline_->start(this->announcement_file_.value(), this->sample_rate_, "ann", 7);
       } else {
         if (this->media_pipeline_ == nullptr) {
           this->media_pipeline_ = make_unique<AudioPipeline>(this->audio_mixer_.get(), AudioPipelineType::MEDIA);
         }
 
-        this->media_pipeline_->start(this->media_file_.value(), "media", 2);
+        this->media_pipeline_->start(this->media_file_.value(), this->sample_rate_, "media", 5);
 
         if (this->is_paused_) {
           CommandEvent command_event;
@@ -405,7 +451,7 @@ void NabuMediaPlayer::watch_media_commands_() {
           this->is_paused_ = false;
           break;
         case media_player::MEDIA_PLAYER_COMMAND_PAUSE:
-          if (this->media_pipeline_state_ == AudioPipelineState::PLAYING) {
+          if (!this->is_paused_) {
             command_event.command = CommandEventType::PAUSE_MEDIA;
             this->audio_mixer_->send_command(&command_event);
           }
@@ -510,12 +556,31 @@ void NabuMediaPlayer::loop() {
 
   if (this->announcement_pipeline_ != nullptr)
     this->announcement_pipeline_state_ = this->announcement_pipeline_->get_state();
+  
+  if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_READING) {
+    ESP_LOGE(TAG, "Encountered an error reading the announcement file");
+  }
+  if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_DECODING) {
+    ESP_LOGE(TAG, "Encountered an error decoding the announcement file");
+  }
+  if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) {
+    ESP_LOGE(TAG, "Encountered an error resampling the announcement file");
+  }
 
   if (this->media_pipeline_ != nullptr)
     this->media_pipeline_state_ = this->media_pipeline_->get_state();
 
-  if ((this->announcement_pipeline_state_ != AudioPipelineState::STOPPING) &&
-      (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED)) {
+  if (this->media_pipeline_state_ == AudioPipelineState::ERROR_READING) {
+    ESP_LOGE(TAG, "Encountered an error reading the media file");
+  }
+  if (this->media_pipeline_state_ == AudioPipelineState::ERROR_DECODING) {
+    ESP_LOGE(TAG, "Encountered an error decoding the media file");
+  }
+  if (this->media_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) {
+    ESP_LOGE(TAG, "Encountered an error resampling the media file");
+  }
+
+  if (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED) {
     this->state = media_player::MEDIA_PLAYER_STATE_ANNOUNCING;
     if (this->is_idle_muted_ && !this->is_muted_) {
       // this->unmute_();
@@ -529,8 +594,7 @@ void NabuMediaPlayer::loop() {
         // this->mute_();
         this->is_idle_muted_ = true;
       }
-    } else if ((this->media_pipeline_state_ == AudioPipelineState::STOPPING) ||
-               (this->media_pipeline_state_ == AudioPipelineState::STOPPED)) {
+    } else if (this->media_pipeline_state_ == AudioPipelineState::STOPPED) {
       this->state = media_player::MEDIA_PLAYER_STATE_IDLE;
       if (!this->is_idle_muted_) {
         // this->mute_();
@@ -656,6 +720,31 @@ bool NabuMediaPlayer::set_volume_(float volume, bool publish) {
   return true;
 }
 
+optional<bool> NabuMediaPlayer::get_dac_mute_(bool publish) {  // Reads the DAC's left/right mute registers
+  if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, DAC_MUTE_PAGE)) {  // select the mute register page first
+    ESP_LOGE(TAG, "DAC failed to switch to mute page registers");
+    return {};  // empty optional: page selection failed
+  }
+
+  uint8_t dac_mute_left = 0;
+  uint8_t dac_mute_right = 0;
+  if (!this->read_byte(DAC_LEFT_MUTE_REGISTER, &dac_mute_left) ||
+      !this->read_byte(DAC_RIGHT_MUTE_REGISTER, &dac_mute_right)) {
+    ESP_LOGE(TAG, "DAC failed to read mute status");
+    return {};  // empty optional: a register read failed
+  }
+
+  bool is_muted = false;
+  if (dac_mute_left == DAC_MUTE_COMMAND && dac_mute_right == DAC_MUTE_COMMAND) {  // both registers must match
+    is_muted = true;
+  }
+
+  if (publish) {  // only update the cached is_muted_ member when requested
+    this->is_muted_ = is_muted;
+  }
+  return is_muted;
+}
+
 bool NabuMediaPlayer::mute_() {
   if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, DAC_MUTE_PAGE)) {
     ESP_LOGE(TAG, "DAC failed to switch to mute page registers");
diff --git a/esphome/components/nabu/nabu_media_player.h b/esphome/components/nabu/nabu_media_player.h
index 4ff4260..149ee6b 100644
--- a/esphome/components/nabu/nabu_media_player.h
+++ b/esphome/components/nabu/nabu_media_player.h
@@ -57,15 +57,19 @@ class NabuMediaPlayer : public Component,
 
   void set_dout_pin(uint8_t pin) { this->dout_pin_ = pin; }
   void set_bits_per_sample(i2s_bits_per_sample_t bits_per_sample) { this->bits_per_sample_ = bits_per_sample; }
+  void set_sample_rate(uint32_t sample_rate) { this->sample_rate_ = sample_rate; }
 
  protected:
   // Receives commands from HA or from the voice assistant component
   // Sends commands to the media_control_commanda_queue_
   void control(const media_player::MediaPlayerCall &call) override;
 
-  /// @return volume read from DAC between 0.0 and 1.0, if successful
+  /// @return Volume read from DAC between 0.0 and 1.0, if successful. Updates volume_ if publish is true.
   optional<float> get_dac_volume_(bool publish = true);
 
+  /// @return Mute status read from DAC, if successful. Updates is_muted_ if publish is true.
+  optional<bool> get_dac_mute_(bool publish = true);
+
   /// @return true if I2C writes were successful
   bool set_volume_(float volume, bool publish = true);
 
@@ -101,6 +105,7 @@ class NabuMediaPlayer : public Component,
   QueueHandle_t speaker_command_queue_;
 
   i2s_bits_per_sample_t bits_per_sample_;
+  uint32_t sample_rate_;
   uint8_t dout_pin_{0};
 
   bool is_paused_{false};