From dd7fd38706f9c21e352735c59cb5991111b75285 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Mon, 5 Aug 2024 11:37:27 -0400 Subject: [PATCH] Bugfixes/Improvements for media player (#37) * quickly disable new special case fir filter * fix pipeline stopped vs playing logic * improve robustness of flac decoding * Increase http buffer size and allow setting output sample rate * fix flac sync errors after running out of data * limit http transfer size per loop * disable task debug logging * revert min http transfer size * always reset input_buffer_current * update TODOs and describe media player framework * read mute status from dac at setup * raise error if bits per sample is too high for our optimized version * block unprocessable streams * clear appropriate mixer buffer when stopping a pipeline * log which pipeline element has an error * never transfer more input samples than can be processed in 1 step * correctly account for mono to stereo adjustment * check for upsampling before scaling max input samples * fix memory leak --- esphome/components/nabu/audio_decoder.cpp | 35 +++--- esphome/components/nabu/audio_pipeline.cpp | 56 +++++++-- esphome/components/nabu/audio_pipeline.h | 15 +-- esphome/components/nabu/audio_resampler.cpp | 72 +++++++---- esphome/components/nabu/audio_resampler.h | 6 +- esphome/components/nabu/flac_decoder.cpp | 73 +++++++---- esphome/components/nabu/flac_decoder.h | 33 ++--- esphome/components/nabu/media_player.py | 3 + esphome/components/nabu/nabu_media_player.cpp | 115 ++++++++++++++++-- esphome/components/nabu/nabu_media_player.h | 7 +- 10 files changed, 302 insertions(+), 113 deletions(-) diff --git a/esphome/components/nabu/audio_decoder.cpp b/esphome/components/nabu/audio_decoder.cpp index 11720a5..7684fc8 100644 --- a/esphome/components/nabu/audio_decoder.cpp +++ b/esphome/components/nabu/audio_decoder.cpp @@ -110,21 +110,26 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) { } } else { // Try to decode more data + 
size_t bytes_available = this->input_ring_buffer_->available(); + size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_); + + if ((this->potentially_failed_count_ > 0) && (bytes_to_read == 0)) { + // We didn't have enough data last time, and we have no new data, so just return + return AudioDecoderState::DECODING; + } // Shift unread data in input buffer to start - if ((this->input_buffer_length_ > 0) && (this->input_buffer_length_ < this->internal_buffer_size_)) { + if (this->input_buffer_length_ > 0) { memmove(this->input_buffer_, this->input_buffer_current_, this->input_buffer_length_); } this->input_buffer_current_ = this->input_buffer_; // read in new ring buffer data to fill the remaining input buffer - size_t bytes_available = this->input_ring_buffer_->available(); - size_t bytes_to_read = std::min(bytes_available, this->internal_buffer_size_ - this->input_buffer_length_); size_t bytes_read = 0; if (bytes_to_read > 0) { - uint8_t *new_mp3_data = this->input_buffer_ + this->input_buffer_length_; - bytes_read = this->input_ring_buffer_->read((void *) new_mp3_data, bytes_to_read); + uint8_t *new_audio_data = this->input_buffer_ + this->input_buffer_length_; + bytes_read = this->input_ring_buffer_->read((void *) new_audio_data, bytes_to_read); this->input_buffer_length_ += bytes_read; } @@ -153,6 +158,8 @@ AudioDecoderState AudioDecoder::decode(bool stop_gracefully) { ++this->potentially_failed_count_; } else if (state == FileDecoderState::END_OF_FILE) { this->end_of_file_ = true; + } else if (state == FileDecoderState::FAILED) { + return AudioDecoderState::FAILED; } else { this->potentially_failed_count_ = 0; } @@ -191,9 +198,7 @@ FileDecoderState AudioDecoder::decode_wav_() { printf("sample channels: %d\n", this->channels_.value()); printf("sample rate: %" PRId32 "\n", this->sample_rate_.value()); - // printf("number of samples: %d\n", - // this->wav_decoder_->chunk_bytes_left() / (this->channels_.value() * 
(this->bits_per_sample.value() - // / 8))); + printf("bits per sample: %d\n", this->sample_depth_.value()); this->wav_bytes_left_ = this->wav_decoder_->chunk_bytes_left(); header_finished = true; } else if (result == wav_decoder::WAV_DECODER_SUCCESS_NEXT) { @@ -226,11 +231,10 @@ FileDecoderState AudioDecoder::decode_wav_() { this->wav_bytes_left_ -= bytes_to_write; } - return FileDecoderState::MORE_TO_PROCESS; + return FileDecoderState::IDLE; } return FileDecoderState::END_OF_FILE; - // return DecoderState::FINISHED; } FileDecoderState AudioDecoder::decode_mp3_() { @@ -280,14 +284,14 @@ FileDecoderState AudioDecoder::decode_flac_() { // Header hasn't been read auto result = this->flac_decoder_->read_header(this->input_buffer_length_); - if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { - return FileDecoderState::POTENTIALLY_FAILED; - } - size_t bytes_consumed = this->flac_decoder_->get_bytes_index(); this->input_buffer_current_ += bytes_consumed; this->input_buffer_length_ = this->flac_decoder_->get_bytes_left(); + if (result == flac::FLAC_DECODER_HEADER_OUT_OF_DATA) { + return FileDecoderState::POTENTIALLY_FAILED; + } + if (result != flac::FLAC_DECODER_SUCCESS) { printf("failed to read flac header. Error: %d\n", result); return FileDecoderState::FAILED; @@ -311,10 +315,11 @@ FileDecoderState AudioDecoder::decode_flac_() { this->flac_decoder_->decode_frame(this->input_buffer_length_, (int16_t *) this->output_buffer_, &output_samples); if (result == flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { - // not an issue, just needs more data! + // Not an issue, just needs more data that we'll get next time. 
return FileDecoderState::POTENTIALLY_FAILED; } else if (result > flac::FLAC_DECODER_ERROR_OUT_OF_DATA) { // Serious error, can't recover + printf("FLAC Decoder Error %d\n", result); return FileDecoderState::FAILED; } diff --git a/esphome/components/nabu/audio_pipeline.cpp b/esphome/components/nabu/audio_pipeline.cpp index f95d270..e9520b1 100644 --- a/esphome/components/nabu/audio_pipeline.cpp +++ b/esphome/components/nabu/audio_pipeline.cpp @@ -9,7 +9,7 @@ namespace nabu { static const size_t QUEUE_COUNT = 10; -static const size_t HTTP_BUFFER_SIZE = 32 * 1024; +static const size_t HTTP_BUFFER_SIZE = 64 * 1024; static const size_t BUFFER_SIZE_SAMPLES = 32768; static const size_t BUFFER_SIZE_BYTES = BUFFER_SIZE_SAMPLES * sizeof(int16_t); @@ -50,21 +50,23 @@ AudioPipeline::AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type) this->event_group_ = xEventGroupCreate(); } -void AudioPipeline::start(const std::string &uri, const std::string &task_name, UBaseType_t priority) { - this->common_start_(task_name, priority); +void AudioPipeline::start(const std::string &uri, uint32_t target_sample_rate, const std::string &task_name, + UBaseType_t priority) { + this->common_start_(target_sample_rate, task_name, priority); this->current_uri_ = uri; xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_HTTP); } -void AudioPipeline::start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority) { - this->common_start_(task_name, priority); +void AudioPipeline::start(media_player::MediaFile *media_file, uint32_t target_sample_rate, + const std::string &task_name, UBaseType_t priority) { + this->common_start_(target_sample_rate, task_name, priority); this->current_media_file_ = media_file; xEventGroupSetBits(this->event_group_, READER_COMMAND_INIT_FILE); } -void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t priority) { +void AudioPipeline::common_start_(uint32_t target_sample_rate, const std::string 
&task_name, UBaseType_t priority) { if (this->read_task_handle_ == nullptr) { this->read_task_handle_ = xTaskCreateStatic(AudioPipeline::read_task_, (task_name + "_read").c_str(), 8192, (void *) this, priority, @@ -82,13 +84,33 @@ void AudioPipeline::common_start_(const std::string &task_name, UBaseType_t prio } this->stop(); + + this->target_sample_rate_ = target_sample_rate; } AudioPipelineState AudioPipeline::get_state() { EventBits_t event_bits = xEventGroupGetBits(this->event_group_); if (!this->read_task_handle_ && !this->decode_task_handle_ && !this->resample_task_handle_) { return AudioPipelineState::STOPPED; - } else if (event_bits & (READER_MESSAGE_FINISHED | DECODER_MESSAGE_FINISHED | RESAMPLER_MESSAGE_FINISHED)) { + } + + if ((event_bits & READER_MESSAGE_ERROR)) { + xEventGroupClearBits(this->event_group_, READER_MESSAGE_ERROR); + return AudioPipelineState::ERROR_READING; + } + + if ((event_bits & DECODER_MESSAGE_ERROR)) { + xEventGroupClearBits(this->event_group_, DECODER_MESSAGE_ERROR); + return AudioPipelineState::ERROR_DECODING; + } + + if ((event_bits & RESAMPLER_MESSAGE_ERROR)) { + xEventGroupClearBits(this->event_group_, RESAMPLER_MESSAGE_ERROR); + return AudioPipelineState::ERROR_RESAMPLING; + } + + if ((event_bits & READER_MESSAGE_FINISHED) && (event_bits & DECODER_MESSAGE_FINISHED) && + (event_bits & RESAMPLER_MESSAGE_FINISHED)) { return AudioPipelineState::STOPPED; } @@ -105,6 +127,15 @@ void AudioPipeline::stop() { true, // Wait for all the bits, pdMS_TO_TICKS(200)); // Block temporarily before deleting each task + // Clear the ring buffer in the mixer; avoids playing incorrect audio when starting a new file while paused + CommandEvent command_event; + if (this->pipeline_type_ == AudioPipelineType::MEDIA) { + command_event.command = CommandEventType::CLEAR_MEDIA; + } else { + command_event.command = CommandEventType::CLEAR_ANNOUNCEMENT; + } + this->mixer_->send_command(&command_event); + xEventGroupClearBits(this->event_group_, ALL_BITS); 
this->reset_ring_buffers(); } @@ -187,8 +218,9 @@ void AudioPipeline::decode_task_(void *params) { xEventGroupClearBits(this_pipeline->event_group_, EventGroupBits::DECODER_MESSAGE_FINISHED); { - AudioDecoder decoder = AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(), - this_pipeline->decoded_ring_buffer_.get(), BUFFER_SIZE_BYTES); + AudioDecoder decoder = + AudioDecoder(this_pipeline->raw_file_ring_buffer_.get(), this_pipeline->decoded_ring_buffer_.get(), + HTTP_BUFFER_SIZE); // BUFFER_SIZE_BYTES); decoder.start(this_pipeline->current_media_file_type_); bool has_stream_info = false; @@ -256,7 +288,11 @@ void AudioPipeline::resample_task_(void *params) { AudioResampler resampler = AudioResampler(this_pipeline->decoded_ring_buffer_.get(), output_ring_buffer, BUFFER_SIZE_SAMPLES); - resampler.start(this_pipeline->current_stream_info_); + if (!resampler.start(this_pipeline->current_stream_info_, this_pipeline->target_sample_rate_)) { + // Unsupported incoming audio stream + xEventGroupSetBits(this_pipeline->event_group_, + EventGroupBits::RESAMPLER_MESSAGE_ERROR | EventGroupBits::PIPELINE_COMMAND_STOP); + } while (true) { event_bits = xEventGroupGetBits(this_pipeline->event_group_); diff --git a/esphome/components/nabu/audio_pipeline.h b/esphome/components/nabu/audio_pipeline.h index 4b1996f..20a8d91 100644 --- a/esphome/components/nabu/audio_pipeline.h +++ b/esphome/components/nabu/audio_pipeline.h @@ -28,20 +28,19 @@ enum class AudioPipelineType : uint8_t { }; enum class AudioPipelineState : uint8_t { - STARTING, - STARTED, PLAYING, - PAUSED, - STOPPING, STOPPED, + ERROR_READING, + ERROR_DECODING, + ERROR_RESAMPLING, }; class AudioPipeline { public: AudioPipeline(AudioMixer *mixer, AudioPipelineType pipeline_type); - void start(const std::string &uri, const std::string &task_name, UBaseType_t priority = 1); - void start(media_player::MediaFile *media_file, const std::string &task_name, UBaseType_t priority = 1); + void start(const std::string &uri, uint32_t 
target_sample_rate, const std::string &task_name, UBaseType_t priority = 1); + void start(media_player::MediaFile *media_file, uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority = 1); void stop(); @@ -50,7 +49,9 @@ class AudioPipeline { void reset_ring_buffers(); protected: - void common_start_(const std::string &task_name, UBaseType_t priority); + void common_start_(uint32_t target_sample_rate, const std::string &task_name, UBaseType_t priority); + + uint32_t target_sample_rate_; AudioMixer *mixer_; diff --git a/esphome/components/nabu/audio_resampler.cpp b/esphome/components/nabu/audio_resampler.cpp index 4860c28..0203cc6 100644 --- a/esphome/components/nabu/audio_resampler.cpp +++ b/esphome/components/nabu/audio_resampler.cpp @@ -46,9 +46,11 @@ AudioResampler::~AudioResampler() { resampleFree(this->resampler_); this->resampler_ = nullptr; } + + // dsps_fird_s16_aexx_free(&this->fir_filter_); } -void AudioResampler::start(media_player::StreamInfo &stream_info) { +bool AudioResampler::start(media_player::StreamInfo &stream_info, uint32_t target_sample_rate) { this->stream_info_ = stream_info; this->input_buffer_current_ = this->input_buffer_; @@ -63,33 +65,40 @@ void AudioResampler::start(media_player::StreamInfo &stream_info) { this->needs_mono_to_stereo_ = (stream_info.channels != 2); + if ((stream_info.channels > 2) || (stream_info_.bits_per_sample != 16)) { + // TODO: Make these values configurable + return false; + } + if (stream_info.channels > 0) { this->channel_factor_ = 2 / stream_info.channels; printf("Converting %d channels to 2 channels\n", stream_info.channels); } - constexpr float resample_rate = 16000.0f; - if (stream_info.sample_rate != 16000) { - if (stream_info.sample_rate == 48000) { - // Special case, we can do this a lot faster with esp-dsp code! 
- const uint8_t decimation = 48000 / 16000; - const float fir_out_offset = 0; //((FIR_FILTER_LENGTH / decimation / 2) - 1); - - int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH, - (float) 0.5 / decimation); - // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH); - dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH, - decimation, fir_out_offset, -shift); - this->decimation_filter_ = true; - this->needs_resampling_ = true; - // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t)); - } else { + + if (stream_info.sample_rate != target_sample_rate) { + // if (stream_info.sample_rate == 48000) { + // // Special case, we can do this a lot faster with esp-dsp code! + // const uint8_t decimation = 48000 / 16000; + // const float fir_out_offset = 0; //((FIR_FILTER_LENGTH / decimation / 2) - 1); + + // int8_t shift = this->generate_q15_fir_coefficients_(this->fir_filter_coeffecients_, (uint32_t) + // FIR_FILTER_LENGTH, + // (float) 0.5 / decimation); + // // dsps_16_array_rev(this->fir_filter_coeffecients_, (uint32_t) FIR_FILTER_LENGTH); + // dsps_fird_init_s16(&this->fir_filter_, this->fir_filter_coeffecients_, this->fir_delay_, FIR_FILTER_LENGTH, + // decimation, fir_out_offset, -shift); + // this->decimation_filter_ = true; + // this->needs_resampling_ = true; + // // memset(this->fir_delay_, 0, FIR_FILTER_LENGTH*sizeof(int16_t)); + // } else + { int flags = 0; this->needs_resampling_ = true; - this->sample_ratio_ = resample_rate / static_cast(stream_info.sample_rate); + this->sample_ratio_ = static_cast(target_sample_rate) / static_cast(stream_info.sample_rate); - printf("Resampling from %d Hz to 16000 Hz\n", stream_info.sample_rate); + printf("Resampling from %d Hz to %d Hz\n", stream_info.sample_rate, target_sample_rate); if (this->sample_ratio_ < 1.0) { this->lowpass_ratio_ -= (10.24 / 16); @@ -137,6 +146,8 @@ 
void AudioResampler::start(media_player::StreamInfo &stream_info) { } else { this->needs_resampling_ = false; } + + return true; } AudioResamplerState AudioResampler::resample(bool stop_gracefully) { @@ -165,6 +176,19 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) { // Refill input buffer ////// + // Depending on if we are converting mono to stereo or if we are upsampling, we may need to restrict how many input + // samples we transfer + size_t max_input_samples = this->internal_buffer_samples_; + + // Mono to stereo -> cut in half + max_input_samples /= (2 / this->stream_info_.channels); + + if (this->sample_ratio_ > 1.0) { + // Upsampling -> reduce by a factor of the ceiling of sample_ratio_ + uint32_t upsampling_factor = std::ceil(this->sample_ratio_); + max_input_samples /= upsampling_factor; + } + // Move old data to the start of the buffer if (this->input_buffer_length_ > 0) { memmove((void *) this->input_buffer_, (void *) this->input_buffer_current_, this->input_buffer_length_); @@ -173,8 +197,7 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) { // Copy new data to the end of the of the buffer size_t bytes_available = this->input_ring_buffer_->available(); - size_t bytes_to_read = - std::min(bytes_available, this->internal_buffer_samples_ * sizeof(int16_t) - this->input_buffer_length_); + size_t bytes_to_read = std::min(bytes_available, max_input_samples * sizeof(int16_t) - this->input_buffer_length_); if (bytes_to_read > 0) { int16_t *new_input_buffer_data = this->input_buffer_ + this->input_buffer_length_ / sizeof(int16_t); @@ -207,7 +230,8 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) { } } else { // Interleaved stereo samples - // TODO: This doesn't sound correct! I need to use separate filters for each channel so the delay line isn't mixed + // TODO: This doesn't sound correct! 
I need to use separate filters for each channel so the delay line isn't + // mixed size_t available_samples = this->input_buffer_length_ / sizeof(int16_t); for (int i = 0; i < available_samples / 2; ++i) { // split interleaved samples into two separate streams @@ -244,15 +268,12 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully) { size_t samples_read = this->input_buffer_length_ / sizeof(int16_t); - // This is inefficient! It reconverts any samples that weren't used in the previous resampling run for (int i = 0; i < samples_read; ++i) { this->float_input_buffer_[i] = static_cast(this->input_buffer_[i]) / 32768.0f; } size_t frames_read = samples_read / this->stream_info_.channels; - // The low pass filter seems to be causing glitches... probably because samples are repeated due to the above - // ineffeciency! if (this->pre_filter_) { for (int i = 0; i < this->stream_info_.channels; ++i) { biquad_apply_buffer(&this->lowpass_[i][0], this->float_input_buffer_ + i, frames_read, @@ -363,6 +384,7 @@ int8_t AudioResampler::generate_q15_fir_coefficients_(int16_t *fir_coeffs, const } free(fir_window); + free(float_coeffs); return shift; } diff --git a/esphome/components/nabu/audio_resampler.h b/esphome/components/nabu/audio_resampler.h index 1176c1a..aa36086 100644 --- a/esphome/components/nabu/audio_resampler.h +++ b/esphome/components/nabu/audio_resampler.h @@ -30,7 +30,11 @@ class AudioResampler { size_t internal_buffer_samples); ~AudioResampler(); - void start(media_player::StreamInfo &stream_info); + /// @brief Sets up the various bits necessary to resample + /// @param stream_info the incoming sample rate, bits per sample, and number of channels + /// @param target_sample_rate the necessary sample rate to convert to + /// @return True if it convert the incoming stream, false otherwise + bool start(media_player::StreamInfo &stream_info, uint32_t target_sample_rate); AudioResamplerState resample(bool stop_gracefully); diff --git 
a/esphome/components/nabu/flac_decoder.cpp b/esphome/components/nabu/flac_decoder.cpp index 976ab3d..8fe0bea 100644 --- a/esphome/components/nabu/flac_decoder.cpp +++ b/esphome/components/nabu/flac_decoder.cpp @@ -13,30 +13,32 @@ namespace flac { FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) { this->buffer_index_ = 0; this->bytes_left_ = buffer_length; + this->bit_buffer_ = 0; + this->bit_buffer_length_ = 0; - if (this->out_of_data_) { - return FLAC_DECODER_ERROR_OUT_OF_DATA; - } + this->out_of_data_ = (buffer_length == 0); - // File must start with 'fLaC' - if (this->read_uint(32) != FLAC_MAGIC_NUMBER) { - return FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER; + if (!this->partial_header_read_) { + // File must start with 'fLaC' + if (this->read_uint(32) != FLAC_MAGIC_NUMBER) { + return FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER; + } } - // Read header blocks - bool last = false; - uint32_t type = 0; - uint32_t length = 0; + while (!this->partial_header_last_ || (this->partial_header_length_ > 0)) { + if (this->bytes_left_ == 0) { + // We'll try to finish reading it once more data is loaded + this->partial_header_read_ = true; + return FLAC_DECODER_HEADER_OUT_OF_DATA; + } - while (!last) { - if (this->out_of_data_) { - return FLAC_DECODER_ERROR_OUT_OF_DATA; + if (this->partial_header_length_ == 0) { + this->partial_header_last_ = this->read_uint(1) != 0; + this->partial_header_type_ = this->read_uint(7); + this->partial_header_length_ = this->read_uint(24); } - last = this->read_uint(1) != 0; - type = this->read_uint(7); - length = this->read_uint(24); - if (type == 0) { + if (this->partial_header_type_ == 0) { // Stream info block this->min_block_size_ = this->read_uint(16); this->max_block_size_ = this->read_uint(16); @@ -48,24 +50,31 @@ FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) { this->sample_depth_ = this->read_uint(5) + 1; this->num_samples_ = this->read_uint(36); this->read_uint(128); + + this->partial_header_length_ = 0; } else { // 
Variable block - for (uint32_t i = 0; i < length; i++) { - this->read_uint(8); - - // Exit early if we run out of data here - if (this->out_of_data_) { - return FLAC_DECODER_ERROR_OUT_OF_DATA; + while (this->partial_header_length_ > 0) { + if (this->bytes_left_ == 0) { + break; } - } // for each byte in block + this->read_uint(8); + --this->partial_header_length_; + } } // variable block - } // while not last + } if ((this->sample_rate_ == 0) || (this->num_channels_ == 0) || (this->sample_depth_ == 0) || (this->max_block_size_ == 0)) { return FLAC_DECODER_ERROR_BAD_HEADER; } + if (this->sample_depth_ > 16) { + // This decoder can support higher sample depths, but it would require using int32s throughout. We limit to 16 bits + // per sample for the sake of speed, as we can then use a quick esp-dsp function for the dot product calculation + return FLAC_DECODER_ERROR_UNSUPPORTED_BITS_PER_SAMPLE; + } + // Successfully read header return FLAC_DECODER_SUCCESS; } // read_header @@ -73,6 +82,8 @@ FLACDecoderResult FLACDecoder::read_header(size_t buffer_length) { FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *output_buffer, uint32_t *num_samples) { this->buffer_index_ = 0; this->bytes_left_ = buffer_length; + this->out_of_data_ = false; + *num_samples = 0; if (!this->block_samples_) { @@ -86,6 +97,9 @@ FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu return FLAC_DECODER_NO_MORE_FRAMES; } + uint64_t previous_bit_buffer = this->bit_buffer_; + uint32_t previous_bit_buffer_length = this->bit_buffer_length_; + // sync code if (this->read_uint(14) != 0x3FFE) { return FLAC_DECODER_ERROR_SYNC_NOT_FOUND; @@ -107,6 +121,8 @@ FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu next_int = (next_int << 1) & 0xFF; if (this->out_of_data_) { + this->bit_buffer_ = previous_bit_buffer; + this->bit_buffer_length_ = previous_bit_buffer_length; return FLAC_DECODER_ERROR_OUT_OF_DATA; } } @@ -139,6 +155,12 @@ 
FLACDecoderResult FLACDecoder::decode_frame(size_t buffer_length, int16_t *outpu this->decode_subframes(block_size, this->sample_depth_, channel_assignment); *num_samples = block_size * this->num_channels_; + if (this->bytes_left_ < 2) { + this->bit_buffer_ = previous_bit_buffer; + this->bit_buffer_length_ = previous_bit_buffer_length; + return FLAC_DECODER_ERROR_OUT_OF_DATA; + } + // Footer this->align_to_byte(); this->read_uint(16); @@ -386,7 +408,6 @@ uint32_t FLACDecoder::read_uint(std::size_t num_bits) { while (this->bit_buffer_length_ < num_bits) { uint8_t next_byte = this->buffer_[this->buffer_index_]; this->buffer_index_++; - this->buffer_total_read_++; this->bytes_left_--; if (this->bytes_left_ == 0) { this->out_of_data_ = true; diff --git a/esphome/components/nabu/flac_decoder.h b/esphome/components/nabu/flac_decoder.h index 0d050ad..4ec33d6 100644 --- a/esphome/components/nabu/flac_decoder.h +++ b/esphome/components/nabu/flac_decoder.h @@ -27,16 +27,18 @@ const static uint32_t FLAC_UINT_MASK[] = { enum FLACDecoderResult { FLAC_DECODER_SUCCESS = 0, FLAC_DECODER_NO_MORE_FRAMES = 1, - FLAC_DECODER_ERROR_OUT_OF_DATA = 2, - FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER = 3, - FLAC_DECODER_ERROR_SYNC_NOT_FOUND = 4, - FLAC_DECODER_ERROR_BAD_BLOCK_SIZE_CODE = 5, - FLAC_DECODER_ERROR_BAD_HEADER = 6, - FLAC_DECODER_ERROR_RESERVED_CHANNEL_ASSIGNMENT = 7, - FLAC_DECODER_ERROR_RESERVED_SUBFRAME_TYPE = 8, - FLAC_DECODER_ERROR_BAD_FIXED_PREDICTION_ORDER = 9, - FLAC_DECODER_ERROR_RESERVED_RESIDUAL_CODING_METHOD = 10, - FLAC_DECODER_ERROR_BLOCK_SIZE_NOT_DIVISIBLE_RICE = 11, + FLAC_DECODER_HEADER_OUT_OF_DATA = 2, + FLAC_DECODER_ERROR_OUT_OF_DATA = 3, + FLAC_DECODER_ERROR_BAD_MAGIC_NUMBER = 4, + FLAC_DECODER_ERROR_SYNC_NOT_FOUND = 5, + FLAC_DECODER_ERROR_BAD_BLOCK_SIZE_CODE = 6, + FLAC_DECODER_ERROR_BAD_HEADER = 7, + FLAC_DECODER_ERROR_RESERVED_CHANNEL_ASSIGNMENT = 8, + FLAC_DECODER_ERROR_RESERVED_SUBFRAME_TYPE = 9, + FLAC_DECODER_ERROR_BAD_FIXED_PREDICTION_ORDER = 10, + 
FLAC_DECODER_ERROR_RESERVED_RESIDUAL_CODING_METHOD = 11, + FLAC_DECODER_ERROR_BLOCK_SIZE_NOT_DIVISIBLE_RICE = 12, + FLAC_DECODER_ERROR_UNSUPPORTED_BITS_PER_SAMPLE = 13, }; // Coefficients for fixed linear prediction @@ -52,8 +54,7 @@ class FLACDecoder { * buffer_size - size of the data buffer * min_buffer_size - min bytes in buffer before fill_buffer is called */ - FLACDecoder(uint8_t *buffer) - : buffer_(buffer) {} + FLACDecoder(uint8_t *buffer) : buffer_(buffer) {} ~FLACDecoder() { this->free_buffers(); } @@ -129,9 +130,6 @@ class FLACDecoder { /* Next index to read from the input buffer. */ std::size_t buffer_index_ = 0; - /* Total number of bytes read across frames (debugging). */ - std::size_t buffer_total_read_ = 0; - /* Number of byte that haven't been read from the input buffer yet. */ std::size_t bytes_left_ = 0; @@ -167,6 +165,11 @@ class FLACDecoder { /* Buffer of decoded samples at full precision (single channel). */ std::vector> block_result_; + + bool partial_header_read_{false}; + bool partial_header_last_{false}; + uint32_t partial_header_type_{0}; + uint32_t partial_header_length_{0}; }; } // namespace flac diff --git a/esphome/components/nabu/media_player.py b/esphome/components/nabu/media_player.py index fb58299..7d7fe3b 100644 --- a/esphome/components/nabu/media_player.py +++ b/esphome/components/nabu/media_player.py @@ -46,6 +46,7 @@ TYPE_WEB = "web" CONF_FILES = "files" +CONF_SAMPLE_RATE = "sample_rate" nabu_ns = cg.esphome_ns.namespace("nabu") NabuMediaPlayer = nabu_ns.class_("NabuMediaPlayer") @@ -176,6 +177,7 @@ def _file_schema(value): cv.GenerateID(): cv.declare_id(NabuMediaPlayer), cv.GenerateID(CONF_I2S_AUDIO_ID): cv.use_id(I2SAudioComponent), cv.Required(CONF_I2S_DOUT_PIN): pins.internal_gpio_output_pin_number, + cv.Optional(CONF_SAMPLE_RATE, default=16000): cv.int_range(min=1), cv.Optional(CONF_BITS_PER_SAMPLE, default="16bit"): cv.All( _validate_bits, cv.enum(BITS_PER_SAMPLE) ), @@ -203,6 +205,7 @@ async def to_code(config): await 
cg.register_parented(var, config[CONF_I2S_AUDIO_ID]) cg.add(var.set_dout_pin(config[CONF_I2S_DOUT_PIN])) cg.add(var.set_bits_per_sample(config[CONF_BITS_PER_SAMPLE])) + cg.add(var.set_sample_rate(config[CONF_SAMPLE_RATE])) if files_list := config.get(CONF_FILES): media_files = [] diff --git a/esphome/components/nabu/nabu_media_player.cpp b/esphome/components/nabu/nabu_media_player.cpp index f817d62..28bdec9 100644 --- a/esphome/components/nabu/nabu_media_player.cpp +++ b/esphome/components/nabu/nabu_media_player.cpp @@ -14,16 +14,59 @@ namespace esphome { namespace nabu { // TODO: +// - Have better logging outputs +// - Output file type and stream information + any resampling processes +// - Remove printf +// - Block media commands until the bluetooth stack is disabled (will run out of memory otherwise) // - Tune task memory requirements and potentially buffer sizes if issues appear -// - Biquad filters work for downsampling without handling float buffer carefully, upsampling will require some care // - Ducking improvements // - Ducking ratio probably isn't the best way to specify, as volume perception is not linear // - Add a YAML action for setting the ducking level instead of requiring a lambda // - Clean up process around playing back local media files // - Create a registry of media files in Python +// - What do I need to give them an ESPHome id? // - Add a yaml action to play a specific media file +// +// +// Framework: +// - Media player that can handle two streams; one for media and one for announcements +// - If played together, they are mixed with the announcement stream staying at full volume +// - The media audio can be further ducked via the ``set_ducking_ratio`` function +// - Each stream is handled by an ``AudioPipeline`` object with three parts/tasks +// - ``AudioReader`` handles reading from an HTTP source or from a PROGMEM flash set at compile time +// - ``AudioDecoder`` handles decoding the audio file. 
All formats are limited to two channels and 16 bits per sample +// - FLAC +// - WAV +// - MP3 (based on the libhelix decoder - a random mp3 file may be incompatible) +// - ``AudioResampler`` handles converting the sample rate to the configured output sample rate and converting mono +// to stereo +// - The quality is not good, and it is slow! Please send audio at the configured sample rate to avoid these issues +// - Each task will always run once started, but they will not do anything until they are needed +// - FreeRTOS Event Groups make up the inter-task communication +// - The ``AudioPipeline`` sets up an output ring buffer for the Reader and Decoder parts. The next part/task +// automatically pulls from the previous ring buffer +// - The streams are mixed together in the ``AudioMixer`` task +// - Each stream has a corresponding input buffer that the ``AudioResampler`` feeds directly +// - Pausing the media stream is done here +// - Media stream ducking is done here +// - The output ring buffer feeds the ``speaker_task`` directly. It is kept small intentionally to avoid latency when +// pausing +// - Audio output is handled by the ``speaker_task``. It configures the I2S bus and copies audio from the mixer's +// output ring buffer to the DMA buffers +// - Media player commands are received by the ``control`` function. The commands are added to the +// ``media_control_command_queue_`` to be processed in the component's loop +// - Starting a stream initializes the appropriate pipeline or stops it if it is already running +// - Volume and mute commands are achieved by the ``mute``, ``unmute``, ``set_volume`` functions. They communicate +// directly with the DAC over I2C. +// - Volume commands are ignored if the media control queue is full to avoid crashing when the track wheel is spun +// fast +// - Pausing is sent to the ``AudioMixer`` task. It only affects the media stream. 
+// - The components main loop performs housekeeping: +// - It reads the media control queue and processes it directly +// - It watches the state of speaker and mixer tasks +// - It determines the overall state of the media player by considering the state of each pipeline +// - announcement playback takes highest priority -static const size_t SAMPLE_RATE_HZ = 16000; // 16 kHz static const size_t QUEUE_COUNT = 20; static const size_t DMA_BUFFER_COUNT = 4; static const size_t DMA_BUFFER_SIZE = 512; @@ -167,7 +210,10 @@ void NabuMediaPlayer::setup() { xTaskCreate(NabuMediaPlayer::speaker_task, "speaker_task", 3072, (void *) this, 23, &this->speaker_task_handle_); - this->get_dac_volume_(); + if (!this->get_dac_volume_().has_value() || !this->get_dac_mute_().has_value()) { + ESP_LOGE(TAG, "Couldn't communicate with DAC"); + this->mark_failed(); + } // if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, 0x01)) { // ESP_LOGE(TAG, "DAC failed to switch register page"); @@ -212,7 +258,7 @@ void NabuMediaPlayer::speaker_task(void *params) { i2s_driver_config_t config = { .mode = (i2s_mode_t) (this_speaker->parent_->get_i2s_mode() | I2S_MODE_TX), - .sample_rate = 16000, + .sample_rate = this_speaker->sample_rate_, .bits_per_sample = this_speaker->bits_per_sample_, .channel_format = I2S_CHANNEL_FMT_RIGHT_LEFT, .communication_format = I2S_COMM_FORMAT_STAND_I2S, @@ -347,13 +393,13 @@ void NabuMediaPlayer::watch_media_commands_() { make_unique(this->audio_mixer_.get(), AudioPipelineType::ANNOUNCEMENT); } - this->announcement_pipeline_->start(this->announcement_url_.value(), "ann", 7); + this->announcement_pipeline_->start(this->announcement_url_.value(), this->sample_rate_, "ann", 7); } else { if (this->media_pipeline_ == nullptr) { this->media_pipeline_ = make_unique(this->audio_mixer_.get(), AudioPipelineType::MEDIA); } - this->media_pipeline_->start(this->media_url_.value(), "media", 2); + this->media_pipeline_->start(this->media_url_.value(), this->sample_rate_, "media", 2); 
if (this->is_paused_) { CommandEvent command_event; @@ -371,13 +417,13 @@ void NabuMediaPlayer::watch_media_commands_() { make_unique(this->audio_mixer_.get(), AudioPipelineType::ANNOUNCEMENT); } - this->announcement_pipeline_->start(this->announcement_file_.value(), "ann", 7); + this->announcement_pipeline_->start(this->announcement_file_.value(), this->sample_rate_, "ann", 7); } else { if (this->media_pipeline_ == nullptr) { this->media_pipeline_ = make_unique(this->audio_mixer_.get(), AudioPipelineType::MEDIA); } - this->media_pipeline_->start(this->media_file_.value(), "media", 2); + this->media_pipeline_->start(this->media_file_.value(), this->sample_rate_, "media", 5); if (this->is_paused_) { CommandEvent command_event; @@ -405,7 +451,7 @@ void NabuMediaPlayer::watch_media_commands_() { this->is_paused_ = false; break; case media_player::MEDIA_PLAYER_COMMAND_PAUSE: - if (this->media_pipeline_state_ == AudioPipelineState::PLAYING) { + if (!this->is_paused_) { command_event.command = CommandEventType::PAUSE_MEDIA; this->audio_mixer_->send_command(&command_event); } @@ -510,12 +556,31 @@ void NabuMediaPlayer::loop() { if (this->announcement_pipeline_ != nullptr) this->announcement_pipeline_state_ = this->announcement_pipeline_->get_state(); + + if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_READING) { + ESP_LOGE(TAG, "Encountered an error reading the announcement file"); + } + if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_DECODING) { + ESP_LOGE(TAG, "Encountered an error decoding the announcement file"); + } + if (this->announcement_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) { + ESP_LOGE(TAG, "Encountered an error resampling the announcement file"); + } if (this->media_pipeline_ != nullptr) this->media_pipeline_state_ = this->media_pipeline_->get_state(); - if ((this->announcement_pipeline_state_ != AudioPipelineState::STOPPING) && - (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED)) { + 
if (this->media_pipeline_state_ == AudioPipelineState::ERROR_READING) { + ESP_LOGE(TAG, "Encountered an error reading the media file"); + } + if (this->media_pipeline_state_ == AudioPipelineState::ERROR_DECODING) { + ESP_LOGE(TAG, "Encountered an error decoding the media file"); + } + if (this->media_pipeline_state_ == AudioPipelineState::ERROR_RESAMPLING) { + ESP_LOGE(TAG, "Encountered an error resampling the media file"); + } + + if (this->announcement_pipeline_state_ != AudioPipelineState::STOPPED) { this->state = media_player::MEDIA_PLAYER_STATE_ANNOUNCING; if (this->is_idle_muted_ && !this->is_muted_) { // this->unmute_(); @@ -529,8 +594,7 @@ void NabuMediaPlayer::loop() { // this->mute_(); this->is_idle_muted_ = true; } - } else if ((this->media_pipeline_state_ == AudioPipelineState::STOPPING) || - (this->media_pipeline_state_ == AudioPipelineState::STOPPED)) { + } else if (this->media_pipeline_state_ == AudioPipelineState::STOPPED) { this->state = media_player::MEDIA_PLAYER_STATE_IDLE; if (!this->is_idle_muted_) { // this->mute_(); @@ -656,6 +720,31 @@ bool NabuMediaPlayer::set_volume_(float volume, bool publish) { return true; } +optional NabuMediaPlayer::get_dac_mute_(bool publish) { + if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, DAC_MUTE_PAGE)) { + ESP_LOGE(TAG, "DAC failed to switch to mute page registers"); + return {}; + } + + uint8_t dac_mute_left = 0; + uint8_t dac_mute_right = 0; + if (!this->read_byte(DAC_LEFT_MUTE_REGISTER, &dac_mute_left) || + !this->read_byte(DAC_RIGHT_MUTE_REGISTER, &dac_mute_right)) { + ESP_LOGE(TAG, "DAC failed to read mute status"); + return {}; + } + + bool is_muted = false; + if (dac_mute_left == DAC_MUTE_COMMAND && dac_mute_right == DAC_MUTE_COMMAND) { + is_muted = true; + } + + if (publish) { + this->is_muted_ = is_muted; + } + return is_muted; +} + bool NabuMediaPlayer::mute_() { if (!this->write_byte(DAC_PAGE_SELECTION_REGISTER, DAC_MUTE_PAGE)) { ESP_LOGE(TAG, "DAC failed to switch to mute page registers"); diff 
--git a/esphome/components/nabu/nabu_media_player.h b/esphome/components/nabu/nabu_media_player.h index 4ff4260..149ee6b 100644 --- a/esphome/components/nabu/nabu_media_player.h +++ b/esphome/components/nabu/nabu_media_player.h @@ -57,15 +57,19 @@ class NabuMediaPlayer : public Component, void set_dout_pin(uint8_t pin) { this->dout_pin_ = pin; } void set_bits_per_sample(i2s_bits_per_sample_t bits_per_sample) { this->bits_per_sample_ = bits_per_sample; } + void set_sample_rate(uint32_t sample_rate) { this->sample_rate_ = sample_rate; } protected: // Receives commands from HA or from the voice assistant component // Sends commands to the media_control_commanda_queue_ void control(const media_player::MediaPlayerCall &call) override; - /// @return volume read from DAC between 0.0 and 1.0, if successful + /// @return Volume read from DAC between 0.0 and 1.0, if successful. Updates volume_ if publish is true. optional get_dac_volume_(bool publish = true); + /// @return Mute status read from DAC, if successful. Updates is_muted_ if publish is true. + optional get_dac_mute_(bool publish = true); + /// @return true if I2C writes were successful bool set_volume_(float volume, bool publish = true); @@ -101,6 +105,7 @@ class NabuMediaPlayer : public Component, QueueHandle_t speaker_command_queue_; i2s_bits_per_sample_t bits_per_sample_; + uint32_t sample_rate_; uint8_t dout_pin_{0}; bool is_paused_{false};