diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index b39c4210..53ecb5ce 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -66,4 +66,9 @@ set(LIBSNDFILE ${LIBSNDFILE_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/libsnd # fftw set(LIBFFTW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/fftw) set(LIBFFTW_INCLUDES ${LIBFFTW_PATH}/include PARENT_SCOPE) -set(LIBFFTW ${LIBFFTW_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/libfftw3f.a PARENT_SCOPE) \ No newline at end of file +set(LIBFFTW ${LIBFFTW_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/libfftw3f.a PARENT_SCOPE) + +# kaldi_native_fbank +set(LIBKALDI_NATIVE_FBANK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/kaldi_native_fbank) +set(LIBKALDI_NATIVE_FBANK_INCLUDES ${LIBKALDI_NATIVE_FBANK_PATH}/include PARENT_SCOPE) +set(LIBKALDI_NATIVE_FBANK ${LIBKALDI_NATIVE_FBANK_PATH}/${CMAKE_SYSTEM_NAME}/${TARGET_LIB_ARCH}/libkaldi-native-fbank-core.a PARENT_SCOPE) \ No newline at end of file diff --git a/3rdparty/allocator/dma/dma_alloc.hpp b/3rdparty/allocator/dma/dma_alloc.hpp index 53cc7e83..9b11170a 100644 --- a/3rdparty/allocator/dma/dma_alloc.hpp +++ b/3rdparty/allocator/dma/dma_alloc.hpp @@ -16,6 +16,9 @@ * limitations under the License. */ +#ifndef _DMA_ALLOC_HPP_ +#define _DMA_ALLOC_HPP_ + #include #include #include @@ -46,9 +49,16 @@ #include #include -#include "dma_alloc.h" +// #include "dma_alloc.h" #include "RgaUtils.h" +#define DMA_HEAP_UNCACHE_PATH "/dev/dma_heap/system-uncached" +#define DMA_HEAP_PATH "/dev/dma_heap/system" +#define DMA_HEAP_DMA32_UNCACHE_PATCH "/dev/dma_heap/system-uncached-dma32" +#define DMA_HEAP_DMA32_PATCH "/dev/dma_heap/system-dma32" +#define CMA_HEAP_UNCACHE_PATH "/dev/dma_heap/cma-uncached" +#define RV1106_CMA_HEAP_PATH "/dev/rk_dma_heap/rk-dma-heap-cma" + typedef unsigned long long __u64; typedef unsigned int __u32; @@ -87,21 +97,21 @@ struct dma_buf_sync { #define CMA_HEAP_SIZE 1024 * 1024 -int dma_sync_device_to_cpu(int fd) { +static int dma_sync_device_to_cpu(int fd) { struct dma_buf_sync sync = {0}; sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW; return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync); } -int dma_sync_cpu_to_device(int fd) { +static int dma_sync_cpu_to_device(int fd) { struct dma_buf_sync sync = {0}; sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW; return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync); } -int dma_buf_alloc(const char *path, size_t size, int *fd, void **va) { +static int dma_buf_alloc(const char *path, size_t size, int *fd, void **va) { int ret; int prot; void *mmap_va; @@ -149,7 +159,7 @@ int dma_buf_alloc(const char *path, size_t size, int *fd, void **va) { return 0; } -void dma_buf_free(size_t size, int *fd, void *va) { +static void dma_buf_free(size_t size, int *fd, void *va) { int len; len = size; @@ -159,5 +169,4 @@ void dma_buf_free(size_t size, int *fd, void *va) { *fd = -1; } - - +#endif // _DMA_ALLOC_HPP_ diff --git a/3rdparty/kaldi_native_fbank/Android/arm64-v8a/libkaldi-native-fbank-core.a b/3rdparty/kaldi_native_fbank/Android/arm64-v8a/libkaldi-native-fbank-core.a new file mode 100644 index 00000000..212b4cc2 Binary files /dev/null and b/3rdparty/kaldi_native_fbank/Android/arm64-v8a/libkaldi-native-fbank-core.a differ diff --git a/3rdparty/kaldi_native_fbank/Android/armeabi-v7a/libkaldi-native-fbank-core.a b/3rdparty/kaldi_native_fbank/Android/armeabi-v7a/libkaldi-native-fbank-core.a new file mode 100644 index 00000000..8c7622d0 Binary files /dev/null and b/3rdparty/kaldi_native_fbank/Android/armeabi-v7a/libkaldi-native-fbank-core.a differ diff --git 
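For context, a minimal usage sketch of the dma_alloc.hpp helpers as they are declared above; the heap choice, buffer size, and error handling are illustrative assumptions, not part of this patch:

```cpp
// Illustrative only: allocate from a DMA heap, touch the buffer on the CPU,
// sync it for a device, then release it.
#include <cstddef>
#include <cstring>
#include "dma_alloc.hpp"

int demo_dma_buffer(size_t size) {
  int fd = -1;
  void *va = nullptr;
  if (dma_buf_alloc(DMA_HEAP_PATH, size, &fd, &va) != 0) {
    return -1;                      // heap node missing or mmap failed
  }
  std::memset(va, 0, size);         // CPU writes ...
  dma_sync_cpu_to_device(fd);       // ... then flush so the device sees them
  /* a device (NPU/RGA) would consume the buffer via fd here */
  dma_sync_device_to_cpu(fd);       // make device writes visible to the CPU
  dma_buf_free(size, &fd, va);      // munmap + close, resets fd to -1
  return 0;
}
```

The new include guard and the static qualifiers on the helpers let several translation units include this header without duplicate-symbol errors.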
a/3rdparty/kaldi_native_fbank/Linux/aarch64/libkaldi-native-fbank-core.a b/3rdparty/kaldi_native_fbank/Linux/aarch64/libkaldi-native-fbank-core.a new file mode 100644 index 00000000..c5b9a6cf Binary files /dev/null and b/3rdparty/kaldi_native_fbank/Linux/aarch64/libkaldi-native-fbank-core.a differ diff --git a/3rdparty/kaldi_native_fbank/Linux/armhf/libkaldi-native-fbank-core.a b/3rdparty/kaldi_native_fbank/Linux/armhf/libkaldi-native-fbank-core.a new file mode 100644 index 00000000..05976d8c Binary files /dev/null and b/3rdparty/kaldi_native_fbank/Linux/armhf/libkaldi-native-fbank-core.a differ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-fbank.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-fbank.h new file mode 100644 index 00000000..84005e0d --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-fbank.h @@ -0,0 +1,136 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-fbank.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ + +#include +#include +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "kaldi-native-fbank/csrc/mel-computations.h" +#include "kaldi-native-fbank/csrc/rfft.h" + +namespace knf { + +struct FbankOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + // append an extra dimension with energy to the filter banks + bool use_energy = false; + float energy_floor = 0.0f; // active iff use_energy==true + + // If true, compute log_energy before preemphasis and windowing + // If false, compute log_energy after preemphasis ans windowing + bool raw_energy = true; // active iff use_energy==true + + // If true, put energy last (if using energy) + // If false, put energy first + bool htk_compat = false; // active iff use_energy==true + + // if true (default), produce log-filterbank, else linear + bool use_log_fbank = true; + + // if true (default), use power in filterbank + // analysis, else magnitude. 
+ bool use_power = true; + + FbankOptions() { mel_opts.num_bins = 23; } + + std::string ToString() const { + std::ostringstream os; + os << "frame_opts: \n"; + os << frame_opts << "\n"; + os << "\n"; + + os << "mel_opts: \n"; + os << mel_opts << "\n"; + + os << "use_energy: " << use_energy << "\n"; + os << "energy_floor: " << energy_floor << "\n"; + os << "raw_energy: " << raw_energy << "\n"; + os << "htk_compat: " << htk_compat << "\n"; + os << "use_log_fbank: " << use_log_fbank << "\n"; + os << "use_power: " << use_power << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const FbankOptions &opts); + +class FbankComputer { + public: + using Options = FbankOptions; + + explicit FbankComputer(const FbankOptions &opts); + ~FbankComputer(); + + int32_t Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + // if true, compute log_energy_pre_window but after dithering and dc removal + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + const FbankOptions &GetOptions() const { return opts_; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. It should be pre-allocated. + */ + void Compute(float signal_raw_log_energy, float vtln_warp, + std::vector *signal_frame, float *feature); + + private: + const MelBanks *GetMelBanks(float vtln_warp); + + FbankOptions opts_; + float log_energy_floor_; + std::map mel_banks_; // float is VTLN coefficient. + Rfft rfft_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-functions.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-functions.h new file mode 100644 index 00000000..b2216224 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-functions.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-functions.h +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ + +#include +namespace knf { + +// ComputePowerSpectrum converts a complex FFT (as produced by the FFT +// functions in csrc/rfft.h), and converts it into +// a power spectrum. If the complex FFT is a vector of size n (representing +// half of the complex FFT of a real signal of size n, as described there), +// this function computes in the first (n/2) + 1 elements of it, the +// energies of the fft bins from zero to the Nyquist frequency. Contents of the +// remaining (n/2) - 1 elements are undefined at output. + +void ComputePowerSpectrum(std::vector *complex_fft); + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-mfcc.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-mfcc.h new file mode 100644 index 00000000..32dc772e --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-mfcc.h @@ -0,0 +1,151 @@ +/** + * Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University + * 2014-2016 Johns Hopkins University (author: Daniel Povey) + * Copyright 2024 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-mfcc.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_MFCC_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_MFCC_H_ + +#include +#include +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "kaldi-native-fbank/csrc/mel-computations.h" +#include "kaldi-native-fbank/csrc/rfft.h" + +namespace knf { + +/// MfccOptions contains basic options for computing MFCC features. +// (this class is copied from kaldi) +struct MfccOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + + // Number of cepstra in MFCC computation (including C0) + int32_t num_ceps = 13; + + // Use energy (not C0) in MFCC computation + bool use_energy = true; + + // Floor on energy (absolute, not relative) in MFCC + // computation. Only makes a difference if use_energy=true; + // only necessary if dither=0.0. + // Suggested values: 0.1 or 1.0 + float energy_floor = 0.0; + + // If true, compute energy before preemphasis and windowing + bool raw_energy = true; + + // Constant that controls scaling of MFCCs + float cepstral_lifter = 22.0; + + // If true, put energy or C0 last and use a factor of + // sqrt(2) on C0. 
+ // Warning: not sufficient to get HTK compatible features + // (need to change other parameters) + bool htk_compat = false; + + MfccOptions() { mel_opts.num_bins = 23; } + + std::string ToString() const { + std::ostringstream os; + os << "MfccOptions("; + os << "frame_opts=" << frame_opts.ToString() << ", "; + os << "mel_opts=" << mel_opts.ToString() << ", "; + + os << "num_ceps=" << num_ceps << ", "; + os << "use_energy=" << (use_energy ? "True" : "False") << ", "; + os << "energy_floor=" << energy_floor << ", "; + os << "raw_energy=" << (raw_energy ? "True" : "False") << ", "; + os << "cepstral_lifter=" << cepstral_lifter << ", "; + os << "htk_compat=" << (htk_compat ? "True" : "False") << ")"; + + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const MfccOptions &opts); + +class MfccComputer { + public: + using Options = MfccOptions; + + explicit MfccComputer(const MfccOptions &opts); + ~MfccComputer(); + + int32_t Dim() const { return opts_.num_ceps; } + + // if true, compute log_energy_pre_window but after dithering and dc removal + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + const MfccOptions &GetOptions() const { return opts_; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. It should be pre-allocated. + */ + void Compute(float signal_raw_log_energy, float vtln_warp, + std::vector *signal_frame, float *feature); + + private: + const MelBanks *GetMelBanks(float vtln_warp); + + MfccOptions opts_; + float log_energy_floor_; + std::map mel_banks_; // float is VTLN coefficient. 
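For orientation, a hedged configuration sketch using the options above; the values are simply the conventional defaults written out explicitly:

```cpp
// Illustrative 13-dimensional MFCC setup (not part of this patch).
#include "kaldi-native-fbank/csrc/feature-mfcc.h"

knf::MfccOptions MakeMfccOptions() {
  knf::MfccOptions opts;
  opts.num_ceps = 13;             // C0..C12, so MfccComputer::Dim() == 13
  opts.mel_opts.num_bins = 23;    // triangular mel bins feeding the DCT
  opts.use_energy = true;         // use log-energy in place of C0
  opts.cepstral_lifter = 22.0f;   // HTK-style liftering constant
  return opts;
}
```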
+ Rfft rfft_; + + // temp buffer of size num_mel_bins = opts.mel_opts.num_bins + std::vector mel_energies_; + + // opts_.num_ceps + std::vector lifter_coeffs_; + + // [num_ceps][num_mel_bins] + std::vector dct_matrix_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_MFCC_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-window.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-window.h new file mode 100644 index 00000000..88b509e8 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/feature-window.h @@ -0,0 +1,178 @@ +// kaldi-native-fbank/csrc/feature-window.h +// +// Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + +// This file is copied/modified from kaldi/src/feat/feature-window.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_ + +#include +#include +#include +#include + +#include "kaldi-native-fbank/csrc/log.h" + +namespace knf { + +inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) { + // copied from kaldi/src/base/kaldi-math.cc + KNF_CHECK_GT(n, 0); + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + return n + 1; +} + +struct FrameExtractionOptions { + float samp_freq = 16000; + float frame_shift_ms = 10.0f; // in milliseconds. + float frame_length_ms = 25.0f; // in milliseconds. + + float dither = 0.00003f; // Amount of dithering, 0.0 means no dither. + // Value 0.00003f is equivalent to 1.0 in kaldi. + + float preemph_coeff = 0.97f; // Preemphasis coefficient. + bool remove_dc_offset = true; // Subtract mean of wave before FFT. + std::string window_type = "povey"; // e.g. Hamming window + // May be "hamming", "rectangular", "povey", "hanning", "hann", "sine", + // "blackman". + // "povey" is a window I made to be similar to Hamming but to go to zero at + // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the + // Hamming window makes sense as a windowing function. + bool round_to_power_of_two = true; + float blackman_coeff = 0.42f; + bool snip_edges = true; + // bool allow_downsample = false; + // bool allow_upsample = false; + + int32_t WindowShift() const { + return static_cast(samp_freq * 0.001f * frame_shift_ms); + } + int32_t WindowSize() const { + return static_cast(samp_freq * 0.001f * frame_length_ms); + } + int32_t PaddedWindowSize() const { + return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) + : WindowSize()); + } + std::string ToString() const { + std::ostringstream os; +#define KNF_PRINT(x) os << #x << ": " << x << "\n" + KNF_PRINT(samp_freq); + KNF_PRINT(frame_shift_ms); + KNF_PRINT(frame_length_ms); + KNF_PRINT(dither); + KNF_PRINT(preemph_coeff); + KNF_PRINT(remove_dc_offset); + KNF_PRINT(window_type); + KNF_PRINT(round_to_power_of_two); + KNF_PRINT(blackman_coeff); + KNF_PRINT(snip_edges); + // KNF_PRINT(allow_downsample); + // KNF_PRINT(allow_upsample); +#undef KNF_PRINT + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts); + +class FeatureWindowFunction { + public: + FeatureWindowFunction() = default; + explicit FeatureWindowFunction(const FrameExtractionOptions &opts); + /** + * @param wave Pointer to a 1-D array of shape [window_size]. + * It is modified in-place: wave[i] = wave[i] * window_[i]. 
+ * @param + */ + void Apply(float *wave) const; + const std::vector &GetWindow() const { return window_; } + + private: + std::vector window_; // of size opts.WindowSize() +}; + +int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts); + +/** + This function returns the number of frames that we can extract from a wave + file with the given number of samples in it (assumed to have the same + sampling rate as specified in 'opts'). + + @param [in] num_samples The number of samples in the wave file. + @param [in] opts The frame-extraction options class + + @param [in] flush True if we are asserting that this number of samples + is 'all there is', false if we expecting more data to possibly come in. This + only makes a difference to the answer + if opts.snip_edges== false. For offline feature extraction you always want + flush == true. In an online-decoding context, once you know (or decide) that + no more data is coming in, you'd call it with flush == true at the end to + flush out any remaining data. +*/ +int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts, + bool flush = true); + +/* + ExtractWindow() extracts a windowed frame of waveform (possibly with a + power-of-two, padded size, depending on the config), including all the + processing done by ProcessWindow(). + + @param [in] sample_offset If 'wave' is not the entire waveform, but + part of it to the left has been discarded, then the + number of samples prior to 'wave' that we have + already discarded. Set this to zero if you are + processing the entire waveform in one piece, or + if you get 'no matching function' compilation + errors when updating the code. + @param [in] wave The waveform + @param [in] f The frame index to be extracted, with + 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true) + @param [in] opts The options class to be used + @param [in] window_function The windowing function, as derived from the + options class. + @param [out] window The windowed, possibly-padded waveform to be + extracted. Will be resized as needed. + @param [out] log_energy_pre_window If non-NULL, the log-energy of + the signal prior to pre-emphasis and multiplying by + the windowing function will be written to here. +*/ +void ExtractWindow(int64_t sample_offset, const std::vector &wave, + int32_t f, const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + std::vector *window, + float *log_energy_pre_window = nullptr); + +/** + This function does all the windowing steps after actually + extracting the windowed signal: depending on the + configuration, it does dithering, dc offset removal, + preemphasis, and multiplication by the windowing function. + @param [in] opts The options class to be used + @param [in] window_function The windowing function-- should have + been initialized using 'opts'. + @param [in,out] window A vector of size opts.WindowSize(). Note: + it will typically be a sub-vector of a larger vector of size + opts.PaddedWindowSize(), with the remaining samples zero, + as the FFT code is more efficient if it operates on data with + power-of-two size. + @param [out] log_energy_pre_window If non-NULL, then after dithering and + DC offset removal, this function will write to this pointer the log of + the total energy (i.e. sum-squared) of the frame. 
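To make the frame bookkeeping concrete, a short sketch with the default options (16 kHz, 25 ms frames, 10 ms shift, snip_edges == true); the numbers in the comments follow directly from the formulas in FrameExtractionOptions:

```cpp
// Illustrative arithmetic only (not part of this patch).
#include <cstdint>
#include "kaldi-native-fbank/csrc/feature-window.h"

void FrameBookkeeping() {
  knf::FrameExtractionOptions fopts;              // defaults as declared above
  int32_t shift  = fopts.WindowShift();           // 16000 * 0.001 * 10 = 160 samples
  int32_t size   = fopts.WindowSize();            // 16000 * 0.001 * 25 = 400 samples
  int32_t padded = fopts.PaddedWindowSize();      // next power of two  = 512 (FFT size)
  int32_t frames = knf::NumFrames(16000, fopts);  // 1 + (16000 - 400) / 160 = 98 frames per second
  (void)shift; (void)size; (void)padded; (void)frames;
}
```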
+ */ +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, float *window, + float *log_energy_pre_window = nullptr); + +// Compute the inner product of two vectors +float InnerProduct(const float *a, const float *b, int32_t n); + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_FEATURE_WINDOW_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/kaldi-math.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/kaldi-math.h new file mode 100644 index 00000000..4db3f831 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/kaldi-math.h @@ -0,0 +1,52 @@ +// kaldi-native-fbank/csrc/kaldi-math.h +// +// Copyright (c) 2024 Brno University of Technology (authors: Karel Vesely) + +// This file is an excerpt from kaldi/src/base/kaldi-math.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_KALDI_MATH_H_ +#define KALDI_NATIVE_FBANK_CSRC_KALDI_MATH_H_ + +#include // logf, sqrtf, cosf +#include +#include // RAND_MAX + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 +#endif + +#ifndef M_SQRT2 +#define M_SQRT2 1.4142135623730950488016887 +#endif + +namespace knf { + +inline float Log(float x) { return logf(x); } + +// Returns a random integer between 0 and RAND_MAX, inclusive +int Rand(struct RandomState *state = NULL); + +// State for thread-safe random number generator +struct RandomState { + RandomState(); + unsigned seed; +}; + +/// Returns a random number strictly between 0 and 1. +inline float RandUniform(struct RandomState *state = NULL) { + return static_cast((Rand(state) + 1.0) / (RAND_MAX + 2.0)); +} + +inline float RandGauss(struct RandomState *state = NULL) { + return static_cast(sqrtf(-2 * Log(RandUniform(state))) * + cosf(2 * M_PI * RandUniform(state))); +} + +void Sqrt(float *in_out, int32_t n); + +} // namespace knf +#endif // KALDI_NATIVE_FBANK_CSRC_KALDI_MATH_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/log.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/log.h new file mode 100644 index 00000000..bd21cc35 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/log.h @@ -0,0 +1,383 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// The content in this file is copied/modified from +// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h +#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_ +#define KALDI_NATIVE_FBANK_CSRC_LOG_H_ + +#include + +#include // NOLINT +#include +#include + +namespace knf { + +#if KNF_ENABLE_CHECK + +#if defined(NDEBUG) +constexpr bool kDisableDebug = true; +#else +constexpr bool kDisableDebug = false; +#endif + +enum class LogLevel { + kTrace = 0, + kDebug = 1, + kInfo = 2, + kWarning = 3, + kError = 4, + kFatal = 5, // print message and abort the program +}; + +// They are used in KNF_LOG(xxx), so their names +// do not follow the google c++ code style +// +// You can use them in the following way: +// +// KNF_LOG(TRACE) << "some message"; +// KNF_LOG(DEBUG) << "some message"; +#ifndef _MSC_VER +constexpr LogLevel TRACE = LogLevel::kTrace; +constexpr LogLevel DEBUG = LogLevel::kDebug; +constexpr LogLevel INFO = LogLevel::kInfo; +constexpr LogLevel WARNING = LogLevel::kWarning; +constexpr LogLevel ERROR = LogLevel::kError; +constexpr LogLevel FATAL = LogLevel::kFatal; +#else +#define TRACE LogLevel::kTrace +#define DEBUG LogLevel::kDebug +#define INFO LogLevel::kInfo +#define WARNING LogLevel::kWarning +#define ERROR LogLevel::kError +#define FATAL LogLevel::kFatal +#endif + +std::string GetStackTrace(); + +/* Return the current log level. + + + If the current log level is TRACE, then all logged messages are printed out. + + If the current log level is DEBUG, log messages with "TRACE" level are not + shown and all other levels are printed out. + + Similarly, if the current log level is INFO, log message with "TRACE" and + "DEBUG" are not shown and all other levels are printed out. + + If it is FATAL, then only FATAL messages are shown. + */ +inline LogLevel GetCurrentLogLevel() { + static LogLevel log_level = INFO; + static std::once_flag init_flag; + std::call_once(init_flag, []() { + const char *env_log_level = std::getenv("KNF_LOG_LEVEL"); + if (env_log_level == nullptr) return; + + std::string s = env_log_level; + if (s == "TRACE") + log_level = TRACE; + else if (s == "DEBUG") + log_level = DEBUG; + else if (s == "INFO") + log_level = INFO; + else if (s == "WARNING") + log_level = WARNING; + else if (s == "ERROR") + log_level = ERROR; + else if (s == "FATAL") + log_level = FATAL; + else + fprintf(stderr, + "Unknown KNF_LOG_LEVEL: %s" + "\nSupported values are: " + "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL", + s.c_str()); + }); + return log_level; +} + +inline bool EnableAbort() { + static std::once_flag init_flag; + static bool enable_abort = false; + std::call_once(init_flag, []() { + enable_abort = (std::getenv("KNF_ABORT") != nullptr); + }); + return enable_abort; +} + +class Logger { + public: + Logger(const char *filename, const char *func_name, uint32_t line_num, + LogLevel level) + : filename_(filename), + func_name_(func_name), + line_num_(line_num), + level_(level) { + cur_level_ = GetCurrentLogLevel(); + fprintf(stderr, "here\n"); + switch (level) { + case TRACE: + if (cur_level_ <= TRACE) fprintf(stderr, "[T] "); + break; + case DEBUG: + if (cur_level_ <= DEBUG) fprintf(stderr, "[D] "); + break; + case INFO: + if (cur_level_ <= INFO) fprintf(stderr, "[I] "); + break; + case WARNING: + if (cur_level_ <= WARNING) fprintf(stderr, "[W] "); + break; + case ERROR: + if (cur_level_ <= ERROR) fprintf(stderr, "[E] "); + break; + case FATAL: + if (cur_level_ <= FATAL) fprintf(stderr, "[F] "); + break; + } + + if (cur_level_ <= level_) { + fprintf(stderr, "%s:%u:%s ", filename, 
line_num, func_name); + } + } + + ~Logger() noexcept(false) { + static constexpr const char *kErrMsg = R"( + Some bad things happened. Please read the above error messages and stack + trace. If you are using Python, the following command may be helpful: + + gdb --args python /path/to/your/code.py + + (You can use `gdb` to debug the code. Please consider compiling + a debug version of KNF.). + + If you are unable to fix it, please open an issue at: + + https://github.com/csukuangfj/kaldi-native-fbank/issues/new + )"; + fprintf(stderr, "\n"); + if (level_ == FATAL) { + std::string stack_trace = GetStackTrace(); + if (!stack_trace.empty()) { + fprintf(stderr, "\n\n%s\n", stack_trace.c_str()); + } + + fflush(nullptr); + +#ifndef __ANDROID_API__ + if (EnableAbort()) { + // NOTE: abort() will terminate the program immediately without + // printing the Python stack backtrace. + abort(); + } + + throw std::runtime_error(kErrMsg); +#else + abort(); +#endif + } + } + + const Logger &operator<<(bool b) const { + if (cur_level_ <= level_) { + fprintf(stderr, b ? "true" : "false"); + } + return *this; + } + + const Logger &operator<<(int8_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(const char *s) const { + if (cur_level_ <= level_) fprintf(stderr, "%s", s); + return *this; + } + + const Logger &operator<<(int32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(uint32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%u", i); + return *this; + } + + const Logger &operator<<(uint64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%llu", (long long unsigned int)i); // NOLINT + return *this; + } + + const Logger &operator<<(int64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%lli", (long long int)i); // NOLINT + return *this; + } + + const Logger &operator<<(float f) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", f); + return *this; + } + + const Logger &operator<<(double d) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", d); + return *this; + } + + template + const Logger &operator<<(const T &t) const { + // require T overloads operator<< + std::ostringstream os; + os << t; + return *this << os.str().c_str(); + } + + // specialization to fix compile error: `stringstream << nullptr` is ambiguous + const Logger &operator<<(const std::nullptr_t &null) const { + if (cur_level_ <= level_) *this << "(null)"; + return *this; + } + + private: + const char *filename_; + const char *func_name_; + uint32_t line_num_; + LogLevel level_; + LogLevel cur_level_; +}; +#endif // KNF_ENABLE_CHECK + +class Voidifier { + public: +#if KNF_ENABLE_CHECK + void operator&(const Logger &) const {} +#endif +}; +#if !defined(KNF_ENABLE_CHECK) +template +const Voidifier &operator<<(const Voidifier &v, T &&) { + return v; +} +#endif + +} // namespace knf + +#define KNF_STATIC_ASSERT(x) static_assert(x, "") + +#ifdef KNF_ENABLE_CHECK + +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \ + defined(__PRETTY_FUNCTION__) +// for clang and GCC +#define KNF_FUNC __PRETTY_FUNCTION__ +#else +// for other compilers +#define KNF_FUNC __func__ +#endif + +#define KNF_CHECK(x) \ + (x) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " + +// WARNING: x and y may be evaluated multiple times, but this happens only +// when the check fails. 
Since the program aborts if it fails, we don't think +// the extra evaluation of x and y matters. +// +// CAUTION: we recommend the following use case: +// +// auto x = Foo(); +// auto y = Bar(); +// KNF_CHECK_EQ(x, y) << "Some message"; +// +// And please avoid +// +// KNF_CHECK_EQ(Foo(), Bar()); +// +// if `Foo()` or `Bar()` causes some side effects, e.g., changing some +// local static variables or global variables. +#define _KNF_CHECK_OP(x, y, op) \ + ((x)op(y)) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " << #op << " " << #y \ + << " (" << (x) << " vs. " << (y) << ") " + +#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==) +#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=) +#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <) +#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=) +#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >) +#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=) + +#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x) + +// ------------------------------------------------------------ +// For debug check +// ------------------------------------------------------------ +// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank, +// the following macros are in fact empty and does nothing. + +#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x) + +#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y) + +#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y) + +#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y) + +#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y) + +#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y) + +#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y) + +#define KNF_DLOG(x) \ + ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x) + +#else + +#define KNF_CHECK(x) ::knf::Voidifier() +#define KNF_LOG(x) ::knf::Voidifier() + +#define KNF_CHECK_EQ(x, y) ::knf::Voidifier() +#define KNF_CHECK_NE(x, y) ::knf::Voidifier() +#define KNF_CHECK_LT(x, y) ::knf::Voidifier() +#define KNF_CHECK_LE(x, y) ::knf::Voidifier() +#define KNF_CHECK_GT(x, y) ::knf::Voidifier() +#define KNF_CHECK_GE(x, y) ::knf::Voidifier() + +#define KNF_DCHECK(x) ::knf::Voidifier() +#define KNF_DLOG(x) ::knf::Voidifier() +#define KNF_DCHECK_EQ(x, y) ::knf::Voidifier() +#define KNF_DCHECK_NE(x, y) ::knf::Voidifier() +#define KNF_DCHECK_LT(x, y) ::knf::Voidifier() +#define KNF_DCHECK_LE(x, y) ::knf::Voidifier() +#define KNF_DCHECK_GT(x, y) ::knf::Voidifier() +#define KNF_DCHECK_GE(x, y) ::knf::Voidifier() + +#endif // KNF_CHECK_NE + +#endif // KALDI_NATIVE_FBANK_CSRC_LOG_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/mel-computations.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/mel-computations.h new file mode 100644 index 00000000..c81ce97c --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/mel-computations.h @@ -0,0 +1,182 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// This file is copied/modified from kaldi/src/feat/mel-computations.h +#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ +#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ + +#include +#include +#include +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" + +namespace knf { +struct FrameExtractionOptions; + +struct MelBanksOptions { + int32_t num_bins = 25; // e.g. 25; number of triangular bins + float low_freq = 20; // e.g. 20; lower frequency cutoff + + // an upper frequency cutoff; 0 -> no cutoff, negative + // ->added to the Nyquist frequency to get the cutoff. + float high_freq = 0; + + float vtln_low = 100; // vtln lower cutoff of warping function. + + // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + float vtln_high = -500; + + bool debug_mel = false; + // htk_mode is a "hidden" config, it does not show up on command line. + // Enables more exact compatibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. + bool htk_mode = false; + + // Note that if you set is_librosa, you probably need to set + // low_freq to 0. + // Please see + // https://librosa.org/doc/main/generated/librosa.filters.mel.html + bool is_librosa = false; + + // used only when is_librosa=true + // Possible values: "", slaney. We don't support a numeric value here, but + // it can be added on demand. 
+ // See https://librosa.org/doc/main/generated/librosa.filters.mel.html + std::string norm = "slaney"; + + std::string ToString() const { + std::ostringstream os; + os << "num_bins: " << num_bins << "\n"; + os << "low_freq: " << low_freq << "\n"; + os << "high_freq: " << high_freq << "\n"; + os << "vtln_low: " << vtln_low << "\n"; + os << "vtln_high: " << vtln_high << "\n"; + os << "debug_mel: " << debug_mel << "\n"; + os << "htk_mode: " << htk_mode << "\n"; + os << "is_librosa: " << is_librosa << "\n"; + os << "norm: " << norm << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts); + +class MelBanks { + public: + // see also https://en.wikipedia.org/wiki/Mel_scale + // htk, mel to hz + static inline float InverseMelScale(float mel_freq) { + return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); + } + + // htk, hz to mel + static inline float MelScale(float freq) { + return 1127.0f * logf(1.0f + freq / 700.0f); + } + + // slaney, mel to hz + static inline float InverseMelScaleSlaney(float mel_freq) { + if (mel_freq <= 15) { + return 200.0f / 3 * mel_freq; + } + + // return 1000 * expf((mel_freq - 15) * logf(6.4f) / 27); + + // Note: log(6.4)/27 = 0.06875177742094911 + + return 1000 * expf((mel_freq - 15) * 0.06875177742094911f); + } + + // slaney, hz to mel + static inline float MelScaleSlaney(float freq) { + if (freq <= 1000) { + return freq * 3 / 200.0f; + } + + // return 15 + 27 * logf(freq / 1000) / logf(6.4f) + // + // Note: 27/log(6.4) = 14.545078505785561 + + return 15 + 14.545078505785561f * logf(freq / 1000); + } + + static float VtlnWarpFreq( + float vtln_low_cutoff, + float vtln_high_cutoff, // discontinuities in warp func + float low_freq, + float high_freq, // upper+lower frequency cutoffs in + // the mel computation + float vtln_warp_factor, float freq); + + static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff, + float low_freq, float high_freq, + float vtln_warp_factor, float mel_freq); + + // TODO(fangjun): Remove vtln_warp_factor + MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, float vtln_warp_factor); + + // Initialize with a 2-d weights matrix + // @param weights Pointer to the start address of the matrix + // @param num_rows It equls to number of mel bins + // @param num_cols It equals to (number of fft bins)/2+1 + MelBanks(const float *weights, int32_t num_rows, int32_t num_cols); + + /// Compute Mel energies (note: not log energies). + /// At input, "fft_energies" contains the FFT energies (not log). + /// + /// @param fft_energies 1-D array of size num_fft_bins/2+1 + /// @param mel_energies_out 1-D array of size num_mel_bins + void Compute(const float *fft_energies, float *mel_energies_out) const; + + int32_t NumBins() const { return bins_.size(); } + + private: + // for kaldi-compatible + void InitKaldiMelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + float vtln_warp_factor); + + // for librosa-compatible + // See https://librosa.org/doc/main/generated/librosa.filters.mel.html + void InitLibrosaMelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, + float vtln_warp_factor); + + private: + // the "bins_" vector is a vector, one for each bin, of a pair: + // (the first nonzero fft-bin), (the vector of weights). 
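As a quick numeric check of the HTK mel-scale formulas above (illustrative only):

```cpp
#include <cstdio>
#include "kaldi-native-fbank/csrc/mel-computations.h"

// 1127 * ln(1 + 1000/700) is roughly 1000, so 1 kHz maps to about 1000 mel.
int main() {
  float hz   = 1000.0f;
  float mel  = knf::MelBanks::MelScale(hz);          // ~999.99 mel
  float back = knf::MelBanks::InverseMelScale(mel);  // ~1000 Hz again
  std::printf("%.2f Hz -> %.2f mel -> %.2f Hz\n", hz, mel, back);
  return 0;
}
```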
+ std::vector>> bins_; + + // TODO(fangjun): Remove debug_ and htk_mode_ + bool debug_ = false; + bool htk_mode_ = false; +}; + +// Compute liftering coefficients (scaling on cepstral coeffs) +// coeffs are numbered slightly differently from HTK: the zeroth +// index is C0, which is not affected. +void ComputeLifterCoeffs(float Q, std::vector *coeffs); + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/online-feature.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/online-feature.h new file mode 100644 index 00000000..3d42fd49 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/online-feature.h @@ -0,0 +1,153 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content in this file is copied/modified from +// This file is copied/modified from kaldi/src/feat/online-feature.h +#ifndef KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ +#define KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ + +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-fbank.h" +#include "kaldi-native-fbank/csrc/feature-mfcc.h" +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "kaldi-native-fbank/csrc/whisper-feature.h" + +namespace knf { + +/// This class serves as a storage for feature vectors with an option to limit +/// the memory usage by removing old elements. The deleted frames indices are +/// "remembered" so that regardless of the MAX_ITEMS setting, the user always +/// provides the indices as if no deletion was being performed. +/// This is useful when processing very long recordings which would otherwise +/// cause the memory to eventually blow up when the features are not being +/// removed. +class RecyclingVector { + public: + /// By default it does not remove any elements. + explicit RecyclingVector(int32_t items_to_hold = -1); + + ~RecyclingVector() = default; + RecyclingVector(const RecyclingVector &) = delete; + RecyclingVector &operator=(const RecyclingVector &) = delete; + + // The pointer is owned by RecyclingVector + // Users should not free it + const float *At(int32_t index) const; + + void PushBack(std::vector item); + + /// This method returns the size as if no "recycling" had happened, + /// i.e. equivalent to the number of times the PushBack method has been + /// called. + int32_t Size() const; + + // discard the first n frames + void Pop(int32_t n); + + private: + std::deque> items_; + int32_t items_to_hold_; + int32_t first_available_index_; +}; + +/// This is a templated class for online feature extraction; +/// it's templated on a class like MfccComputer or PlpComputer +/// that does the basic feature extraction. 
+template +class OnlineGenericBaseFeature { + public: + // Constructor from options class + explicit OnlineGenericBaseFeature(const typename C::Options &opts); + + int32_t Dim() const { return computer_.Dim(); } + + float FrameShiftInSeconds() const { + return computer_.GetFrameOptions().frame_shift_ms / 1000.0f; + } + + int32_t NumFramesReady() const { return features_.Size(); } + + // Note: IsLastFrame() will only ever return true if you have called + // InputFinished() (and this frame is the last frame). + bool IsLastFrame(int32_t frame) const { + return input_finished_ && frame == NumFramesReady() - 1; + } + + const float *GetFrame(int32_t frame) const { return features_.At(frame); } + + // This would be called from the application, when you get + // more wave data. Note: the sampling_rate is only provided so + // the code can assert that it matches the sampling rate + // expected in the options. + // + // @param sampling_rate The sampling_rate of the input waveform + // @param waveform Pointer to a 1-D array of size n + // @param n Number of entries in waveform + void AcceptWaveform(float sampling_rate, const float *waveform, int32_t n); + + // InputFinished() tells the class you won't be providing any + // more waveform. This will help flush out the last frame or two + // of features, in the case where snip-edges == false; it also + // affects the return value of IsLastFrame(). + void InputFinished(); + + // discard the first n frames + void Pop(int32_t n) { features_.Pop(n); } + + private: + // This function computes any additional feature frames that it is possible to + // compute from 'waveform_remainder_', which at this point may contain more + // than just a remainder-sized quantity (because AcceptWaveform() appends to + // waveform_remainder_ before calling this function). It adds these feature + // frames to features_, and shifts off any now-unneeded samples of input from + // waveform_remainder_ while incrementing waveform_offset_ by the same amount. + void ComputeFeatures(); + + C computer_; // class that does the MFCC or PLP or filterbank computation + + FeatureWindowFunction window_function_; + + // features_ is the Mfcc or Plp or Fbank features that we have already + // computed. + + RecyclingVector features_; + + // True if the user has called "InputFinished()" + bool input_finished_; + + // waveform_offset_ is the number of samples of waveform that we have + // already discarded, i.e. that were prior to 'waveform_remainder_'. + int64_t waveform_offset_; + + // waveform_remainder_ is a short piece of waveform that we may need to keep + // after extracting all the whole frames we can (whatever length of feature + // will be required for the next phase of computation). 
+ // It is a 1-D tensor + std::vector waveform_remainder_; +}; + +using OnlineFbank = OnlineGenericBaseFeature; +using OnlineMfcc = OnlineGenericBaseFeature; +using OnlineWhisperFbank = OnlineGenericBaseFeature; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_ONLINE_FEATURE_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/rfft.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/rfft.h new file mode 100644 index 00000000..d4652f8c --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/rfft.h @@ -0,0 +1,58 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_ +#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_ + +#include +#include + +namespace knf { + +// n-point Real discrete Fourier transform +// where n is a power of 2. n >= 2 +// +// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2 +// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0 impl_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_RFFT_H_ diff --git a/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/whisper-feature.h b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/whisper-feature.h new file mode 100644 index 00000000..8ada4bd9 --- /dev/null +++ b/3rdparty/kaldi_native_fbank/include/kaldi-native-fbank/csrc/whisper-feature.h @@ -0,0 +1,68 @@ +/** + * Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
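Putting the pieces together, a hedged end-to-end sketch of the streaming interface declared above; the sample rate, bin count, and the wrapper function are illustrative assumptions:

```cpp
// Illustrative front-end wrapper (not part of this patch).
#include <cstdint>
#include <vector>
#include "kaldi-native-fbank/csrc/online-feature.h"

// Returns row-major [num_frames x num_bins] fbank features for a mono waveform
// whose samples are assumed to already be in the scale the downstream model expects.
std::vector<float> ComputeFbank(const std::vector<float> &samples) {
  knf::FbankOptions opts;
  opts.frame_opts.samp_freq = 16000;
  opts.frame_opts.dither = 0.0f;        // deterministic output for testing
  opts.mel_opts.num_bins = 80;

  knf::OnlineFbank fbank(opts);
  fbank.AcceptWaveform(opts.frame_opts.samp_freq, samples.data(),
                       static_cast<int32_t>(samples.size()));
  fbank.InputFinished();                // flush the trailing partial frames

  std::vector<float> features;
  const int32_t dim = fbank.Dim();
  features.reserve(static_cast<size_t>(fbank.NumFramesReady()) * dim);
  for (int32_t i = 0; i < fbank.NumFramesReady(); ++i) {
    const float *frame = fbank.GetFrame(i);
    features.insert(features.end(), frame, frame + dim);
  }
  return features;
}
```

For long-running streams, Pop(n) can be called to discard frames that have already been consumed, which is how RecyclingVector keeps memory bounded.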
+ */ + +#ifndef KALDI_NATIVE_FBANK_CSRC_WHISPER_FEATURE_H_ +#define KALDI_NATIVE_FBANK_CSRC_WHISPER_FEATURE_H_ + +#include +#include +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "kaldi-native-fbank/csrc/mel-computations.h" + +namespace knf { + +struct WhisperFeatureOptions { + WhisperFeatureOptions(const FrameExtractionOptions &frame_opts = {}, + int32_t dim = 80) + : frame_opts(frame_opts), dim(dim) {} + + FrameExtractionOptions frame_opts; + int32_t dim = 80; + + std::string ToString() const; +}; + +class WhisperFeatureComputer { + public: + // note: opts.frame_opts is ignored and we reset it inside + explicit WhisperFeatureComputer(const WhisperFeatureOptions &opts = {}); + + int32_t Dim() const { return opts_.dim; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + void Compute(float /*signal_raw_log_energy*/, float /*vtln_warp*/, + std::vector *signal_frame, float *feature); + + // if true, compute log_energy_pre_window but after dithering and dc removal + bool NeedRawLogEnergy() const { return false; } + + using Options = WhisperFeatureOptions; + + private: + std::unique_ptr mel_banks_; + WhisperFeatureOptions opts_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_WHISPER_FEATURE_H_ diff --git a/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so b/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so index dadb2a91..165f28ec 100644 Binary files a/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so and b/3rdparty/rknpu2/Android/arm64-v8a/librknnrt.so differ diff --git a/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so b/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so index 364c7f82..499146db 100644 Binary files a/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so and b/3rdparty/rknpu2/Android/armeabi-v7a/librknnrt.so differ diff --git a/3rdparty/rknpu2/Linux/aarch64/librknnrt.so b/3rdparty/rknpu2/Linux/aarch64/librknnrt.so index 22362e87..ec8a5451 100644 Binary files a/3rdparty/rknpu2/Linux/aarch64/librknnrt.so and b/3rdparty/rknpu2/Linux/aarch64/librknnrt.so differ diff --git a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a index 967e51af..1a1075e7 100644 Binary files a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a and b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.a differ diff --git a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so index cd8a941a..0df8af2e 100644 Binary files a/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so and b/3rdparty/rknpu2/Linux/armhf-uclibc/librknnmrt.so differ diff --git a/3rdparty/rknpu2/Linux/armhf/librknnrt.so b/3rdparty/rknpu2/Linux/armhf/librknnrt.so index d12a4f46..3b15274c 100644 Binary files a/3rdparty/rknpu2/Linux/armhf/librknnrt.so and b/3rdparty/rknpu2/Linux/armhf/librknnrt.so differ diff --git a/3rdparty/rknpu2/include/rknn_api.h b/3rdparty/rknpu2/include/rknn_api.h index 2e5e9f2e..c50673b5 100644 --- a/3rdparty/rknpu2/include/rknn_api.h +++ b/3rdparty/rknpu2/include/rknn_api.h @@ -183,6 +183,7 @@ typedef enum _rknn_tensor_type { RKNN_TENSOR_INT64, /* data type is int64. 
*/ RKNN_TENSOR_BOOL, RKNN_TENSOR_INT4, + RKNN_TENSOR_BFLOAT16, RKNN_TENSOR_TYPE_MAX } rknn_tensor_type; @@ -201,6 +202,7 @@ inline static const char* get_type_string(rknn_tensor_type type) case RKNN_TENSOR_INT64: return "INT64"; case RKNN_TENSOR_BOOL: return "BOOL"; case RKNN_TENSOR_INT4: return "INT4"; + case RKNN_TENSOR_BFLOAT16: return "BF16"; default: return "UNKNOW"; } } diff --git a/3rdparty/rknpu2/include/rknn_matmul_api.h b/3rdparty/rknpu2/include/rknn_matmul_api.h index 0ae78798..15570db4 100644 --- a/3rdparty/rknpu2/include/rknn_matmul_api.h +++ b/3rdparty/rknpu2/include/rknn_matmul_api.h @@ -58,6 +58,7 @@ typedef enum _rknn_matmul_type RKNN_INT8_MM_INT8_TO_FLOAT32 = 9, RKNN_INT4_MM_INT4_TO_INT16 = 10, RKNN_INT8_MM_INT4_TO_INT32 = 11, + RKNN_FLOAT16_MM_INT4_TO_BFLOAT16 = 12, } rknn_matmul_type; inline static const char* get_matmul_type_string(rknn_matmul_type type) @@ -85,6 +86,8 @@ inline static const char* get_matmul_type_string(rknn_matmul_type type) return "RKNN_INT8_MM_INT4_TO_INT32"; case RKNN_INT8_MM_INT8_TO_FLOAT32: return "RKNN_INT8_MM_INT8_TO_FLOAT32"; + case RKNN_FLOAT16_MM_INT4_TO_BFLOAT16: + return "RKNN_FLOAT16_MM_INT4_TO_BFLOAT16"; default: return "UNKNOW"; } diff --git a/README.md b/README.md index 9c44a8c7..e24beab5 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ In addition to exporting the model from the corresponding respository, the model | Object Detection | [yolov8](https://github.com/airockchip/ultralytics_yolov8) | FP16/INT8 | [./yolov8n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8n.onnx)
[./yolov8s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8s.onnx)
[./yolov8m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Object Detection | [yolov8_obb](https://github.com/airockchip/ultralytics_yolov8) | INT8 | [./yolov8n-obb.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_obb/yolov8n-obb.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Object Detection | [yolov10](https://github.com/THU-MIG/yolov10) | FP16/INT8 | [./yolov10n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov10/yolov10n.onnx)
[./yolov10s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov10/yolov10s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | +| Object Detection | [yolo11](https://github.com/airockchip/ultralytics_yolo11) | FP16/INT8 | [./yolo11n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11n.onnx)
[./yolo11s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11s.onnx)
[./yolo11m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | | Object Detection | [yolox](https://github.com/airockchip/YOLOX) | FP16/INT8 | [./yolox_s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolox/yolox_s.onnx)
[./yolox_m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolox/yolox_m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Object Detection | [ppyoloe](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/configs/ppyoloe) | FP16/INT8 | [./ppyoloe_s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppyoloe/ppyoloe_s.onnx)
[./ppyoloe_m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppyoloe/ppyoloe_m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Object Detection | [yolo_world](https://github.com/AILab-CVC/YOLO-World) | FP16/INT8 | [./yolo_world_v2s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo_world/yolo_world_v2s.onnx)
[./clip_text.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo_world/clip_text.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| @@ -46,14 +47,16 @@ In addition to exporting the model from the corresponding repository, the model | Image Segmentation | [ppseg](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.8) | FP16/INT8 | [pp_liteseg_cityscapes.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Image Segmentation | [mobilesam](https://github.com/ChaoningZhang/MobileSAM) | FP16 | [mobilesam_encoder_tiny.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mobilesam/mobilesam_encoder_tiny.onnx)
[mobilesam_decoder.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mobilesam/mobilesam_decoder.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | | Face Key Points | [RetinaFace](https://github.com/biubug6/Pytorch_Retinaface) | INT8 | [RetinaFace_mobile320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_mobile320.onnx)
[RetinaFace_resnet50_320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_resnet50_320.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | -| Car Plate Recognition | [LPRNet](https://github.com/sirius-ai/LPRNet_Pytorch/) | FP16/INT8 | [./lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | +| Car Plate Recognition | [LPRNet](https://github.com/sirius-ai/LPRNet_Pytorch/) | FP16/INT8 | [./lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | | Text Detection | [PPOCR-Det](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7) | FP16/INT8 | [../ppocrv4_det.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_det.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Text Recognition | [PPOCR-Rec](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7) | FP16 | [../ppocrv4_rec.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_rec.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Neural Machine Translation | [lite_transformer](https://github.com/airockchip/lite-transformer) | FP16 | [lite-transformer-encoder-16.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/lite_transformer/lite-transformer-encoder-16.onnx)
[lite-transformer-decoder-16.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/lite_transformer/lite-transformer-decoder-16.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | Image-Text Matching | [clip](https://huggingface.co/openai/clip-vit-base-patch32) | FP16 | [./clip_images.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/clip/clip_images.onnx)
[./clip_text.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/clip/clip_text.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| | Speech Recognition | [wav2vec2](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#wav2vec-20) | FP16 | [wav2vec2_base_960h_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/wav2vec2/wav2vec2_base_960h_20s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | | Speech Recognition | [whisper](https://github.com/openai/whisper) | FP16 | [whisper_encoder_base_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx)
[whisper_decoder_base_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| +| Speech Recognition | [zipformer](https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t) | FP16 | [encoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/encoder-epoch-99-avg-1.onnx)
[decoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/decoder-epoch-99-avg-1.onnx)
[joiner-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/joiner-epoch-99-avg-1.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| | Speech Classification | [yamnet](https://www.tensorflow.org/hub/tutorials/yamnet) | FP16 | [yamnet_3s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yamnet/yamnet_3s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | +| Text to Speech | [mms_tts](https://huggingface.co/facebook/mms-tts-eng) | FP16 | [mms_tts_eng_encoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_encoder_200.onnx)
[mms_tts_eng_decoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_decoder_200.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| @@ -62,52 +65,57 @@ In addition to exporting the model from the corresponding repository, the model | demo | model_name | inputs_shape     | dtype | RK3566
RK3568 | RK3562 | RK3588
@single_core | RK3576
@single_core | RV1109 | RV1126 | RK1808 | | ---------------- | ----------------------------------- | ------------------------------------ | ----- | ------------------ | -------------- | ------------------------ | ------------------------ | ---------- | ---------- | ---------- | -| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 184.3 | 295.5 | 455.4 | 467.2 | 212.9 | 322.3 | 170.3 | -| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 38.5 | 56.1 | 110.4 | 99.8 | 24.4 | 36.2 | 37.1 | -| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 25.6 | 33.7 | 66.2 | 65.4 | 20.2 | 29.2 | 37.2 | -| | yolov5n | [1, 3, 640, 640] | INT8 | 39.8 | 48.9 | 82.9 | 113.0 | 36.3 | 53.2 | 61.2 | -| | yolov5s | [1, 3, 640, 640] | INT8 | 19.5 | 23.9 | 48.5 | 57.7 | 13.6 | 20.0 | 28.2 | -| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 11.0 | 20.9 | 23.8 | 5.8 | 8.5 | 13.3 | -| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 49.0 | 57.4 | 106.8 | 110.3 | 37.8 | 56.8 | 66.8 | -| | yolov6s | [1, 3, 640, 640] | INT8 | 15.3 | 17.5 | 36.4 | 34.8 | 10.8 | 16.3 | 24.1 | -| | yolov6m | [1, 3, 640, 640] | INT8 | 7.3 | 8.6 | 17.6 | 17.5 | 5.6 | 8.3 | 11.5 | -| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 28.0 | 37.1 | 72.9 | 75.1 | 15.4 | 22.4 | 37.2 | -| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.9 | 11.5 | 13.0 | 3.3 | 4.8 | 7.4 | -| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 34.4 | 41.5 | 74.0 | 90.2 | 24.0 | 35.4 | 42.3 | -| | yolov8s | [1, 3, 640, 640] | INT8 | 15.2 | 18.3 | 35.2 | 41.0 | 8.9 | 13.1 | 19.1 | -| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 8.3 | 16.2 | 16.7 | 3.9 | 5.8 | 9.1 | -| yolov8_obb | yolov8n-obb | [1, 3, 640, 640] | INT8 | 34.3 | 41.7 | 74.1 | 91.0 | 25.1 | 37.3 | 42.8 | -| yolov10 | yolov10n | [1, 3, 640, 640] | INT8 | 21.1 | 31.7 | 56.1 | 74.5 | / | / | / | -| | yolov10s | [1, 3, 640, 640] | INT8 | 10.4 | 16.1 | 32.0 | 38.2 | / | / | / | -| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.3 | 18.4 | 37.1 | 42.0 | 10.6 | 15.7 | 23.0 | -| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.3 | 16.0 | 17.6 | 4.6 | 6.8 | 10.7 | -| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 7.5 | 20.4 | 31.9 | 41.5 | 11.2 | 16.4 | 21.1 | -| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 4.2 | 9.3 | 15.3 | 17.8 | 5.2 | 7.7 | 9.4 | -| yolo_world | yolo_world_v2s | [1, 3, 640, 640] | INT8 | 7.5 | 9.6 | 22.2 | 22.1 | / | / | / | -| | clip_text | [1, 20] | FP16 | 28.33 | 64.6 | 92.4 | 61.7 | / | / | / | -| yolov8_pose | yolov8n-pose | [1, 3, 640, 640] | INT8 | 23.0 | 31.6 | 56.0 | 67.2 | / | / | / | -| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.0 | 21.6 | 34.3 | 39.6 | 10.1 | 13.0 | 4.4 | -| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 32.6 | 40.0 | 69.5 | 89.2 | 28.6 | 42.2 | 49.6 | -| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.1 | 18.3 | 36.9 | 41.8 | 9.6 | 14.0 | 22.5 | -| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.5 | 16.4 | 18.0 | 4.7 | 6.8 | 10.8 | -| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 28.1 | 33.7 | 61.0 | 71.5 | 18.6 | 27.6 | 32.9 | -| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 14.2 | 29.0 | 30.9 | 6.6 | 9.8 | 14.6 | -| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.4 | 12.6 | 12.8 | 3.1 | 4.6 | 6.9 | -| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 4.4 | 12.0 | 29.9 | 29.2 | 18.4 | 27.1 | 20.9 | -| mobilesam | mobilesam_encoder_tiny | [1, 3, 448, 448] | FP16 | 0.9 | 6.7 | 9.8 | 12.0 | / | / | / | -| | mobilesam_decoder | [1, 1, 112, 112] | FP16 | 23.3 | 71.3 | 116.3 | 108.9 | / | / | / | -| RetinaFace | RetinaFace_mobile320 | [1, 
3, 320, 320] | INT8 | 157.0 | 316.5 | 230.3 | 476.6 | 144.8 | 212.5 | 198.5 | -| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.8 | 26.8 | 49.4 | 56.8 | 14.6 | 20.8 | 24.6 | -| LPRNet | lprnet | [1, 3, 24, 94] | FP16 | 154.7 | 320.0 | 432.8 | 492.5 | 30.6(INT8) | 47.6(INT8) | 30.1(INT8) | -| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 22.6 | 28.9 | 50.8 | 64.8 | 11.0 | 16.1 | 14.2 | -| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 19.7 | 54.1 | 74.3 | 96.6 | 1.0 | 1.6 | 6.7 | -| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 331.6 | 728.8 | 878.5 | 786.4 | 22.7 | 35.4 | 98.3 | -| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 142.4 | 252.2 | 346.3 | 270.3 | 48.0 | 65.8 | 109.9 | -| clip | clip_images | [1, 3, 224, 224] | FP16 | 2.3 | 3.4 | 6.5 | 6.7 | / | / | / | -| | clip_text | [1, 20] | FP16 | 28.33 | 64.6 | 92.5 | 61.5 | / | / | / | -| wav2vec2 | wav2vec2_base_960h_20s | 20s audio | FP16 | RTF
0.861 | RTF
0.333 | RTF
0.131 | RTF
0.073 | / | / | / | -| whisper | encoder+decoder+NPU-outside process | 20s audio | FP16 | RTF
1.253 | RTF
0.417 | RTF
0.219 | RTF
0.216 | / | / | / | -| yamnet | yamnet_3s | 3s audio | FP16 | RTF
0.013 | RTF
0.008 | RTF
0.004 | RTF
0.005 | / | / | / | +| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 180.7 | 281.3 | 450.7 | 467.0 | 212.9 | 322.3 | 170.3 | +| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 37.9 | 54.9 | 110.1 | 99.0 | 24.4 | 36.2 | 37.1 | +| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 25.5 | 33.2 | 66.1 | 65.0 | 20.2 | 29.2 | 37.2 | +| | yolov5n | [1, 3, 640, 640] | INT8 | 39.7 | 47.4 | 82.5 | 112.7 | 36.3 | 53.2 | 61.2 | +| | yolov5s | [1, 3, 640, 640] | INT8 | 19.3 | 23.6 | 48.4 | 57.5 | 13.6 | 20.0 | 28.2 | +| | yolov5m | [1, 3, 640, 640] | INT8 | 8.6 | 10.8 | 20.9 | 23.7 | 5.8 | 8.5 | 13.3 | +| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 48.8 | 56.4 | 106.4 | 109.1 | 37.8 | 56.8 | 66.8 | +| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 17.3 | 36.4 | 35.0 | 10.8 | 16.3 | 24.1 | +| | yolov6m | [1, 3, 640, 640] | INT8 | 7.2 | 8.6 | 17.8 | 17.4 | 5.6 | 8.3 | 11.5 | +| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 27.9 | 36.5 | 72.7 | 74.8 | 15.4 | 22.4 | 37.2 | +| | yolov7 | [1, 3, 640, 640] | INT8 | 4.6 | 5.9 | 11.4 | 13.0 | 3.3 | 4.8 | 7.4 | +| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 34.0 | 40.9 | 73.5 | 90.2 | 24.0 | 35.4 | 42.3 | +| | yolov8s | [1, 3, 640, 640] | INT8 | 15.1 | 18.4 | 38.0 | 40.8 | 8.9 | 13.1 | 19.1 | +| | yolov8m | [1, 3, 640, 640] | INT8 | 6.5 | 8.2 | 16.2 | 16.7 | 3.9 | 5.8 | 9.1 | +| yolov8_obb | yolov8n-obb | [1, 3, 640, 640] | INT8 | 33.9 | 41.3 | 74.0 | 90.2 | 25.1 | 37.3 | 42.8 | +| yolov10 | yolov10n | [1, 3, 640, 640] | INT8 | 20.7 | 34.1 | 61.2 | 80.2 | / | / | / | +| | yolov10s | [1, 3, 640, 640] | INT8 | 10.3 | 16.9 | 33.8 | 39.9 | / | / | / | +| yolo11 | yolo11n | [1, 3, 640, 640] | INT8 | 20.6 | 34.0 | 60.0 | 77.9 | 11.7 | 17.0 | 17.6 | +| | yolo11s | [1, 3, 640, 640] | INT8 | 10.2 | 16.7 | 33.0 | 38.2 | 5.0 | 7.3 | 8.4 | +| | yolo11m | [1, 3, 640, 640] | INT8 | 4.6 | 6.5 | 12.7 | 14.6 | 2.8 | 4.0 | 5.1 | +| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.2 | 18.3 | 37.1 | 41.5 | 10.6 | 15.7 | 23.0 | +| | yolox_m | [1, 3, 640, 640] | INT8 | 6.6 | 8.2 | 16.0 | 17.6 | 4.6 | 6.8 | 10.7 | +| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 7.4 | 20.0 | 32.5 | 41.3 | 11.2 | 16.4 | 21.1 | +| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 4.1 | 9.2 | 15.8 | 17.8 | 5.2 | 7.7 | 9.4 | +| yolo_world | yolo_world_v2s | [1, 3, 640, 640] | INT8 | 7.4 | 9.6 | 22.1 | 22.3 | / | / | / | +| | clip_text | [1, 20] | FP16 | 29.8 | 67.4 | 95.8 | 63.5 | / | / | / | +| yolov8_pose | yolov8n-pose | [1, 3, 640, 640] | INT8 | 22.6 | 31.0 | 55.9 | 66.8 | / | / | / | +| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.9 | 21.4 | 34.0 | 39.4 | 10.1 | 13.0 | 4.4 | +| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 32.2 | 38.5 | 69.3 | 88.3 | 28.6 | 42.2 | 49.6 | +| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.0 | 18.1 | 36.8 | 41.6 | 9.6 | 14.0 | 22.5 | +| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.4 | 16.4 | 18.0 | 4.7 | 6.8 | 10.8 | +| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 27.8 | 33.0 | 60.8 | 71.1 | 18.6 | 27.6 | 32.9 | +| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.7 | 14.1 | 28.9 | 30.8 | 6.6 | 9.8 | 14.6 | +| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.4 | 12.6 | 12.7 | 3.1 | 4.6 | 6.9 | +| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 5.9 | 13.9 | 35.7 | 33.6 | 18.4 | 27.1 | 20.9 | +| mobilesam | mobilesam_encoder_tiny | [1, 3, 448, 448] | FP16 | 1.0 | 6.6 | 10.0 | 11.9 | / | / | / | +| | mobilesam_decoder | [1, 1, 112, 112] | FP16 | 24.3 | 69.6 | 116.4 | 108.6 | / | / | / | +| RetinaFace | 
RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 156.4 | 300.8 | 227.2 | 470.5 | 144.8 | 212.5 | 198.5 | +| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.7 | 26.9 | 49.2 | 56.6 | 14.6 | 20.8 | 24.6 | +| LPRNet | lprnet | [1, 3, 24, 94] | FP16 | 143.2 | 420.6 | 586.4 | 647.8 | 30.6(INT8) | 47.6(INT8) | 30.1(INT8) | +| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 22.1 | 28.0 | 50.7 | 64.3 | 11.0 | 16.1 | 14.2 | +| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 19.5 | 54.3 | 73.9 | 96.8 | 1.0 | 1.6 | 6.7 | +| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 337.5 | 725.8 | 867.6 | 784.1 | 22.7 | 35.4 | 98.3 | +| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 142.5 | 252.0 | 343.8 | 272.3 | 48.0 | 65.8 | 109.9 | +| clip | clip_images | [1, 3, 224, 224] | FP16 | 2.3 | 3.4 | 6.5 | 6.7 | / | / | / | +| | clip_text | [1, 20] | FP16 | 29.7 | 66.6 | 96.0 | 63.7 | / | / | / | +| wav2vec2 | wav2vec2_base_960h_20s | 20s audio | FP16 | RTF
0.817 | RTF
0.323 | RTF
0.133 | RTF
0.073 | / | / | / | +| whisper | whisper_base_20s | 20s audio | FP16 | RTF
1.178 | RTF
0.420 | RTF
0.215 | RTF
0.218 | / | / | / | +| zipformer | zipformer-bilingual-zh-en-t | streaming audio | FP16 | RTF
0.196 | RTF
0.116 | RTF
0.065 | RTF
0.082 | / | / | / | +| yamnet | yamnet_3s | 3s audio | FP16 | RTF
0.013 | RTF
0.008 | RTF
0.004 | RTF
0.005 | / | / | / | +| mms_tts | mms_tts_eng_200 | token-200 | FP16 | RTF
0.311 | RTF
0.138 | RTF
0.069 | RTF
0.069 | / | / | / | - This performance data is collected with each platform running at its maximum NPU frequency. - This performance data measures model inference time only; pre-processing and post-processing time is not included unless otherwise specified (for audio models, RTF is the real-time factor: inference time divided by audio duration, lower is better). @@ -154,6 +162,7 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b | Version | Description | | ------- | ------------------------------------------------------------ | +| 2.3.0 | New demos released, including yolo11, zipformer, mms_tts, etc. | | 2.2.0 | New demo wav2vec, mobilesam release. Update demo guide about exporting model. | | 2.1.0 | New demo release, including yolov8_pose, yolov8_obb, yolov10, yolo_world, clip, whisper, yamnet
`RK1808`, `RV1109`, `RV1126` platform support for these demos will be added in the next version. | | 2.0.0 | Add new support for `RK3576` for all demos.
Full support for `RK1808`, `RV1109`, `RV1126` platform. | @@ -168,6 +177,7 @@ All demos in `RKNN Model Zoo` are verified based on the latest RKNPU SDK. If usi | Version | RKNPU2 SDK | RKNPU1 SDK | | ------- | ---------- | ---------- | +| 2.3.0 | >=2.3.0 | >=1.7.5 | | 2.2.0 | >=2.2.0 | >=1.7.5 | | 2.1.0 | >=2.1.0 | >=1.7.5 | | 2.0.0 | >=2.0.0 | >=1.7.5 | diff --git a/README_CN.md b/README_CN.md index d95cf822..ef78617c 100644 --- a/README_CN.md +++ b/README_CN.md @@ -36,6 +36,7 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 | 物体检测 | [yolov8](https://github.com/airockchip/ultralytics_yolov8) | FP16/INT8 | [./yolov8n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8n.onnx)
[./yolov8s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8s.onnx)
[./yolov8m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8/yolov8m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 物体检测 | [yolov8_obb](https://github.com/airockchip/ultralytics_yolov8) | INT8 | [./yolov8n-obb.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov8_obb/yolov8n-obb.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 物体检测 | [yolov10](https://github.com/THU-MIG/yolov10) | FP16/INT8 | [./yolov10n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov10/yolov10n.onnx)
[./yolov10s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolov10/yolov10s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | +| 物体检测 | [yolo11](https://github.com/airockchip/ultralytics_yolo11) | FP16/INT8 | [./yolo11n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11n.onnx)
[./yolo11s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11s.onnx)
[./yolo11m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | | 物体检测 | [yolox](https://github.com/airockchip/YOLOX) | FP16/INT8 | [./yolox_s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolox/yolox_s.onnx)
[./yolox_m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolox/yolox_m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 物体检测 | [ppyoloe](https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.6/configs/ppyoloe) | FP16/INT8 | [./ppyoloe_s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppyoloe/ppyoloe_s.onnx)
[./ppyoloe_m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppyoloe/ppyoloe_m.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 物体检测 | [yolo_world](https://github.com/AILab-CVC/YOLO-World) | FP16/INT8 | [./yolo_world_v2s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo_world/yolo_world_v2s.onnx)
[./clip_text.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo_world/clip_text.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| @@ -46,14 +47,16 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 | 图像分割 | [ppseg](https://github.com/PaddlePaddle/PaddleSeg/tree/release/2.8) | FP16/INT8 | [pp_liteseg_cityscapes.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/ppseg/pp_liteseg_cityscapes.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 图像分割 | [mobilesam](https://github.com/ChaoningZhang/MobileSAM) | FP16 | [mobilesam_encoder_tiny.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mobilesam/mobilesam_encoder_tiny.onnx)
[mobilesam_decoder.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mobilesam/mobilesam_decoder.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | | 人脸关键点 | [RetinaFace](https://github.com/biubug6/Pytorch_Retinaface) | INT8 | [RetinaFace_mobile320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_mobile320.onnx)
[RetinaFace_resnet50_320.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/RetinaFace/RetinaFace_resnet50_320.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | -| 车牌识别 | [LPRNet](https://github.com/sirius-ai/LPRNet_Pytorch/) | FP16/INT8 | [./lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | +| 车牌识别 | [LPRNet](https://github.com/sirius-ai/LPRNet_Pytorch/) | FP16/INT8 | [./lprnet.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/LPRNet/lprnet.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RV1103\|RV1106
RK1808\|RK3399PRO
RV1109\|RV1126 | | 文字检测 | [PPOCR-Det](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7) | FP16/INT8 | [../ppocrv4_det.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_det.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 文字识别 | [PPOCR-Rec](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.7) | FP16 | [../ppocrv4_rec.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/PPOCR/ppocrv4_rec.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 自然语言翻译 | [lite_transformer](https://github.com/airockchip/lite-transformer) | FP16 | [lite-transformer-encoder-16.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/lite_transformer/lite-transformer-encoder-16.onnx)
[lite-transformer-decoder-16.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/lite_transformer/lite-transformer-decoder-16.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
RK1808\|RK3399PRO
RV1109\|RV1126 | | 图文匹配 | [clip](https://huggingface.co/openai/clip-vit-base-patch32) | FP16 | [./clip_images.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/clip/clip_images.onnx)
[./clip_text.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/clip/clip_text.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| | 语音识别 | [wav2vec2](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#wav2vec-20) | FP16 | [wav2vec2_base_960h_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/wav2vec2/wav2vec2_base_960h_20s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | | 语音识别 | [whisper](https://github.com/openai/whisper) | FP16 | [whisper_encoder_base_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_encoder_base_20s.onnx)
[whisper_decoder_base_20s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/whisper/whisper_decoder_base_20s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| +| 语音识别 | [zipformer](https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t) | FP16 | [encoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/encoder-epoch-99-avg-1.onnx)
[decoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/decoder-epoch-99-avg-1.onnx)
[joiner-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/joiner-epoch-99-avg-1.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| | 语音分类 | [yamnet](https://www.tensorflow.org/hub/tutorials/yamnet) | FP16 | [yamnet_3s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yamnet/yamnet_3s.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576 | +| 文字转语音 | [mms_tts](https://huggingface.co/facebook/mms-tts-eng) | FP16 | [mms_tts_eng_encoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_encoder_200.onnx)
[mms_tts_eng_decoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_decoder_200.onnx) | RK3566\|RK3568\|RK3588\|RK3562\|RK3576
| @@ -61,52 +64,57 @@ RKNN Model Zoo依赖 RKNN-Toolkit2 进行模型转换, 编译安卓demo时需要 | demo | model_name | inputs_shape     | dtype | RK3566
RK3568 | RK3562 | RK3588
@single_core | RK3576
@single_core | RV1109 | RV1126 | RK1808 | | ---------------- | ----------------------------------- | ------------------------------------ | ----- | ------------------ | -------------- | ------------------------ | ------------------------ | ---------- | ---------- | ---------- | -| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 184.3 | 295.5 | 455.4 | 467.2 | 212.9 | 322.3 | 170.3 | -| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 38.5 | 56.1 | 110.4 | 99.8 | 24.4 | 36.2 | 37.1 | -| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 25.6 | 33.7 | 66.2 | 65.4 | 20.2 | 29.2 | 37.2 | -| | yolov5n | [1, 3, 640, 640] | INT8 | 39.8 | 48.9 | 82.9 | 113.0 | 36.3 | 53.2 | 61.2 | -| | yolov5s | [1, 3, 640, 640] | INT8 | 19.5 | 23.9 | 48.5 | 57.7 | 13.6 | 20.0 | 28.2 | -| | yolov5m | [1, 3, 640, 640] | INT8 | 8.7 | 11.0 | 20.9 | 23.8 | 5.8 | 8.5 | 13.3 | -| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 49.0 | 57.4 | 106.8 | 110.3 | 37.8 | 56.8 | 66.8 | -| | yolov6s | [1, 3, 640, 640] | INT8 | 15.3 | 17.5 | 36.4 | 34.8 | 10.8 | 16.3 | 24.1 | -| | yolov6m | [1, 3, 640, 640] | INT8 | 7.3 | 8.6 | 17.6 | 17.5 | 5.6 | 8.3 | 11.5 | -| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 28.0 | 37.1 | 72.9 | 75.1 | 15.4 | 22.4 | 37.2 | -| | yolov7 | [1, 3, 640, 640] | INT8 | 4.7 | 5.9 | 11.5 | 13.0 | 3.3 | 4.8 | 7.4 | -| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 34.4 | 41.5 | 74.0 | 90.2 | 24.0 | 35.4 | 42.3 | -| | yolov8s | [1, 3, 640, 640] | INT8 | 15.2 | 18.3 | 35.2 | 41.0 | 8.9 | 13.1 | 19.1 | -| | yolov8m | [1, 3, 640, 640] | INT8 | 6.6 | 8.3 | 16.2 | 16.7 | 3.9 | 5.8 | 9.1 | -| yolov8_obb | yolov8n-obb | [1, 3, 640, 640] | INT8 | 34.3 | 41.7 | 74.1 | 91.0 | 25.1 | 37.3 | 42.8 | -| yolov10 | yolov10n | [1, 3, 640, 640] | INT8 | 21.1 | 31.7 | 56.1 | 74.5 | / | / | / | -| | yolov10s | [1, 3, 640, 640] | INT8 | 10.4 | 16.1 | 32.0 | 38.2 | / | / | / | -| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.3 | 18.4 | 37.1 | 42.0 | 10.6 | 15.7 | 23.0 | -| | yolox_m | [1, 3, 640, 640] | INT8 | 6.7 | 8.3 | 16.0 | 17.6 | 4.6 | 6.8 | 10.7 | -| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 7.5 | 20.4 | 31.9 | 41.5 | 11.2 | 16.4 | 21.1 | -| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 4.2 | 9.3 | 15.3 | 17.8 | 5.2 | 7.7 | 9.4 | -| yolo_world | yolo_world_v2s | [1, 3, 640, 640] | INT8 | 7.5 | 9.6 | 22.2 | 22.1 | / | / | / | -| | clip_text | [1, 20] | FP16 | 28.33 | 64.6 | 92.4 | 61.7 | / | / | / | -| yolov8_pose | yolov8n-pose | [1, 3, 640, 640] | INT8 | 23.0 | 31.6 | 56.0 | 67.2 | / | / | / | -| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.0 | 21.6 | 34.3 | 39.6 | 10.1 | 13.0 | 4.4 | -| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 32.6 | 40.0 | 69.5 | 89.2 | 28.6 | 42.2 | 49.6 | -| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.1 | 18.3 | 36.9 | 41.8 | 9.6 | 14.0 | 22.5 | -| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.5 | 16.4 | 18.0 | 4.7 | 6.8 | 10.8 | -| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 28.1 | 33.7 | 61.0 | 71.5 | 18.6 | 27.6 | 32.9 | -| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.8 | 14.2 | 29.0 | 30.9 | 6.6 | 9.8 | 14.6 | -| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.4 | 12.6 | 12.8 | 3.1 | 4.6 | 6.9 | -| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 4.4 | 12.0 | 29.9 | 29.2 | 18.4 | 27.1 | 20.9 | -| mobilesam | mobilesam_encoder_tiny | [1, 3, 448, 448] | FP16 | 0.9 | 6.7 | 9.8 | 12.0 | / | / | / | -| | mobilesam_decoder | [1, 1, 112, 112] | FP16 | 23.3 | 71.3 | 116.3 | 108.9 | / | / | / | -| RetinaFace | RetinaFace_mobile320 | [1, 
3, 320, 320] | INT8 | 157.0 | 316.5 | 230.3 | 476.6 | 144.8 | 212.5 | 198.5 | -| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.8 | 26.8 | 49.4 | 56.8 | 14.6 | 20.8 | 24.6 | -| LPRNet | lprnet | [1, 3, 24, 94] | FP16 | 154.7 | 320.0 | 432.8 | 492.5 | 30.6(INT8) | 47.6(INT8) | 30.1(INT8) | -| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 22.6 | 28.9 | 50.8 | 64.8 | 11.0 | 16.1 | 14.2 | -| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 19.7 | 54.1 | 74.3 | 96.6 | 1.0 | 1.6 | 6.7 | -| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 331.6 | 728.8 | 878.5 | 786.4 | 22.7 | 35.4 | 98.3 | -| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 142.4 | 252.2 | 346.3 | 270.3 | 48.0 | 65.8 | 109.9 | -| clip | clip_images | [1, 3, 224, 224] | FP16 | 2.3 | 3.4 | 6.5 | 6.7 | / | / | / | -| | clip_text | [1, 20] | FP16 | 28.33 | 64.6 | 92.5 | 61.5 | / | / | / | -| wav2vec2 | wav2vec2_base_960h_20s | 20s audio | FP16 | RTF
0.861 | RTF
0.333 | RTF
0.131 | RTF
0.073 | / | / | / | -| whisper | encoder+decoder+NPU-outside process | 20s audio | FP16 | RTF
1.253 | RTF
0.417 | RTF
0.219 | RTF
0.216 | / | / | / | -| yamnet | yamnet_3s | 3s audio | FP16 | RTF
0.013 | RTF
0.008 | RTF
0.004 | RTF
0.005 | / | / | / | +| mobilenet | mobilenetv2-12 | [1, 3, 224, 224] | INT8 | 180.7 | 281.3 | 450.7 | 467.0 | 212.9 | 322.3 | 170.3 | +| resnet | resnet50-v2-7 | [1, 3, 224, 224] | INT8 | 37.9 | 54.9 | 110.1 | 99.0 | 24.4 | 36.2 | 37.1 | +| yolov5 | yolov5s_relu | [1, 3, 640, 640] | INT8 | 25.5 | 33.2 | 66.1 | 65.0 | 20.2 | 29.2 | 37.2 | +| | yolov5n | [1, 3, 640, 640] | INT8 | 39.7 | 47.4 | 82.5 | 112.7 | 36.3 | 53.2 | 61.2 | +| | yolov5s | [1, 3, 640, 640] | INT8 | 19.3 | 23.6 | 48.4 | 57.5 | 13.6 | 20.0 | 28.2 | +| | yolov5m | [1, 3, 640, 640] | INT8 | 8.6 | 10.8 | 20.9 | 23.7 | 5.8 | 8.5 | 13.3 | +| yolov6 | yolov6n | [1, 3, 640, 640] | INT8 | 48.8 | 56.4 | 106.4 | 109.1 | 37.8 | 56.8 | 66.8 | +| | yolov6s | [1, 3, 640, 640] | INT8 | 15.2 | 17.3 | 36.4 | 35.0 | 10.8 | 16.3 | 24.1 | +| | yolov6m | [1, 3, 640, 640] | INT8 | 7.2 | 8.6 | 17.8 | 17.4 | 5.6 | 8.3 | 11.5 | +| yolov7 | yolov7-tiny | [1, 3, 640, 640] | INT8 | 27.9 | 36.5 | 72.7 | 74.8 | 15.4 | 22.4 | 37.2 | +| | yolov7 | [1, 3, 640, 640] | INT8 | 4.6 | 5.9 | 11.4 | 13.0 | 3.3 | 4.8 | 7.4 | +| yolov8 | yolov8n | [1, 3, 640, 640] | INT8 | 34.0 | 40.9 | 73.5 | 90.2 | 24.0 | 35.4 | 42.3 | +| | yolov8s | [1, 3, 640, 640] | INT8 | 15.1 | 18.4 | 38.0 | 40.8 | 8.9 | 13.1 | 19.1 | +| | yolov8m | [1, 3, 640, 640] | INT8 | 6.5 | 8.2 | 16.2 | 16.7 | 3.9 | 5.8 | 9.1 | +| yolov8_obb | yolov8n-obb | [1, 3, 640, 640] | INT8 | 33.9 | 41.3 | 74.0 | 90.2 | 25.1 | 37.3 | 42.8 | +| yolov10 | yolov10n | [1, 3, 640, 640] | INT8 | 20.7 | 34.1 | 61.2 | 80.2 | / | / | / | +| | yolov10s | [1, 3, 640, 640] | INT8 | 10.3 | 16.9 | 33.8 | 39.9 | / | / | / | +| yolo11 | yolo11n | [1, 3, 640, 640] | INT8 | 20.6 | 34.0 | 60.0 | 77.9 | 11.7 | 17.0 | 17.6 | +| | yolo11s | [1, 3, 640, 640] | INT8 | 10.2 | 16.7 | 33.0 | 38.2 | 5.0 | 7.3 | 8.4 | +| | yolo11m | [1, 3, 640, 640] | INT8 | 4.6 | 6.5 | 12.7 | 14.6 | 2.8 | 4.0 | 5.1 | +| yolox | yolox_s | [1, 3, 640, 640] | INT8 | 15.2 | 18.3 | 37.1 | 41.5 | 10.6 | 15.7 | 23.0 | +| | yolox_m | [1, 3, 640, 640] | INT8 | 6.6 | 8.2 | 16.0 | 17.6 | 4.6 | 6.8 | 10.7 | +| ppyoloe | ppyoloe_s | [1, 3, 640, 640] | INT8 | 7.4 | 20.0 | 32.5 | 41.3 | 11.2 | 16.4 | 21.1 | +| | ppyoloe_m | [1, 3, 640, 640] | INT8 | 4.1 | 9.2 | 15.8 | 17.8 | 5.2 | 7.7 | 9.4 | +| yolo_world | yolo_world_v2s | [1, 3, 640, 640] | INT8 | 7.4 | 9.6 | 22.1 | 22.3 | / | / | / | +| | clip_text | [1, 20] | FP16 | 29.8 | 67.4 | 95.8 | 63.5 | / | / | / | +| yolov8_pose | yolov8n-pose | [1, 3, 640, 640] | INT8 | 22.6 | 31.0 | 55.9 | 66.8 | / | / | / | +| deeplabv3 | deeplab-v3-plus-mobilenet-v2 | [1, 513, 513, 1] | INT8 | 10.9 | 21.4 | 34.0 | 39.4 | 10.1 | 13.0 | 4.4 | +| yolov5_seg | yolov5n-seg | [1, 3, 640, 640] | INT8 | 32.2 | 38.5 | 69.3 | 88.3 | 28.6 | 42.2 | 49.6 | +| | yolov5s-seg | [1, 3, 640, 640] | INT8 | 15.0 | 18.1 | 36.8 | 41.6 | 9.6 | 14.0 | 22.5 | +| | yolov5m-seg | [1, 3, 640, 640] | INT8 | 6.8 | 8.4 | 16.4 | 18.0 | 4.7 | 6.8 | 10.8 | +| yolov8_seg | yolov8n-seg | [1, 3, 640, 640] | INT8 | 27.8 | 33.0 | 60.8 | 71.1 | 18.6 | 27.6 | 32.9 | +| | yolov8s-seg | [1, 3, 640, 640] | INT8 | 11.7 | 14.1 | 28.9 | 30.8 | 6.6 | 9.8 | 14.6 | +| | yolov8m-seg | [1, 3, 640, 640] | INT8 | 5.2 | 6.4 | 12.6 | 12.7 | 3.1 | 4.6 | 6.9 | +| ppseg | ppseg_lite_1024x512 | [1, 3, 512, 512] | INT8 | 5.9 | 13.9 | 35.7 | 33.6 | 18.4 | 27.1 | 20.9 | +| mobilesam | mobilesam_encoder_tiny | [1, 3, 448, 448] | FP16 | 1.0 | 6.6 | 10.0 | 11.9 | / | / | / | +| | mobilesam_decoder | [1, 1, 112, 112] | FP16 | 24.3 | 69.6 | 116.4 | 108.6 | / | / | / | +| RetinaFace | 
RetinaFace_mobile320 | [1, 3, 320, 320] | INT8 | 156.4 | 300.8 | 227.2 | 470.5 | 144.8 | 212.5 | 198.5 | +| | RetinaFace_resnet50_320 | [1, 3, 320, 320] | INT8 | 18.7 | 26.9 | 49.2 | 56.6 | 14.6 | 20.8 | 24.6 | +| LPRNet | lprnet | [1, 3, 24, 94] | FP16 | 143.2 | 420.6 | 586.4 | 647.8 | 30.6(INT8) | 47.6(INT8) | 30.1(INT8) | +| PPOCR-Det | ppocrv4_det | [1, 3, 480, 480] | INT8 | 22.1 | 28.0 | 50.7 | 64.3 | 11.0 | 16.1 | 14.2 | +| PPOCR-Rec | ppocrv4_rec | [1, 3, 48, 320] | FP16 | 19.5 | 54.3 | 73.9 | 96.8 | 1.0 | 1.6 | 6.7 | +| lite_transformer | lite-transformer-encoder-16 | embedding-256, token-16 | FP16 | 337.5 | 725.8 | 867.6 | 784.1 | 22.7 | 35.4 | 98.3 | +| | lite-transformer-decoder-16 | embedding-256, token-16 | FP16 | 142.5 | 252.0 | 343.8 | 272.3 | 48.0 | 65.8 | 109.9 | +| clip | clip_images | [1, 3, 224, 224] | FP16 | 2.3 | 3.4 | 6.5 | 6.7 | / | / | / | +| | clip_text | [1, 20] | FP16 | 29.7 | 66.6 | 96.0 | 63.7 | / | / | / | +| wav2vec2 | wav2vec2_base_960h_20s | 20s audio | FP16 | RTF
0.817 | RTF
0.323 | RTF
0.133 | RTF
0.073 | / | / | / | +| whisper | whisper_base_20s | 20s audio | FP16 | RTF
1.178 | RTF
0.420 | RTF
0.215 | RTF
0.218 | / | / | / | +| zipformer | zipformer-bilingual-zh-en-t | streaming audio | FP16 | RTF
0.196 | RTF
0.116 | RTF
0.065 | RTF
0.082 | / | / | / | +| yamnet | yamnet_3s | 3s audio | FP16 | RTF
0.013 | RTF
0.008 | RTF
0.004 | RTF
0.005 | / | / | / | +| mms_tts | mms_tts_eng_200 | token-200 | FP16 | RTF
0.311 | RTF
0.138 | RTF
0.069 | RTF
0.069 | / | / | / | - 该性能数据基于各平台的最大NPU频率进行测试 - 该性能数据指模型推理的耗时, 不包含前后处理的耗时 @@ -153,6 +161,7 @@ export ANDROID_NDK_PATH=~/opts/ndk/android-ndk-r18b | 版本 | 说明 | | ----- | ------------------------------------------------------------ | +| 2.3.0 | 新增 yolo11、zipformer、mms_tts 等示例 | | 2.2.0 | 添加新例程 wav2vec, mobilesam. 更新部分模型的导出说明 | | 2.1.0 | 新例程添加, 包含 yolov8_pose, yolov8_obb, yolov10, yolo_world, clip, whisper, yamnet
部分模型暂不支持 `RK1808`, `RV1109`, `RV1126` 平台, 将在下个版本添加支持 | | 2.0.0 | 新增所有示例`RK3576`平台的支持
支持`RK1808`, `RV1109`, `RV1126`平台 | @@ -167,6 +176,7 @@ RKNN Model Zoo 的例程基于当前最新的 RKNPU SDK 进行验证。若使用 | 版本 | RKNPU2 SDK | RKNPU1 SDK | | ----- | ---------- | ---------- | +| 2.3.0 | >=2.3.0 | >=1.7.5 | | 2.2.0 | >=2.2.0 | >=1.7.5 | | 2.1.0 | >=2.1.0 | >=1.7.5 | | 2.0.0 | >=2.0.0 | >=1.7.5 | diff --git a/examples/LPRNet/README.md b/examples/LPRNet/README.md index b2033f18..1ebbe36e 100644 --- a/examples/LPRNet/README.md +++ b/examples/LPRNet/README.md @@ -30,8 +30,7 @@ https://github.com/sirius-ai/LPRNet_Pytorch/ ## 2. Current Support Platform -RK3562, RK3566, RK3568, RK3576, RK3588, RK1808, RV1109, RV1126 - +RV1103, RV1106, RK3562, RK3566, RK3568, RK3576, RK3588, RV1109, RV1126, RK1808, RK3399PRO ## 3. Pretrained Model @@ -150,6 +149,12 @@ export LD_LIBRARY_PATH=./lib ./rknn_lprnet_demo model/lprnet.rknn model/test.jpg ``` +- RV1106/1103 LD_LIBRARY_PATH must specify as the absolute path. Such as + + ```sh + export LD_LIBRARY_PATH=/userdata/rknn_LPRNet_demo/lib + ``` + ## 8. Expected Results diff --git a/examples/LPRNet/cpp/CMakeLists.txt b/examples/LPRNet/cpp/CMakeLists.txt index a0bc231f..92205223 100644 --- a/examples/LPRNet/cpp/CMakeLists.txt +++ b/examples/LPRNet/cpp/CMakeLists.txt @@ -9,9 +9,6 @@ if (ENABLE_ASAN) set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") endif () -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) - #opencv if(CMAKE_SIZEOF_VOID_P EQUAL 8) message(STATUS "64bit") @@ -41,7 +38,12 @@ message(STATUS OpenCV_LIBS=${OpenCV_LIBS}) set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") -if (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(lprnet_file rknpu2/lprnet_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +elseif (TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") set(lprnet_file rknpu1/lprnet.cc) else() set(lprnet_file rknpu2/lprnet.cc) @@ -50,6 +52,9 @@ endif() file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) + add_executable(${PROJECT_NAME} main.cc ${lprnet_file} diff --git a/examples/LPRNet/cpp/lprnet.h b/examples/LPRNet/cpp/lprnet.h index bf16c76c..ca1236cc 100644 --- a/examples/LPRNet/cpp/lprnet.h +++ b/examples/LPRNet/cpp/lprnet.h @@ -8,15 +8,36 @@ #include #include +#define MODEL_HEIGHT 24 +#define MODEL_WIDTH 94 +#define OUT_ROWS 68 +#define OUT_COLS 18 + +#if defined(RV1106_1103) +#include "dma_alloc.hpp" +typedef struct +{ + char *dma_buf_virt_addr; + int dma_buf_fd; + int size; +} rknn_dma_buf; +#endif + typedef struct { rknn_context rknn_ctx; rknn_input_output_num io_num; rknn_tensor_attr *input_attrs; rknn_tensor_attr *output_attrs; +#if defined(RV1106_1103) + rknn_tensor_mem *input_mems[1]; + rknn_tensor_mem *output_mems[1]; + rknn_dma_buf img_dma_buf; +#endif int model_channel; int model_width; int model_height; + bool is_quant; } rknn_app_context_t; typedef struct diff --git a/examples/LPRNet/cpp/main.cc b/examples/LPRNet/cpp/main.cc index ae3f3436..86463528 100644 --- a/examples/LPRNet/cpp/main.cc +++ b/examples/LPRNet/cpp/main.cc @@ -23,43 +23,56 @@ #include "lprnet.h" #include "image_utils.h" #include 
"file_utils.h" +#include "opencv2/opencv.hpp" + +static void image_preprocess(image_buffer_t src_image) +{ + cv::Mat img_ori = cv::Mat(src_image.height, src_image.width, CV_8UC3, (uint8_t *)src_image.virt_addr); + cv::resize(img_ori, img_ori, cv::Size(MODEL_WIDTH, MODEL_HEIGHT)); + cv::cvtColor(img_ori, img_ori, cv::COLOR_RGB2BGR); + src_image.virt_addr = img_ori.data; +} /*------------------------------------------- Main Function -------------------------------------------*/ -int main(int argc, char** argv) +int main(int argc, char **argv) { - if (argc != 3) { + if (argc != 3) + { printf("%s \n", argv[0]); return -1; } - const char* model_path = argv[1]; - const char* image_path = argv[2]; - + const char *model_path = argv[1]; + const char *image_path = argv[2]; int ret; rknn_app_context_t rknn_app_ctx; memset(&rknn_app_ctx, 0, sizeof(rknn_app_context_t)); + image_buffer_t src_image; + memset(&src_image, 0, sizeof(image_buffer_t)); + lprnet_result result; ret = init_lprnet_model(model_path, &rknn_app_ctx); - if (ret != 0) { + if (ret != 0) + { printf("init_lprnet_model fail! ret=%d model_path=%s\n", ret, model_path); - return -1; + goto out; } - image_buffer_t src_image; - memset(&src_image, 0, sizeof(image_buffer_t)); ret = read_image(image_path, &src_image); - if (ret != 0) { + if (ret != 0) + { printf("read image fail! ret=%d image_path=%s\n", ret, image_path); - return -1; + goto out; } - - lprnet_result result; + // Image preprocessing + image_preprocess(src_image); ret = inference_lprnet_model(&rknn_app_ctx, &src_image, &result); - if (ret != 0) { + if (ret != 0) + { printf("init_lprnet_model fail! ret=%d\n", ret); goto out; } @@ -68,12 +81,19 @@ int main(int argc, char** argv) out: ret = release_lprnet_model(&rknn_app_ctx); - if (ret != 0) { + if (ret != 0) + { printf("release_lprnet_model fail! 
ret=%d\n", ret); } - if (src_image.virt_addr != NULL) { + if (src_image.virt_addr != NULL) + { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else free(src_image.virt_addr); +#endif } return 0; diff --git a/examples/LPRNet/cpp/rknpu1/lprnet.cc b/examples/LPRNet/cpp/rknpu1/lprnet.cc index a5d690b1..bb038b79 100644 --- a/examples/LPRNet/cpp/rknpu1/lprnet.cc +++ b/examples/LPRNet/cpp/rknpu1/lprnet.cc @@ -10,11 +10,25 @@ static void dump_tensor_attr(rknn_tensor_attr *attr) { - printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " - "zp=%d, scale=%f\n", - attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], - attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), - get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); + char dims_str[100]; + char temp_str[100]; + memset(dims_str, 0, sizeof(dims_str)); + for (int i = attr->n_dims - 1; i >= 0; --i) + { + strcpy(temp_str, dims_str); + if (i == 0) + { + sprintf(dims_str, "%s%d", temp_str, attr->dims[i]); + } + else + { + sprintf(dims_str, "%s%d, ", temp_str, attr->dims[i]); + } + } + + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, dims_str, attr->n_elems, attr->size, get_format_string(attr->fmt), + get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); } int init_lprnet_model(const char *model_path, rknn_app_context_t *app_ctx) @@ -139,18 +153,12 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, memset(inputs, 0, sizeof(inputs)); memset(outputs, 0, sizeof(outputs)); - // Pre Process - cv::Mat img_ori = cv::Mat(src_img->height, src_img->width, CV_8UC3, (uint8_t *)src_img->virt_addr); - cv::Mat img_pre; - cv::resize(img_ori, img_pre, cv::Size(94, 24)); - cv::cvtColor(img_pre, img_pre, cv::COLOR_RGB2BGR); - // Set Input Data inputs[0].index = 0; inputs[0].type = RKNN_TENSOR_UINT8; inputs[0].fmt = RKNN_TENSOR_NHWC; inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; - inputs[0].buf = img_pre.data; + inputs[0].buf = (uint8_t *)src_img->virt_addr; ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); if (ret < 0) @@ -179,35 +187,31 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, // Post Process std::vector no_repeat_blank_label{}; - float prebs[18]; + float prebs[OUT_COLS]; int pre_c; - for (int x = 0; x < 18; x++) // Traverse 18 license plate positions + for (int x = 0; x < OUT_COLS; x++) // Traverse OUT_COLS license plate positions { float *ptr = (float *)outputs[0].buf; - float preb[68]; - for (int y = 0; y < 68; y++) // Traverse 68 string positions + float preb[OUT_ROWS]; + for (int y = 0; y < OUT_ROWS; y++) // Traverse OUT_ROWS string positions { preb[y] = ptr[x]; - ptr += 18; + ptr += OUT_COLS; } - int max_num_index = std::max_element(preb, preb + 68) - preb; + int max_num_index = std::max_element(preb, preb + OUT_ROWS) - preb; prebs[x] = max_num_index; } // Remove duplicates and blanks pre_c = prebs[0]; - if (pre_c != 67) + if (pre_c != OUT_ROWS - 1) { no_repeat_blank_label.push_back(pre_c); } for (int value : prebs) { - if (value == 67 or value == pre_c) + if (value == OUT_ROWS - 1 or value == pre_c) { - if (value == 67 or value == pre_c) - { 
- pre_c = value; - } continue; } no_repeat_blank_label.push_back(value); diff --git a/examples/LPRNet/cpp/rknpu2/lprnet.cc b/examples/LPRNet/cpp/rknpu2/lprnet.cc index 450c6653..39a40171 100644 --- a/examples/LPRNet/cpp/rknpu2/lprnet.cc +++ b/examples/LPRNet/cpp/rknpu2/lprnet.cc @@ -10,11 +10,16 @@ static void dump_tensor_attr(rknn_tensor_attr *attr) { - printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + std::string shape_str = attr->n_dims < 1 ? "" : std::to_string(attr->dims[0]); + for (int i = 1; i < attr->n_dims; ++i) + { + shape_str += ", " + std::to_string(attr->dims[i]); + } + + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " "zp=%d, scale=%f\n", - attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], - attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), - get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); + attr->index, attr->name, attr->n_dims, shape_str.c_str(), attr->n_elems, attr->size, get_format_string(attr->fmt), + get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); } int init_lprnet_model(const char *model_path, rknn_app_context_t *app_ctx) @@ -139,18 +144,12 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, memset(inputs, 0, sizeof(inputs)); memset(outputs, 0, sizeof(outputs)); - // Pre Process - cv::Mat img_ori = cv::Mat(src_img->height, src_img->width, CV_8UC3, (uint8_t *)src_img->virt_addr); - cv::Mat img_pre; - cv::resize(img_ori, img_pre, cv::Size(94, 24)); - cv::cvtColor(img_pre, img_pre, cv::COLOR_RGB2BGR); - // Set Input Data inputs[0].index = 0; inputs[0].type = RKNN_TENSOR_UINT8; inputs[0].fmt = RKNN_TENSOR_NHWC; inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; - inputs[0].buf = img_pre.data; + inputs[0].buf = (uint8_t *)src_img->virt_addr; ret = rknn_inputs_set(app_ctx->rknn_ctx, 1, inputs); if (ret < 0) @@ -179,35 +178,31 @@ int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, // Post Process std::vector no_repeat_blank_label{}; - float prebs[18]; + float prebs[OUT_COLS]; int pre_c; - for (int x = 0; x < 18; x++) // Traverse 18 license plate positions + for (int x = 0; x < OUT_COLS; x++) // Traverse OUT_COLS license plate positions { float *ptr = (float *)outputs[0].buf; - float preb[68]; - for (int y = 0; y < 68; y++) // Traverse 68 string positions + float preb[OUT_ROWS]; + for (int y = 0; y < OUT_ROWS; y++) // Traverse OUT_ROWS string positions { preb[y] = ptr[x]; - ptr += 18; + ptr += OUT_COLS; } - int max_num_index = std::max_element(preb, preb + 68) - preb; + int max_num_index = std::max_element(preb, preb + OUT_ROWS) - preb; prebs[x] = max_num_index; } // Remove duplicates and blanks pre_c = prebs[0]; - if (pre_c != 67) + if (pre_c != OUT_ROWS - 1) { no_repeat_blank_label.push_back(pre_c); } for (int value : prebs) { - if (value == 67 or value == pre_c) + if (value == OUT_ROWS - 1 or value == pre_c) { - if (value == 67 or value == pre_c) - { - pre_c = value; - } continue; } no_repeat_blank_label.push_back(value); diff --git a/examples/LPRNet/cpp/rknpu2/lprnet_rv1106_1103.cc b/examples/LPRNet/cpp/rknpu2/lprnet_rv1106_1103.cc new file mode 100644 index 00000000..efa91673 --- /dev/null +++ b/examples/LPRNet/cpp/rknpu2/lprnet_rv1106_1103.cc @@ -0,0 +1,273 @@ +#include +#include +#include +#include + +#include 
"lprnet.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" +#include "opencv2/opencv.hpp" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + std::string shape_str = attr->n_dims < 1 ? "" : std::to_string(attr->dims[0]); + for (int i = 1; i < attr->n_dims; ++i) + { + shape_str += ", " + std::to_string(attr->dims[i]); + } + + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, shape_str.c_str(), attr->n_elems, attr->size, get_format_string(attr->fmt), + get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_lprnet_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0, NULL); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_attrs[0].type = RKNN_TENSOR_UINT8; + // default fmt is NHWC,1106 npu only support NHWC in zero copy mode + input_attrs[0].fmt = RKNN_TENSOR_NHWC; + printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride); + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]); + if (ret < 0) + { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) + { + printf("output_attrs[0].size_with_stride=%d\n", output_attrs[i].size_with_stride); + + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]); + if (ret < 0) + { + printf("output_mems rknn_set_io_mem fail! 
ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_lprnet_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + for (int i = 0; i < app_ctx->io_num.n_input; i++) + { + if (app_ctx->input_mems[i] != NULL) + { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + if (app_ctx->output_mems[i] != NULL) + { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + } + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_lprnet_model(rknn_app_context_t *app_ctx, image_buffer_t *src_img, lprnet_result *out_result) +{ + int ret; + // Copy input data to input tensor memory + int width = app_ctx->input_attrs[0].dims[2]; + int stride = app_ctx->input_attrs[0].w_stride; + + if (width == stride) + { + memcpy(app_ctx->input_mems[0]->virt_addr, src_img->virt_addr, width * app_ctx->input_attrs[0].dims[1] * app_ctx->input_attrs[0].dims[3]); + } + else + { + int height = app_ctx->input_attrs[0].dims[1]; + int channel = app_ctx->input_attrs[0].dims[3]; + // copy from src to dst with stride + uint8_t *src_ptr = src_img->virt_addr; + uint8_t *dst_ptr = (uint8_t *)app_ctx->input_mems[0]->virt_addr; + // width-channel elements + int src_wc_elems = width * channel; + int dst_wc_elems = stride * channel; + for (int h = 0; h < height; ++h) + { + memcpy(dst_ptr, src_ptr, src_wc_elems); + src_ptr += src_wc_elems; + dst_ptr += dst_wc_elems; + } + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Post Processs + std::vector no_repeat_blank_label{}; + int8_t prebs[OUT_COLS]; + int pre_c; + for (int x = 0; x < OUT_COLS; x++) // Traverse OUT_COLS license plate positions + { + int8_t *ptr = (int8_t *)app_ctx->output_mems[0]->virt_addr; + int8_t preb[OUT_ROWS]; + for (int y = 0; y < OUT_ROWS; y++) // Traverse OUT_ROWS string positions + { + preb[y] = ptr[x]; + ptr += OUT_COLS; + } + int max_num_index = std::max_element(preb, preb + OUT_ROWS) - preb; + prebs[x] = max_num_index; + } + + // Remove duplicates and blanks + pre_c = prebs[0]; + if (pre_c != OUT_ROWS - 1) + { + no_repeat_blank_label.push_back(pre_c); + } + for (int value : prebs) + { + if (value == OUT_ROWS - 1 or value == pre_c) + { + continue; + } + no_repeat_blank_label.push_back(value); + pre_c = value; + } + + // The license plate is converted into a string according to the dictionary + out_result->plate_name.clear(); + for (int hh : no_repeat_blank_label) + { + out_result->plate_name += plate_code[hh]; + } + + return ret; +} \ No newline at end of file diff --git a/examples/LPRNet/python/lprnet.py b/examples/LPRNet/python/lprnet.py index f48c65d2..8dd7fa98 100644 --- a/examples/LPRNet/python/lprnet.py +++ b/examples/LPRNet/python/lprnet.py @@ -54,7 +54,7 @@ def decode(preds, CHARS): args = parser.parse_args() # Create RKNN object - rknn = RKNN(verbose=True) + rknn = RKNN(verbose=False) # Load RKNN model ret = rknn.load_rknn(args.model_path) diff --git a/examples/mms_tts/README.md b/examples/mms_tts/README.md new file mode 100644 index 00000000..ad122dd5 --- /dev/null +++ b/examples/mms_tts/README.md @@ -0,0 +1,216 @@ +# MMS-TTS + +## Table of contents + +- [1. Description](#1-description) +- [2. Current Support Platform](#2-current-support-platform) +- [3. Pretrained Model](#3-pretrained-model) +- [4. Convert to RKNN](#4-convert-to-rknn) +- [5. Python Demo](#5-python-demo) +- [6. Android Demo](#6-android-demo) + - [6.1 Compile and Build](#61-compile-and-build) + - [6.2 Push demo files to device](#62-push-demo-files-to-device) + - [6.3 Run demo](#63-run-demo) +- [7. Linux Demo](#7-linux-demo) + - [7.1 Compile \&\& Build](#71-compile-and-build) + - [7.2 Push demo files to device](#72-push-demo-files-to-device) + - [7.3 Run demo](#73-run-demo) +- [8. Expected Results](#8-expected-results) + + + +## 1. Description + +MMS-TTS (Massively Multilingual Speech, Text to Speech) is part of Facebook's Massively Multilingual Speech project, aiming to provide speech technology across a diverse range of languages. + +The model used in this example comes from the following open source projects: + +https://huggingface.co/facebook/mms-tts-eng + + + +## 2. Current Support Platform + +RK3566, RK3568, RK3588, RK3562, RK3576 + + + +## 3. Pretrained Model + +Download link: + +[mms_tts_eng_encoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_encoder_200.onnx)
[mms_tts_eng_decoder_200.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_decoder_200.onnx) + +Download with shell command: + +``` +cd model +./download_model.sh +``` + +**Note: For exporting mms_tts onnx models, please refer to [export_onnx.md](./export_onnx.md)** + +
+## 4. Convert to RKNN + +*Usage:* + +```shell +cd python +python convert.py + +# such as: +python convert.py ../model/mms_tts_eng_encoder_200.onnx rk3588 +# output model will be saved as ../model/mms_tts_eng_encoder_200.rknn + +python convert.py ../model/mms_tts_eng_decoder_200.onnx rk3588 +# output model will be saved as ../model/mms_tts_eng_decoder_200.rknn +``` + +*Description:* + +- ``: Specify ONNX model path. +- ``: Specify NPU platform name. Support Platform refer [here](#2-current-support-platform). +- `(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `fp`. +- `(optional)`: Specify save path for the RKNN model; defaults to the same directory as the ONNX model. + +
+## 5. Python Demo + +*Usage:* + +```shell +cd python +# Inference with ONNX model +python mms_tts.py --encoder_model_path --decoder_model_path + +# Inference with RKNN model +python mms_tts.py --encoder_model_path --decoder_model_path --target +``` +*Description:* +- : Specify NPU platform name. Support Platform refer [here](#2-current-support-platform). +- : Specify model path. + +
+## 6. Android Demo + +#### 6.1 Compile and Build + +*Usage:* + +```sh +# go back to the rknn_model_zoo root directory +cd ../../ +export ANDROID_NDK_PATH= + +./build-android.sh -t -a -d mms_tts + +# such as +./build-android.sh -t rk3588 -a arm64-v8a -d mms_tts +``` + +*Description:* +- ``: Specify Android NDK path. +- ``: Specify NPU platform name. Support Platform refer [here](#2-current-support-platform). +- ``: Specify device system architecture. To query device architecture, refer to the following command: + ```shell + # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should be shown in the log. + adb shell cat /proc/version + ``` + +#### 6.2 Push demo files to device + +With the device connected via USB, push the demo files to the device: + +```shell +adb root +adb remount +adb push install/_android_/rknn_mms_tts_demo/ /data/ +``` + +#### 6.3 Run demo + +```sh +adb shell +cd /data/rknn_mms_tts_demo + +export LD_LIBRARY_PATH=./lib +./rknn_mms_tts_demo model/mms_tts_eng_encoder_200.rknn model/mms_tts_eng_decoder_200.rknn '"Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel."' +``` + +- After running, the result is saved as `output.wav`. To check the result on the host PC, pull it back with the following command: + + ```sh + adb pull /data/rknn_mms_tts_demo/output.wav + ``` + +
+## 7. Linux Demo + +#### 7.1 Compile and Build + +*Usage:* + +```shell +# go back to the rknn_model_zoo root directory +cd ../../ + +# if GCC_COMPILER not found while building, please set GCC_COMPILER path +(optional)export GCC_COMPILER= + +./build-linux.sh -t -a -d mms_tts + +# such as +./build-linux.sh -t rk3588 -a aarch64 -d mms_tts +``` + +*Description:* + +- ``: Specify GCC_COMPILER path. +- `` : Specify NPU platform name. Support Platform refer [here](#2-current-support-platform). +- ``: Specify device system architecture. To query device architecture, refer to the following command: + + ```shell + # Query architecture. For Linux, ['aarch64' or 'armhf'] should be shown in the log.
+ adb shell cat /proc/version + ``` + +#### 7.2 Push demo files to device + +- If the device is connected via USB, push the demo files to the device: + +```shell +adb push install/_linux_/rknn_mms_tts_demo/ /data/ +``` + +- For other boards, use `scp` or other approaches to push all files under `install/_linux_/rknn_mms_tts_demo/` to `/data`. + +#### 7.3 Run demo + +```sh +adb shell +cd /data/rknn_mms_tts_demo + +export LD_LIBRARY_PATH=./lib +./rknn_mms_tts_demo model/mms_tts_eng_encoder_200.rknn model/mms_tts_eng_decoder_200.rknn '"Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel."' +``` + +- After running, the result is saved as `output.wav`. To check the result on the host PC, pull it back with the following command: + + ``` + adb pull /data/rknn_mms_tts_demo/output.wav + ``` + +
+## 8. Expected Results + +This example will generate a wav file corresponding to the input text: "Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel.", such as: + +[output.wav](output.wav) + + +- Note: Different platforms, different versions of tools and drivers may have slightly different results. \ No newline at end of file
diff --git a/examples/mms_tts/cpp/CMakeLists.txt b/examples/mms_tts/cpp/CMakeLists.txt new file mode 100644 index 00000000..6c704f1b --- /dev/null +++ b/examples/mms_tts/cpp/CMakeLists.txt @@ -0,0 +1,56 @@ +cmake_minimum_required(VERSION 3.10) + +project(rknn_mms_tts_demo) + +if (ENABLE_ASAN) + message(STATUS "BUILD WITH ADDRESS SANITIZER") + set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") +endif () + +set(rknpu_mms_tts_file rknpu2/mms_tts.cc) +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(rknpu_mms_tts_file rknpu1/mms_tts.cc) +endif() + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) + +set(CMAKE_INSTALL_RPATH "$ORIGIN/lib") + +file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) + +add_executable(${PROJECT_NAME} + main.cc + process.cc + ${rknpu_mms_tts_file} +) + +target_link_libraries(${PROJECT_NAME} + fileutils + audioutils + ${LIBRKNNRT} +) + +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_libraries(${PROJECT_NAME} + log +) +endif() + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${PROJECT_NAME} Threads::Threads) +endif() + +target_include_directories(${PROJECT_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${LIBRKNNRT_INCLUDES} + ${LIBTIMER_INCLUDES} +) + +install(TARGETS ${PROJECT_NAME} DESTINATION .) +file(GLOB RKNN_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../model/*.rknn") +install(FILES ${RKNN_FILES} DESTINATION model) \ No newline at end of file
diff --git a/examples/mms_tts/cpp/main.cc b/examples/mms_tts/cpp/main.cc new file mode 100644 index 00000000..c980fd64 --- /dev/null +++ b/examples/mms_tts/cpp/main.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include +#include +#include +#include +#include "mms_tts.h" +#include +#include +#include + +/*------------------------------------------- + Main Function +-------------------------------------------*/ +int main(int argc, char **argv) +{ + if (argc != 4) + { + printf("%s \n", argv[0]); + return -1; + } + + const char *encoder_path = argv[1]; + const char *decoder_path = argv[2]; + const char *input_text = argv[3]; + const char *audio_save_path = "output.wav"; + + int ret; + TIMER timer; + rknn_mms_tts_context_t rknn_app_ctx; + std::map vocab; + std::vector input_ids(MAX_LENGTH, 0); + std::vector attention_mask(MAX_LENGTH, 0); + float infer_time = 0.0; + int predicted_lengths_max_real = 0; + float audio_length = 0.0; + float max_audio_length = (float)PREDICTED_LENGTHS_MAX * PREDICTED_BATCH / SAMPLE_RATE; + float rtf = 0.0; + memset(&rknn_app_ctx, 0, sizeof(rknn_mms_tts_context_t)); + + timer.tik(); + ret = init_mms_tts_model(encoder_path, &rknn_app_ctx.encoder_context); + if (ret != 0) + { + printf("init_mms_tts_model fail! ret=%d encoder_path=%s\n", ret, encoder_path); + goto out; + } + timer.tok(); + timer.print_time("init_mms_tts_encoder_model"); + + timer.tik(); + ret = init_mms_tts_model(decoder_path, &rknn_app_ctx.decoder_context); + if (ret != 0) + { + printf("init_mms_tts_model fail! ret=%d decoder_path=%s\n", ret, decoder_path); + goto out; + } + timer.tok(); + timer.print_time("init_mms_tts_decoder_model"); + + // set data + timer.tik(); + read_vocab(vocab); + timer.tok(); + timer.print_time("read_vocab"); + + timer.tik(); + preprocess_input(input_text, vocab, VOCAB_NUM, MAX_LENGTH, input_ids, attention_mask); + ret = inference_mms_tts_model(&rknn_app_ctx, input_ids, attention_mask, predicted_lengths_max_real, audio_save_path); + if (ret != 0) + { + printf("inference_mms_tts_model fail! ret=%d\n", ret); + goto out; + } + timer.tok(); + timer.print_time("inference_mms_tts_model"); + + infer_time = timer.get_time() / 1000.0; // sec + audio_length = max_audio_length; // sec + rtf = infer_time / audio_length; + printf("\nReal Time Factor (RTF): %.3f / %.3f = %.3f\n", infer_time, audio_length, rtf); + printf("\nThe output wav file is saved: %s\n", audio_save_path); + +out: + + ret = release_mms_tts_model(&rknn_app_ctx.encoder_context); + if (ret != 0) + { + printf("release_mms_tts_model encoder_context fail! ret=%d\n", ret); + } + ret = release_mms_tts_model(&rknn_app_ctx.decoder_context); + if (ret != 0) + { + printf("release_mms_tts_model decoder_context fail! ret=%d\n", ret); + } + + return 0; +}
diff --git a/examples/mms_tts/cpp/mms_tts.h b/examples/mms_tts/cpp/mms_tts.h new file mode 100644 index 00000000..a13f9857 --- /dev/null +++ b/examples/mms_tts/cpp/mms_tts.h @@ -0,0 +1,42 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _RKNN_DEMO_MMS_TTS_H_ +#define _RKNN_DEMO_MMS_TTS_H_ + +#include "rknn_api.h" +#include +#include +#include +#include "process.h" + +typedef struct +{ + rknn_context rknn_ctx; + rknn_input_output_num io_num; + rknn_tensor_attr *input_attrs; + rknn_tensor_attr *output_attrs; +} rknn_app_context_t; + +typedef struct +{ + rknn_app_context_t encoder_context; + rknn_app_context_t decoder_context; +} rknn_mms_tts_context_t; + +int init_mms_tts_model(const char *model_path, rknn_app_context_t *app_ctx); +int release_mms_tts_model(rknn_app_context_t *app_ctx); +int inference_mms_tts_model(rknn_mms_tts_context_t *app_ctx, std::vector &input_ids, std::vector &attention_mask, int &predicted_lengths_max_real, const char *audio_save_path); + +#endif //_RKNN_DEMO_MMS_TTS_H_ \ No newline at end of file diff --git a/examples/mms_tts/cpp/process.cc b/examples/mms_tts/cpp/process.cc new file mode 100644 index 00000000..d66e24cd --- /dev/null +++ b/examples/mms_tts/cpp/process.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
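+
+// process.cc implements the text preprocessing and the "middle process" that
+// bridges the encoder and decoder models:
+//   - read_vocab / preprocess_input: lower-case the input text, map each
+//     character to its id in the character vocabulary, interleave the pad id 0
+//     between tokens and build the attention mask of length MAX_LENGTH.
+//   - middle_process: expand the encoder's log-durations into the alignment
+//     matrix `attn` and the output padding mask consumed by the decoder,
+//     mirroring the duration-to-alignment step of the original VITS forward().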
+ +#include "mms_tts.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int strlenarr(const char *arr) +{ + int count = 0; + while (*arr++) + count++; + return count; +} + +static void compute_output_padding_mask(std::vector &output_padding_mask, int predicted_lengths_max_real, int predicted_lengths_max) +{ + std::transform(output_padding_mask.begin(), output_padding_mask.end(), output_padding_mask.begin(), + [predicted_lengths_max_real](int i) + { + return (float)(i < predicted_lengths_max_real); + }); +} + +static void compute_attn_mask(std::vector &output_padding_mask, std::vector &input_padding_mask, + std::vector &attn_mask, int predicted_lengths_max, int input_padding_mask_size) +{ + std::transform(attn_mask.begin(), attn_mask.end(), attn_mask.begin(), + [&output_padding_mask, &input_padding_mask, predicted_lengths_max, input_padding_mask_size](int index) + { + int i = index / input_padding_mask_size; + int j = index % input_padding_mask_size; + return int(output_padding_mask[i] * input_padding_mask[j]); + }); +} + +static void compute_duration(const std::vector &exp_log_duration, const std::vector &input_padding_mask, + std::vector &duration, float length_scale) +{ + std::transform(exp_log_duration.begin(), exp_log_duration.end(), input_padding_mask.begin(), duration.begin(), + [length_scale](float exp_log_val, float mask_val) + { + return ceil(exp_log_val * mask_val * length_scale); + }); +} + +static void compute_valid_indices(const std::vector &cum_duration, std::vector &valid_indices, int input_padding_mask_size, int predicted_lengths_max) +{ + std::vector indices(valid_indices.size()); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(indices.begin(), indices.end(), + [cum_duration, &valid_indices, predicted_lengths_max](int index) + { + int i = index / predicted_lengths_max; + int j = index % predicted_lengths_max; + valid_indices[index] = j < cum_duration[i] ? 
1 : 0; + }); +} + +static std::vector exp_vector(const std::vector &vec) +{ + std::vector result(vec.size()); + std::transform(vec.begin(), vec.end(), result.begin(), [](float v) + { return exp(v); }); + return result; +} + +static std::vector cumsum(const std::vector &vec) +{ + std::vector result(vec.size()); + std::partial_sum(vec.begin(), vec.end(), result.begin()); + return result; +} + +static void transpose_mul(const std::vector &input, int input_rows, int input_cols, std::vector attn_mask, std::vector &output) +{ + std::vector indices(input.size()); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(indices.begin(), indices.end(), + [&input, &attn_mask, &output, input_rows, input_cols](int index) + { + int i = index / input_cols; + int j = index % input_cols; + output[j * input_rows + i] = (float)(input[index] * attn_mask[j * input_rows + i]); + }); +} + +static void compute_pad_indices(const std::vector &valid_indices, std::vector &sliced_indices, int input_length, int output_length) +{ + int padded_length = input_length + 1; + std::vector padded_indices(padded_length * output_length, 0); + + std::copy(valid_indices.begin(), valid_indices.end(), padded_indices.begin() + output_length); + + std::copy(padded_indices.begin(), padded_indices.begin() + input_length * output_length, sliced_indices.begin()); + + std::transform(valid_indices.begin(), valid_indices.end(), sliced_indices.begin(), + sliced_indices.begin(), std::minus()); +} + +void read_vocab(std::map &vocab) +{ + vocab = { + {' ', 19}, {'\'', 1}, {'-', 14}, {'0', 23}, {'1', 15}, {'2', 28}, {'3', 11}, {'4', 27}, {'5', 35}, {'6', 36}, {'_', 30}, {'a', 26}, + {'b', 24}, {'c', 12}, {'d', 5}, {'e', 7}, {'f', 20}, {'g', 37}, {'h', 6}, {'i', 18}, {'j', 16}, {'k', 0}, {'l', 21}, {'m', 17}, {'n', 29}, + {'o', 22}, {'p', 13}, {'q', 34}, {'r', 25}, {'s', 8}, {'t', 33}, {'u', 4}, {'v', 32}, {'w', 9}, {'x', 31}, {'y', 3}, {'z', 2}, {u'\u2013', 10}}; +} + +void preprocess_input(const char *text, std::map vocab, int vocab_size, int max_length, std::vector &input_ids, + std::vector &attention_mask) +{ + int text_len = strlenarr(text); + int input_len = 0; + + for (int i = 0; i < text_len; i++) + { + char token = tolower(text[i]); + int token_index = vocab[token]; + + if (input_len < max_length - 2) + { + input_ids[input_len++] = 0; + input_ids[input_len++] = token_index; + } + else + { + break; + } + } + + input_ids[input_len++] = 0; + + for (int i = 0; i < input_len; i++) + { + attention_mask[i] = 1; + } +} + +void middle_process(std::vector log_duration, std::vector input_padding_mask, std::vector &attn, + std::vector &output_padding_mask, int &predicted_lengths_max_real) +{ + + float speaking_rate = 1.0f; + float length_scale = 1.0f / speaking_rate; + + std::vector duration(LOG_DURATION_SIZE); + std::vector exp_log_duration = exp_vector(log_duration); + compute_duration(exp_log_duration, input_padding_mask, duration, length_scale); + + float predicted_length_sum = std::accumulate(duration.begin(), duration.end(), 0.0f); + predicted_lengths_max_real = std::max(1.0f, predicted_length_sum); + int predicted_lengths_max = PREDICTED_LENGTHS_MAX; + compute_output_padding_mask(output_padding_mask, predicted_lengths_max_real, predicted_lengths_max); + + int input_padding_mask_size = MAX_LENGTH; + std::vector attn_mask(predicted_lengths_max * input_padding_mask_size); + compute_attn_mask(output_padding_mask, input_padding_mask, attn_mask, predicted_lengths_max, input_padding_mask_size); + + std::vector cum_duration = 
cumsum(duration); + std::vector valid_indices(input_padding_mask_size * predicted_lengths_max, 0); + compute_valid_indices(cum_duration, valid_indices, input_padding_mask_size, predicted_lengths_max); + + std::vector padded_indices(input_padding_mask_size * predicted_lengths_max, 0); + compute_pad_indices(valid_indices, padded_indices, input_padding_mask_size, predicted_lengths_max); + + transpose_mul(padded_indices, input_padding_mask_size, predicted_lengths_max, attn_mask, attn); +} \ No newline at end of file diff --git a/examples/mms_tts/cpp/process.h b/examples/mms_tts/cpp/process.h new file mode 100644 index 00000000..87505c29 --- /dev/null +++ b/examples/mms_tts/cpp/process.h @@ -0,0 +1,29 @@ +#ifndef _RKNN_MMS_TTS_DEMO_PROCESS_H_ +#define _RKNN_MMS_TTS_DEMO_PROCESS_H_ + +#include "rknn_api.h" +#include "easy_timer.h" +#include "audio_utils.h" +#include + +#define VOCAB_NUM 38 +#define SAMPLE_RATE 16000 + +#define MAX_LENGTH 200 +#define PREDICTED_LENGTHS_MAX MAX_LENGTH * 2 +#define PREDICTED_BATCH 256 + +#define INPUT_IDS_SIZE 1 * MAX_LENGTH +#define ATTENTION_MASK_SIZE 1 * MAX_LENGTH +#define LOG_DURATION_SIZE 1 * 1 * MAX_LENGTH +#define INPUT_PADDING_MASK_SIZE 1 * 1 * MAX_LENGTH +#define PRIOR_MEANS_SIZE 1 * MAX_LENGTH * 192 +#define PRIOR_LOG_VARIANCES_SIZE 1 * MAX_LENGTH * 192 +#define ATTN_SIZE 1 * 1 * PREDICTED_LENGTHS_MAX *MAX_LENGTH +#define OUTPUT_PADDING_MASK_SIZE 1 * 1 * PREDICTED_LENGTHS_MAX + +void preprocess_input(const char *text, std::map vocab, int vocab_size, int max_length, std::vector &input_id, std::vector &attention_mask); +void read_vocab(std::map &vocab); +void middle_process(std::vector log_duration, std::vector input_padding_mask, std::vector &attn, std::vector &output_padding_mask, int &predicted_lengths_max_real); + +#endif //_RKNN_MMS_TTS_DEMO_PROCESS_H_ diff --git a/examples/mms_tts/cpp/rknpu2/mms_tts.cc b/examples/mms_tts/cpp/rknpu2/mms_tts.cc new file mode 100644 index 00000000..95752e37 --- /dev/null +++ b/examples/mms_tts/cpp/rknpu2/mms_tts.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "mms_tts.h" +#include "file_utils.h" +#include +#include "process.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + char dims_str[100]; + char temp_str[100]; + memset(dims_str, 0, sizeof(dims_str)); + for (int i = 0; i < attr->n_dims; i++) + { + strcpy(temp_str, dims_str); + if (i == attr->n_dims - 1) + { + sprintf(dims_str, "%s%d", temp_str, attr->dims[i]); + } + else + { + sprintf(dims_str, "%s%d, ", temp_str, attr->dims[i]); + } + } + + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, dims_str, attr->n_elems, attr->size, get_format_string(attr->fmt), + get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_mms_tts_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + rknn_context ctx = 0; + + ret = rknn_init(&ctx, (char *)model_path, model_len, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + return 0; +} + +int release_mms_tts_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_encoder_model(rknn_app_context_t *app_ctx, std::vector &input_ids, std::vector &attention_mask, + std::vector &log_duration, std::vector &input_padding_mask, std::vector &prior_means, std::vector &prior_log_variances) +{ + int ret; + int n_input = 2; + int n_output = 4; + rknn_input inputs[n_input]; + rknn_output outputs[n_output]; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_INT64; + inputs[0].size = INPUT_IDS_SIZE * sizeof(int64_t); + inputs[0].buf = (int64_t *)malloc(inputs[0].size); + memcpy(inputs[0].buf, input_ids.data(), inputs[0].size); + + inputs[1].index = 1; + inputs[1].type = RKNN_TENSOR_INT64; + inputs[1].size = ATTENTION_MASK_SIZE * sizeof(int64_t); + inputs[1].buf = (int64_t *)malloc(inputs[1].size); + memcpy(inputs[1].buf, attention_mask.data(), inputs[1].size); + + ret = rknn_inputs_set(app_ctx->rknn_ctx, n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + goto out; + } + + // Run + ret = rknn_run(app_ctx->rknn_ctx, NULL); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + for (int i = 0; i < n_output; i++) + { + outputs[i].want_float = 1; + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + memcpy(log_duration.data(), (float *)outputs[0].buf, LOG_DURATION_SIZE * sizeof(float)); + memcpy(input_padding_mask.data(), (float *)outputs[1].buf, INPUT_PADDING_MASK_SIZE * sizeof(float)); + memcpy(prior_means.data(), (float *)outputs[2].buf, PRIOR_MEANS_SIZE * sizeof(float)); + memcpy(prior_log_variances.data(), (float *)outputs[3].buf, PRIOR_LOG_VARIANCES_SIZE * sizeof(float)); + +out: + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, n_output, outputs); + for (int i = 0; i < n_input; i++) + { + if (inputs[i].buf != NULL) + { + free(inputs[i].buf); + } + } + + return ret; +} + +int inference_decoder_model(rknn_app_context_t *app_ctx, std::vector attn, std::vector output_padding_mask, + std::vector prior_means, std::vector prior_log_variances, std::vector &output_wav_data) +{ + int ret; + int n_input = 4; + int n_output = 1; + rknn_input inputs[n_input]; + rknn_output outputs[n_output]; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_FLOAT32; + inputs[0].size = ATTN_SIZE * sizeof(float); + inputs[0].buf = (float *)malloc(inputs[0].size); + inputs[0].fmt = RKNN_TENSOR_NHWC; + memcpy(inputs[0].buf, attn.data(), inputs[0].size); + + inputs[1].index = 1; + inputs[1].type = RKNN_TENSOR_FLOAT32; + inputs[1].size = OUTPUT_PADDING_MASK_SIZE * sizeof(float); + inputs[1].buf = (float *)malloc(inputs[1].size); + memcpy(inputs[1].buf, output_padding_mask.data(), inputs[1].size); + + inputs[2].index = 2; + inputs[2].type = RKNN_TENSOR_FLOAT32; + inputs[2].size = PRIOR_MEANS_SIZE * sizeof(float); + inputs[2].buf = (float *)malloc(inputs[2].size); + memcpy(inputs[2].buf, prior_means.data(), inputs[2].size); + + inputs[3].index = 3; + inputs[3].type = RKNN_TENSOR_FLOAT32; + inputs[3].size = PRIOR_LOG_VARIANCES_SIZE * sizeof(float); + inputs[3].buf = (float *)malloc(inputs[3].size); + memcpy(inputs[3].buf, prior_log_variances.data(), inputs[3].size); + + ret = rknn_inputs_set(app_ctx->rknn_ctx, n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + goto out; + } + + // Run + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + outputs[0].want_float = 1; + ret = rknn_outputs_get(app_ctx->rknn_ctx, n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! 
ret=%d\n", ret); + goto out; + } + + memcpy(output_wav_data.data(), (float *)outputs[0].buf, output_wav_data.size() * sizeof(float)); + +out: + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, n_output, outputs); + for (int i = 0; i < n_input; i++) + { + if (inputs[i].buf != NULL) + { + free(inputs[i].buf); + } + } + + return ret; +} + +int inference_mms_tts_model(rknn_mms_tts_context_t *app_ctx, std::vector &input_ids, std::vector &attention_mask, int &predicted_lengths_max_real, const char *audio_save_path) +{ + int ret; + TIMER timer; + std::vector log_duration(LOG_DURATION_SIZE); + std::vector input_padding_mask(INPUT_PADDING_MASK_SIZE); + std::vector prior_means(PRIOR_MEANS_SIZE); + std::vector prior_log_variances(PRIOR_LOG_VARIANCES_SIZE); + std::vector attn(ATTN_SIZE); + std::vector output_padding_mask(OUTPUT_PADDING_MASK_SIZE); + std::vector output_wav_data; + + // timer.tik(); + ret = inference_encoder_model(&app_ctx->encoder_context, input_ids, attention_mask, log_duration, input_padding_mask, prior_means, prior_log_variances); + if (ret != 0) + { + printf("inference_encoder_model fail! ret=%d\n", ret); + goto out; + } + // timer.tok(); + // timer.print_time("inference_encoder_model"); + + // timer.tik(); + middle_process(log_duration, input_padding_mask, attn, output_padding_mask, predicted_lengths_max_real); + // timer.tok(); + // timer.print_time("middle_process"); + + // timer.tik(); + output_wav_data.resize(predicted_lengths_max_real * PREDICTED_BATCH); + ret = inference_decoder_model(&app_ctx->decoder_context, attn, output_padding_mask, prior_means, prior_log_variances, output_wav_data); + if (ret != 0) + { + printf("inference_decoder_model fail! ret=%d\n", ret); + goto out; + } + // timer.tok(); + // timer.print_time("inference_decoder_model"); + + // timer.tik(); + ret = save_audio(audio_save_path, output_wav_data.data(), output_wav_data.size(), SAMPLE_RATE, 1); + if (ret != 0) + { + printf("save_audio fail! ret=%d\n", ret); + goto out; + } + // timer.tok(); + // timer.print_time("save_audio"); + +out: + + return ret; +} \ No newline at end of file diff --git a/examples/mms_tts/export_onnx.md b/examples/mms_tts/export_onnx.md new file mode 100644 index 00000000..b4a31b09 --- /dev/null +++ b/examples/mms_tts/export_onnx.md @@ -0,0 +1,300 @@ + +# Guidelines for exporting mms_tts onnx models + +## Table of contents +- [Export mms\_tts onnx model](#export-mms_tts-onnx-model) +- [Special Notes](#special-notes) + + +## Export mms_tts onnx model + +1.Install environment + +```sh +pip install torch==1.10.0+cpu torchvision==0.11.0+cpu torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html + +pip install transformers==4.39.3 +``` + +2.Modify source code
+ +Copy all the code in [modeling_vits_for_export_onnx.py](./python/modeling_vits_for_export_onnx.py) to the `transformers/models/vits/modeling_vits.py` installation package path. For example, copy to `~/python3.8/site-packages/transformers/models/vits/modeling_vits.py` + +***Differences from the original source code*** + +- To handle dynamic shapes caused by `Range` operators, the model is split into `encoder/decoder/middle_process`. + ```py + # before + @add_start_docstrings_to_model_forward(VITS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=VitsModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + speaker_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is not None: + input_padding_mask = attention_mask.unsqueeze(-1).float() + else: + input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float() + + if self.config.num_speakers > 1 and speaker_id is not None: + if not 0 <= speaker_id < self.config.num_speakers: + raise ValueError(f"Set `speaker_id` in the range 0-{self.config.num_speakers - 1}.") + if isinstance(speaker_id, int): + speaker_id = torch.full(size=(1,), fill_value=speaker_id, device=self.device) + speaker_embeddings = self.embed_speaker(speaker_id).unsqueeze(-1) + else: + speaker_embeddings = None + + if labels is not None: + raise NotImplementedError("Training of VITS is not supported yet.") + + text_encoder_output = self.text_encoder( + input_ids=input_ids, + padding_mask=input_padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state + hidden_states = hidden_states.transpose(1, 2) + input_padding_mask = input_padding_mask.transpose(1, 2) + prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means + prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances + + if self.config.use_stochastic_duration_prediction: + log_duration = self.duration_predictor( + hidden_states, + input_padding_mask, + speaker_embeddings, + reverse=True, + noise_scale=self.noise_scale_duration, + ) + else: + log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings) + + length_scale = 1.0 / self.speaking_rate + duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale) + predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long() + + # Create a padding mask for the output lengths of shape (batch, 1, max_output_length) + indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device) + output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1) + output_padding_mask = 
output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype) + + # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length) + attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1) + batch_size, _, output_length, input_length = attn_mask.shape + cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1) + indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device) + valid_indices = indices.unsqueeze(0) < cum_duration + valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length) + padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1] + attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask + + # Expand prior distribution + prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2) + prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2) + + prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale + latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True) + + spectrogram = latents * output_padding_mask + waveform = self.decoder(spectrogram, speaker_embeddings) + waveform = waveform.squeeze(1) + sequence_lengths = predicted_lengths * np.prod(self.config.upsample_rates) + + if not return_dict: + outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:] + return outputs + + return VitsModelOutput( + waveform=waveform, + sequence_lengths=sequence_lengths, + spectrogram=spectrogram, + hidden_states=text_encoder_output.hidden_states, + attentions=text_encoder_output.attentions, + ) + + --------------------------------------------- + + # after + def forward_encoder( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + speaker_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is not None: + input_padding_mask = attention_mask.unsqueeze(-1).float() + else: + input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float() + + if self.config.num_speakers > 1 and speaker_id is not None: + # if not 0 <= speaker_id < self.config.num_speakers: + # raise ValueError(f"Set `speaker_id` in the range 0-{self.config.num_speakers - 1}.") + if isinstance(speaker_id, int): + speaker_id = torch.full(size=(1,), fill_value=speaker_id, device=self.device) + speaker_embeddings = self.embed_speaker(speaker_id).unsqueeze(-1) + else: + speaker_embeddings = None + speaker_embeddings = None + + if labels is not None: + raise NotImplementedError("Training of VITS is not supported yet.") + + text_encoder_output = self.text_encoder( + input_ids=input_ids, + padding_mask=input_padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = text_encoder_output[0] if not return_dict else 
text_encoder_output.last_hidden_state + hidden_states = hidden_states.transpose(1, 2) + input_padding_mask = input_padding_mask.transpose(1, 2) + prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means + prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances + + if self.config.use_stochastic_duration_prediction: + log_duration = self.duration_predictor( + hidden_states, + input_padding_mask, + speaker_embeddings, + reverse=True, + noise_scale=self.noise_scale_duration, + ) + else: + log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings) + + return log_duration, input_padding_mask, prior_means, prior_log_variances + + def forward_decoder( + self, + attn: Optional[torch.Tensor] = None, + output_padding_mask: Optional[torch.Tensor] = None, + prior_means: Optional[torch.Tensor] = None, + prior_log_variances: Optional[torch.Tensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + + # Expand prior distribution + speaker_embeddings = None + prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2) + prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2) + + prior_latents = prior_means + torch.randn(prior_means.shape) * torch.exp(prior_log_variances) * self.noise_scale + latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True) + + spectrogram = latents * output_padding_mask + + waveform = self.decoder(spectrogram, speaker_embeddings) + waveform = waveform.squeeze(1) + return waveform + + def forward(self, *args, **kwargs) -> Union[Tuple[Any], VitsModelOutput]: + if len(args) == 2: + return self.forward_encoder(*args, **kwargs) + else: + return self.forward_decoder(*args, **kwargs) + ``` +- To handle the randomness of the results, the input of random values ​​is fixed, so `randn_like_latents.npy` is generated in `export_onnx.py` before exporting the model. + ```py + # before + latents = ( + torch.randn(inputs.size(0), 2, inputs.size(2)).to(device=inputs.device, dtype=inputs.dtype) + * noise_scale + ) + + --------------------------------------------- + + # after + randn_like_latents = np.load(randn_like_latents_path) + randn_like_latents = torch.from_numpy(randn_like_latents).to(device=inputs.device, dtype=inputs.dtype) + latents = ( + randn_like_latents + * noise_scale + ) + ``` + +- Replace `GreaterOrEqual/LessOrEqual` to avoid precision loss issues. + ```py + # before + inside_interval_mask = (inputs >= -tail_bound).float().int() * (inputs <= tail_bound).float().int() + + --------------------------------------------- + + # after + inside_interval_mask = (-inputs < tail_bound).float().int() * (-inputs > -tail_bound).float().int() + ``` +- Replace `Cumsum` to avoid failure in converting onnx model to rknn model. + ```py + # before + cumwidths = torch.cumsum(widths, dim=-1) + + --------------------------------------------- + + # after + cumwidths = torch.zeros_like(widths) + for i in range(widths.size(-1)): + if i == 0: + cumwidths[..., i] = widths[..., i] + else: + cumwidths[..., i] = cumwidths[..., i - 1] + widths[..., i] + + ``` + +- Replace the operation of obtaining element values similar to `a[indices]` to avoid generating dynamic shape. 
+ ```py + # before + outputs[outside_interval_mask] = inputs[outside_interval_mask] + + --------------------------------------------- + + # after + outputs = torch.zeros_like(inputs) + outputs = torch.add(outputs, outside_interval_mask.float().int() * inputs) + ``` + +3.Export onnx model +```sh +cd python +python export_onnx.py --max_length + +# such as: +python export_onnx.py --max_length 200 +``` + +*Description:* +- : Specify the maximum length of the encoder model input. Such as `100`, `200`, `300`. Default is `200`. + + +## Special Notes +1.About Python Demo +- The value of `MAX_LENGTH` in `mms_tts.py` should be modified according to the input length of the encoder model. + +2.About CPP Demo +- The value of `MAX_LENGTH` in `process.h` should be modified according to the input length of the encoder model. \ No newline at end of file diff --git a/examples/mms_tts/model/download_model.sh b/examples/mms_tts/model/download_model.sh new file mode 100644 index 00000000..bda26e41 --- /dev/null +++ b/examples/mms_tts/model/download_model.sh @@ -0,0 +1,2 @@ +wget -O mms_tts_eng_encoder_200.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_encoder_200.onnx +wget -O mms_tts_eng_decoder_200.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/mms_tts/mms_tts_eng_decoder_200.onnx \ No newline at end of file diff --git a/examples/mms_tts/output.wav b/examples/mms_tts/output.wav new file mode 100644 index 00000000..b8aa5adc Binary files /dev/null and b/examples/mms_tts/output.wav differ diff --git a/examples/mms_tts/python/convert.py b/examples/mms_tts/python/convert.py new file mode 100644 index 00000000..f6e7aa7f --- /dev/null +++ b/examples/mms_tts/python/convert.py @@ -0,0 +1,74 @@ +import sys +from rknn.api import RKNN + +DEFAULT_QUANT = False + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3576,rk3588]") + print(" dtype choose from [fp] for [rk3562,rk3566,rk3568,rk3576,rk3588]") + exit(1) + + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = model_path.replace('.onnx', '.rknn') + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + if 'encoder' in model_path: + op_target = {'7398-rs':'cpu', '5773-rs':'cpu'} + rknn.config(target_platform=platform, op_target=op_target) + else: + rknn.config(target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model 
failed!') + exit(ret) + print('done') + + # Release + rknn.release() diff --git a/examples/mms_tts/python/export_onnx.py b/examples/mms_tts/python/export_onnx.py new file mode 100644 index 00000000..4aad6935 --- /dev/null +++ b/examples/mms_tts/python/export_onnx.py @@ -0,0 +1,83 @@ + +import os +randn_like_latents_path = '../model/randn_like_latents.npy' +absolute_randn_like_latents_path = os.path.abspath(randn_like_latents_path) +os.system(f'export randn_like_latents_path={absolute_randn_like_latents_path}') +os.environ['randn_like_latents_path'] = absolute_randn_like_latents_path + +from transformers import VitsModel, AutoTokenizer +import torch +import torch.nn as nn +import numpy as np +import argparse +import warnings +warnings.filterwarnings("ignore", category=UserWarning) +warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) + +def setup_model(model_name): + model = VitsModel.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.requires_grad_(False) + model.eval() + return model, tokenizer + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Export mms_tts onnx model', add_help=True) + parser.add_argument('--max_length', type=int, required=True, default= 200, + help='input length of encoder model, default is 200') + args = parser.parse_args() + + model, tokenizer = setup_model("facebook/mms-tts-eng") + + text = "some example text in the English language" + inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=args.max_length, truncation=True) + input_ids = inputs['input_ids'] + attention_mask = inputs['attention_mask'] + + randn_like_latents = torch.randn(input_ids.size(0), 2, input_ids.size(1)) + np.save(randn_like_latents_path, randn_like_latents) + + log_duration, input_padding_mask, prior_means, prior_log_variances = model(input_ids, attention_mask) + + speaking_rate = 1.0 + length_scale = 1.0 / speaking_rate + duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale) + predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long() + # predicted_lengths_max = predicted_lengths.max() + predicted_lengths_max = args.max_length * 2 + indices = torch.arange(predicted_lengths_max, dtype=predicted_lengths.dtype, device=predicted_lengths.device) + output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1) + output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype) + attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1) + batch_size, _, output_length, input_length = attn_mask.shape + cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1) + indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device) + valid_indices = indices.unsqueeze(0) < cum_duration + valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length) + padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1] + attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask + + save_encoder_model_path = "../model/mms_tts_eng_encoder_{}.onnx".format(args.max_length) + save_decoder_model_path = "../model/mms_tts_eng_decoder_{}.onnx".format(args.max_length) + torch.onnx.export( + model, + (input_ids, attention_mask), + save_encoder_model_path, + do_constant_folding=True, + export_params=True, + input_names=['input_ids', 'attention_mask'], + output_names=['log_duration', 'input_padding_mask', 
'prior_means', 'prior_log_variances'], + opset_version=12) + print("\nThe encoder model is saved in:", save_encoder_model_path) + + torch.onnx.export( + model, + (attn, output_padding_mask, prior_means, prior_log_variances), + save_decoder_model_path, + do_constant_folding=True, + export_params=True, + input_names=['attn', 'output_padding_mask', 'prior_means', 'prior_log_variances'], + output_names=['waveform'], + opset_version=12) + + print("The decoder model is saved in:", save_decoder_model_path) diff --git a/examples/mms_tts/python/mms_tts.py b/examples/mms_tts/python/mms_tts.py new file mode 100644 index 00000000..7b8c8401 --- /dev/null +++ b/examples/mms_tts/python/mms_tts.py @@ -0,0 +1,166 @@ +import numpy as np +from rknn.api import RKNN +import argparse +import onnxruntime +import soundfile as sf +import torch +import torch.nn as nn +onnxruntime.set_default_logger_severity(3) + +MAX_LENGTH = 200 + +vocab = {' ': 19, "'": 1, '-': 14, '0': 23, '1': 15, '2': 28, '3': 11, '4': 27, '5': 35, '6': 36, '_': 30, + 'a': 26, 'b': 24, 'c': 12, 'd': 5, 'e': 7, 'f': 20, 'g': 37, 'h': 6, 'i': 18, 'j': 16, 'k': 0, 'l': 21, 'm': 17, + 'n': 29, 'o': 22, 'p': 13, 'q': 34, 'r': 25, 's': 8, 't': 33, 'u': 4, 'v': 32, 'w': 9, 'x': 31, 'y': 3, 'z': 2, '–': 10} + +def init_model(model_path, target=None, device_id=None): + if model_path.endswith(".rknn"): + # Create RKNN object + model = RKNN() + + # Load RKNN model + print('--> Loading model') + ret = model.load_rknn(model_path) + if ret != 0: + print('Load RKNN model \"{}\" failed!'.format(model_path)) + exit(ret) + print('done') + + # init runtime environment + print('--> Init runtime environment') + ret = model.init_runtime(target=target, device_id=device_id) + if ret != 0: + print('Init runtime environment failed') + exit(ret) + print('done') + + elif model_path.endswith(".onnx"): + model = onnxruntime.InferenceSession(model_path, providers=['CPUExecutionProvider']) + + return model + +def release_model(model): + if 'rknn' in str(type(model)): + model.release() + elif 'onnx' in str(type(model)): + del model + model = None + +def run_encoder(encoder_model, input_ids_array, attention_mask_array): + if 'rknn' in str(type(encoder_model)): + log_duration, input_padding_mask, prior_means, prior_log_variances = encoder_model.inference(inputs=[input_ids_array, attention_mask_array]) + elif 'onnx' in str(type(encoder_model)): + log_duration, input_padding_mask, prior_means, prior_log_variances = encoder_model.run(None, {"input_ids": input_ids_array, "attention_mask": attention_mask_array}) + + return log_duration, input_padding_mask, prior_means, prior_log_variances + +def run_decoder(decoder_model, attn, output_padding_mask, prior_means, prior_log_variances): + if 'rknn' in str(type(decoder_model)): + waveform = decoder_model.inference(inputs=[attn, output_padding_mask, prior_means, prior_log_variances])[0] + elif 'onnx' in str(type(decoder_model)): + waveform = decoder_model.run(None, {"attn": attn, "output_padding_mask": output_padding_mask, "prior_means": prior_means, "prior_log_variances": prior_log_variances})[0] + + return waveform + +def pad_or_trim(token_id, attention_mask, max_length): + pad_len = max_length - len(token_id) + if pad_len <= 0: + token_id = token_id[:max_length] + attention_mask = attention_mask[:max_length] + + if pad_len > 0: + token_id = token_id + [0] * pad_len + attention_mask = attention_mask + [0] * pad_len + + return token_id, attention_mask + +def preprocess_input(text, vocab, max_length): + text = list(text.lower()) + 
input_id = [] + for token in text: + if token not in vocab: + continue + input_id.append(0) + input_id.append(int(vocab[token])) + input_id.append(0) + attention_mask = [1] * len(input_id) + + input_id, attention_mask = pad_or_trim(input_id, attention_mask, max_length) + + input_ids_array = np.array(input_id)[None,...] + attention_mask_array = np.array(attention_mask)[None,...] + + return input_ids_array, attention_mask_array + +def middle_process(log_duration, input_padding_mask, max_length): + log_duration = torch.tensor(log_duration) + input_padding_mask = torch.tensor(input_padding_mask) + + speaking_rate = 1 + length_scale = 1.0 / speaking_rate + duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale) + predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long() + + # Create a padding mask for the output lengths of shape (batch, 1, max_output_length) + predicted_lengths_max_real = predicted_lengths.max() + predicted_lengths_max = max_length * 2 + + indices = torch.arange(predicted_lengths_max, dtype=predicted_lengths.dtype) + output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1) + output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype) + + # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length) + attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1) + batch_size, _, output_length, input_length = attn_mask.shape + cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1) + indices = torch.arange(output_length, dtype=duration.dtype) + valid_indices = indices.unsqueeze(0) < cum_duration + valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length) + padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1] + attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask + + attn = attn.numpy() + output_padding_mask = output_padding_mask.numpy() + + return attn, output_padding_mask, predicted_lengths_max_real + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='MMS_TTS Python Demo', add_help=True) + # basic params + parser.add_argument('--encoder_model_path', type=str, required=True, + help='model path, could be .rknn or .onnx file') + parser.add_argument('--decoder_model_path', type=str, required=True, + help='model path, could be .rknn or .onnx file') + parser.add_argument('--target', type=str, + default='rk3588', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, + default=None, help='device id') + args = parser.parse_args() + + # Set inputs + text = "Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel." 
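+ # preprocess_input (defined above) lower-cases the text, maps each character
+ # through `vocab`, interleaves the pad id 0 before every character id plus a
+ # trailing 0, and pads/trims the result to MAX_LENGTH via pad_or_trim.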
+ input_ids_array, attention_mask_array = preprocess_input(text, vocab, max_length=MAX_LENGTH) + + # Init model + encoder_model = init_model(args.encoder_model_path, args.target, args.device_id) + decoder_model = init_model(args.decoder_model_path, args.target, args.device_id) + + # Encode + log_duration, input_padding_mask, prior_means, prior_log_variances = run_encoder(encoder_model, input_ids_array, attention_mask_array) + + # Middle process + attn, output_padding_mask, predicted_lengths_max_real = middle_process(log_duration, input_padding_mask, MAX_LENGTH) + + # Decode + waveform = run_decoder(decoder_model, attn, output_padding_mask, prior_means, prior_log_variances) + + # Post process + audio_save_path = "../output.wav" + sf.write(file=audio_save_path, data=np.array(waveform[0][:predicted_lengths_max_real * 256]), samplerate=16000) + print('\nThe output wav file is saved:', audio_save_path) + + # Release + release_model(encoder_model) + release_model(decoder_model) + + diff --git a/examples/mms_tts/python/modeling_vits_for_export_onnx.py b/examples/mms_tts/python/modeling_vits_for_export_onnx.py new file mode 100644 index 00000000..57bc64d3 --- /dev/null +++ b/examples/mms_tts/python/modeling_vits_for_export_onnx.py @@ -0,0 +1,1485 @@ +# coding=utf-8 +# Copyright 2023 The Kakao Enterprise Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch VITS model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + ModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_vits import VitsConfig + + +logger = logging.get_logger(__name__) + +import os +randn_like_latents_path = os.environ.get('randn_like_latents_path') + +# General docstring +_CONFIG_FOR_DOC = "VitsConfig" + + +VITS_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/mms-tts-eng", + # See all VITS models at https://huggingface.co/models?filter=vits + # and all MMS models at https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts +] + + +@dataclass +class VitsModelOutput(ModelOutput): + """ + Describes the outputs for the VITS model, with potential hidden states and attentions. + + Args: + waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + The final audio waveform predicted by the model. + sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`): + The length in samples of each element in the `waveform` batch. 
+ spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`): + The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi + GAN decoder model to obtain the final audio waveform. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + waveform: torch.FloatTensor = None + sequence_lengths: torch.FloatTensor = None + spectrogram: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class VitsTextEncoderOutput(ModelOutput): + """ + Describes the outputs for the VITS text encoder model, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The predicted mean values of the prior distribution for the latent text variables. + prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The predicted log-variance values of the prior distribution for the latent text variables. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + prior_means: torch.FloatTensor = None + prior_log_variances: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels): + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :num_channels, :]) + s_act = torch.sigmoid(in_act[:, num_channels:, :]) + acts = t_act * s_act + return acts + + +def _unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse=False, + tail_bound=5.0, + min_bin_width=1e-3, + min_bin_height=1e-3, + min_derivative=1e-3, +): + """ + This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the + `tail_bound`, the transform behaves as an identity function. + + Args: + inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Second half of the hidden-states input to the Vits convolutional flow module. + unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + reverse (`bool`, *optional*, defaults to `False`): + Whether the model is being run in reverse mode. + tail_bound (`float`, *optional* defaults to 5): + Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the + transform behaves as an identity function. + min_bin_width (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the width dimension for the piecewise rational quadratic function. + min_bin_height (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the height dimension for the piecewise rational quadratic function. + min_derivative (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the derivatives for the piecewise rational quadratic function. + Returns: + outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits + applied. + log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound` + limits applied. + """ + # Do not use GreaterOrEqual/LessOrEqual to avoid accuracy loss!! 
+ # inside_interval_mask = (inputs >= -tail_bound).float().int() * (inputs <= tail_bound).float().int() + inside_interval_mask = (-inputs < tail_bound).float().int() * (-inputs > -tail_bound).float().int() + outside_interval_mask = 1 - inside_interval_mask + + outside_interval_mask = outside_interval_mask.bool() + inside_interval_mask = inside_interval_mask.bool() + + outputs = torch.zeros_like(inputs) + log_abs_det = torch.zeros_like(inputs) + constant = np.log(np.exp(1 - min_derivative) - 1) + + unnormalized_derivatives = nn.functional.pad(unnormalized_derivatives, pad=(1, 1)) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs = torch.add(outputs, outside_interval_mask.float().int() * inputs) + + log_abs_det[outside_interval_mask] = 0.0 + inputs_inside_interval_mask = inside_interval_mask.float().int() * inputs + unnormalized_widths_inside_interval_mask = inside_interval_mask.float().int()[..., None] * unnormalized_widths + unnormalized_heights_inside_interval_mask = inside_interval_mask.float().int()[..., None] * unnormalized_heights + unnormalized_derivatives_inside_interval_mask = inside_interval_mask.float().int()[..., None] * unnormalized_derivatives + + outputs_inside_interval_mask, log_abs_det_inside_interval_mask = _rational_quadratic_spline( + inputs=inputs_inside_interval_mask.squeeze(), + unnormalized_widths=unnormalized_widths_inside_interval_mask.squeeze(), + unnormalized_heights=unnormalized_heights_inside_interval_mask.squeeze(), + unnormalized_derivatives=unnormalized_derivatives_inside_interval_mask.squeeze(), + reverse=reverse, + tail_bound=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + outputs = torch.add(outputs_inside_interval_mask, inside_interval_mask.float().int() * outputs) + log_abs_det = torch.add(log_abs_det_inside_interval_mask, inside_interval_mask.float().int() * log_abs_det) + return outputs, log_abs_det + + +def _rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse, + tail_bound, + min_bin_width, + min_bin_height, + min_derivative, +): + """ + This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the + function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`. + + Args: + inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Second half of the hidden-states input to the Vits convolutional flow module. + unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module + reverse (`bool`): + Whether the model is being run in reverse mode. + tail_bound (`float`): + Upper and lower limit bound for the rational quadratic function. 
Outside of this `tail_bound`, the + transform behaves as an identity function. + min_bin_width (`float`): + Minimum bin value across the width dimension for the piecewise rational quadratic function. + min_bin_height (`float`): + Minimum bin value across the height dimension for the piecewise rational quadratic function. + min_derivative (`float`): + Minimum bin value across the derivatives for the piecewise rational quadratic function. + Returns: + outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Hidden-states as transformed by the piecewise rational quadratic function. + log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Logarithm of the absolute value of the determinants corresponding to the `outputs`. + """ + upper_bound = tail_bound + lower_bound = -tail_bound + + if torch.min(inputs) < lower_bound or torch.max(inputs) > upper_bound: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError(f"Minimal bin width {min_bin_width} too large for the number of bins {num_bins}") + if min_bin_height * num_bins > 1.0: + raise ValueError(f"Minimal bin height {min_bin_height} too large for the number of bins {num_bins}") + + widths = nn.functional.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + + # cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = torch.zeros_like(widths) + for i in range(widths.size(-1)): + if i == 0: + cumwidths[..., i] = widths[..., i] + else: + cumwidths[..., i] = cumwidths[..., i - 1] + widths[..., i] + + cumwidths = nn.functional.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (upper_bound - lower_bound) * cumwidths + lower_bound + cumwidths[..., 0] = lower_bound + cumwidths[..., -1] = upper_bound + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + nn.functional.softplus(unnormalized_derivatives) + + heights = nn.functional.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + + # cumheights = torch.cumsum(heights, dim=-1) + cumheights = torch.zeros_like(heights) + for i in range(heights.size(-1)): + if i == 0: + cumheights[..., i] = heights[..., i] + else: + cumheights[..., i] = cumheights[..., i - 1] + heights[..., i] + + cumheights = nn.functional.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (upper_bound - lower_bound) * cumheights + lower_bound + cumheights[..., 0] = lower_bound + cumheights[..., -1] = upper_bound + heights = cumheights[..., 1:] - cumheights[..., :-1] + + bin_locations = cumheights if reverse else cumwidths + bin_locations[..., -1] += 1e-6 + # Do not use GreaterOrEqual/LessOrEqual to avoid accuracy loss!! 
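# [Editor's note, illustrative only.] The sum-of-comparisons below is a
# searchsorted-style lookup: for each input it counts how many bin edges in
# bin_locations have been passed, so bin_idx picks the spline bin containing
# that input. As above, the comparison is negated into a strict '<' to keep
# GreaterOrEqual out of the graph.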
+ # bin_idx = torch.sum(inputs.unsqueeze(-1) >= bin_locations, dim=-1) - 1 + bin_idx = torch.sum(-inputs.unsqueeze(-1) < -bin_locations, dim=-1) - 1 + bin_idx = bin_idx[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + intermediate1 = input_derivatives + input_derivatives_plus_one - 2 * input_delta + if not reverse: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta) + denominator = input_delta + intermediate1 * theta_one_minus_theta + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator) + return outputs, log_abs_det + else: + # find the roots of a quadratic equation + intermediate2 = inputs - input_cumheights + intermediate3 = intermediate2 * intermediate1 + a = input_heights * (input_delta - input_derivatives) + intermediate3 + b = input_heights * input_derivatives - intermediate3 + c = -input_delta * intermediate2 + + discriminant = b.pow(2) - 4 * a * c + # if not (discriminant >= 0).all(): + # raise RuntimeError(f"invalid discriminant {discriminant}") + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + intermediate1 * theta_one_minus_theta + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator) + return outputs, -log_abs_det + + +class VitsWaveNet(torch.nn.Module): + def __init__(self, config: VitsConfig, num_layers: int): + super().__init__() + self.hidden_size = config.hidden_size + self.num_layers = num_layers + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.dropout = nn.Dropout(config.wavenet_dropout) + + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + else: + weight_norm = nn.utils.weight_norm + + if config.speaker_embedding_size != 0: + cond_layer = torch.nn.Conv1d(config.speaker_embedding_size, 2 * config.hidden_size * num_layers, 1) + self.cond_layer = weight_norm(cond_layer, name="weight") + + for i in range(num_layers): + dilation = config.wavenet_dilation_rate**i + padding = (config.wavenet_kernel_size * dilation - dilation) // 2 + in_layer = torch.nn.Conv1d( + in_channels=config.hidden_size, + out_channels=2 * config.hidden_size, + kernel_size=config.wavenet_kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < num_layers - 1: + res_skip_channels = 2 * config.hidden_size + else: + 
res_skip_channels = config.hidden_size + + res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1) + res_skip_layer = weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, inputs, padding_mask, global_conditioning=None): + outputs = torch.zeros_like(inputs) + num_channels_tensor = torch.IntTensor([self.hidden_size]) + + if global_conditioning is not None: + global_conditioning = self.cond_layer(global_conditioning) + + for i in range(self.num_layers): + hidden_states = self.in_layers[i](inputs) + + if global_conditioning is not None: + cond_offset = i * 2 * self.hidden_size + global_states = global_conditioning[:, cond_offset : cond_offset + 2 * self.hidden_size, :] + else: + global_states = torch.zeros_like(hidden_states) + + acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0]) + acts = self.dropout(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.num_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_size, :] + inputs = (inputs + res_acts) * padding_mask + outputs = outputs + res_skip_acts[:, self.hidden_size :, :] + else: + outputs = outputs + res_skip_acts + + return outputs * padding_mask + + def remove_weight_norm(self): + if self.speaker_embedding_size != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for layer in self.in_layers: + torch.nn.utils.remove_weight_norm(layer) + for layer in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(layer) + + +class VitsPosteriorEncoder(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.out_channels = config.flow_size + + self.conv_pre = nn.Conv1d(config.spectrogram_bins, config.hidden_size, 1) + self.wavenet = VitsWaveNet(config, num_layers=config.posterior_encoder_num_wavenet_layers) + self.conv_proj = nn.Conv1d(config.hidden_size, self.out_channels * 2, 1) + + def forward(self, inputs, padding_mask, global_conditioning=None): + inputs = self.conv_pre(inputs) * padding_mask + inputs = self.wavenet(inputs, padding_mask, global_conditioning) + stats = self.conv_proj(inputs) * padding_mask + mean, log_stddev = torch.split(stats, self.out_channels, dim=1) + + sampled = (mean + torch.randn_like(mean) * torch.exp(log_stddev)) * padding_mask + return sampled, mean, log_stddev + + +# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock +class HifiGanResidualBlock(nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1): + super().__init__() + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=dilation[i], + padding=self.get_padding(kernel_size, dilation[i]), + ) + for i in range(len(dilation)) + ] + ) + self.convs2 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=self.get_padding(kernel_size, 1), + ) + for _ in range(len(dilation)) + ] + ) + + def get_padding(self, kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + def apply_weight_norm(self): + for layer in self.convs1: + nn.utils.weight_norm(layer) + for layer in self.convs2: + nn.utils.weight_norm(layer) + + def remove_weight_norm(self): + for layer in self.convs1: + nn.utils.remove_weight_norm(layer) + for layer in self.convs2: + nn.utils.remove_weight_norm(layer) + + def forward(self, hidden_states): + for conv1, conv2 in zip(self.convs1, self.convs2): 
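# Each pass: LeakyReLU -> dilated conv (convs1) -> LeakyReLU -> non-dilated
# conv (convs2), then the result is added back onto the block input
# (residual connection), as in the standard HiFi-GAN residual block.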
+ residual = hidden_states + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv1(hidden_states) + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv2(hidden_states) + hidden_states = hidden_states + residual + return hidden_states + + +class VitsHifiGan(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.num_kernels = len(config.resblock_kernel_sizes) + self.num_upsamples = len(config.upsample_rates) + self.conv_pre = nn.Conv1d( + config.flow_size, + config.upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ) + + self.upsampler = nn.ModuleList() + for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)): + self.upsampler.append( + nn.ConvTranspose1d( + config.upsample_initial_channel // (2**i), + config.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel_size, + stride=upsample_rate, + padding=(kernel_size - upsample_rate) // 2, + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.upsampler)): + channels = config.upsample_initial_channel // (2 ** (i + 1)) + for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes): + self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope)) + + self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False) + + if config.speaker_embedding_size != 0: + self.cond = nn.Conv1d(config.speaker_embedding_size, config.upsample_initial_channel, 1) + + def apply_weight_norm(self): + for layer in self.upsampler: + nn.utils.weight_norm(layer) + for layer in self.resblocks: + layer.apply_weight_norm() + + def remove_weight_norm(self): + for layer in self.upsampler: + nn.utils.remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + + def forward( + self, spectrogram: torch.FloatTensor, global_conditioning: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + r""" + Converts a spectrogram into a speech waveform. + + Args: + spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`): + Tensor containing the spectrograms. + global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*): + Tensor containing speaker embeddings, for multispeaker models. + + Returns: + `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform. 
+ """ + hidden_states = self.conv_pre(spectrogram) + + if global_conditioning is not None: + hidden_states = hidden_states + self.cond(global_conditioning) + + for i in range(self.num_upsamples): + hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope) + hidden_states = self.upsampler[i](hidden_states) + + res_state = self.resblocks[i * self.num_kernels](hidden_states) + for j in range(1, self.num_kernels): + res_state += self.resblocks[i * self.num_kernels + j](hidden_states) + hidden_states = res_state / self.num_kernels + + hidden_states = nn.functional.leaky_relu(hidden_states) + hidden_states = self.conv_post(hidden_states) + waveform = torch.tanh(hidden_states) + return waveform + + +class VitsResidualCouplingLayer(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.half_channels = config.flow_size // 2 + + self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1) + self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers) + self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1) + hidden_states = self.conv_pre(first_half) * padding_mask + hidden_states = self.wavenet(hidden_states, padding_mask, global_conditioning) + mean = self.conv_post(hidden_states) * padding_mask + log_stddev = torch.zeros_like(mean) + + if not reverse: + second_half = mean + second_half * torch.exp(log_stddev) * padding_mask + outputs = torch.cat([first_half, second_half], dim=1) + log_determinant = torch.sum(log_stddev, [1, 2]) + return outputs, log_determinant + else: + second_half = (second_half - mean) * torch.exp(-log_stddev) * padding_mask + outputs = torch.cat([first_half, second_half], dim=1) + return outputs, None + + +class VitsResidualCouplingBlock(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.flows = nn.ModuleList() + for _ in range(config.prior_encoder_num_flows): + self.flows.append(VitsResidualCouplingLayer(config)) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + if not reverse: + for flow in self.flows: + inputs, _ = flow(inputs, padding_mask, global_conditioning) + inputs = torch.flip(inputs, [1]) + else: + for flow in reversed(self.flows): + inputs = torch.flip(inputs, [1]) + inputs, _ = flow(inputs, padding_mask, global_conditioning, reverse=True) + return inputs + + +class VitsDilatedDepthSeparableConv(nn.Module): + def __init__(self, config: VitsConfig, dropout_rate=0.0): + super().__init__() + kernel_size = config.duration_predictor_kernel_size + channels = config.hidden_size + self.num_layers = config.depth_separable_num_layers + + self.dropout = nn.Dropout(dropout_rate) + self.convs_dilated = nn.ModuleList() + self.convs_pointwise = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(self.num_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_dilated.append( + nn.Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_pointwise.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(nn.LayerNorm(channels)) + self.norms_2.append(nn.LayerNorm(channels)) + + def forward(self, inputs, padding_mask, global_conditioning=None): + 
if global_conditioning is not None: + inputs = inputs + global_conditioning + + for i in range(self.num_layers): + hidden_states = self.convs_dilated[i](inputs * padding_mask) + hidden_states = self.norms_1[i](hidden_states.transpose(1, -1)).transpose(1, -1) + hidden_states = nn.functional.gelu(hidden_states) + hidden_states = self.convs_pointwise[i](hidden_states) + hidden_states = self.norms_2[i](hidden_states.transpose(1, -1)).transpose(1, -1) + hidden_states = nn.functional.gelu(hidden_states) + hidden_states = self.dropout(hidden_states) + inputs = inputs + hidden_states + + return inputs * padding_mask + + +class VitsConvFlow(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.filter_channels = config.hidden_size + self.half_channels = config.depth_separable_channels // 2 + self.num_bins = config.duration_predictor_flow_bins + self.tail_bound = config.duration_predictor_tail_bound + + self.conv_pre = nn.Conv1d(self.half_channels, self.filter_channels, 1) + self.conv_dds = VitsDilatedDepthSeparableConv(config) + self.conv_proj = nn.Conv1d(self.filter_channels, self.half_channels * (self.num_bins * 3 - 1), 1) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1) + + hidden_states = self.conv_pre(first_half) + hidden_states = self.conv_dds(hidden_states, padding_mask, global_conditioning) + hidden_states = self.conv_proj(hidden_states) * padding_mask + + batch_size, channels, length = first_half.shape + hidden_states = hidden_states.reshape(batch_size, channels, -1, length).permute(0, 1, 3, 2) + + unnormalized_widths = hidden_states[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = hidden_states[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_derivatives = hidden_states[..., 2 * self.num_bins :] + + second_half, log_abs_det = _unconstrained_rational_quadratic_spline( + second_half, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse=reverse, + tail_bound=self.tail_bound, + ) + + outputs = torch.cat([first_half, second_half], dim=1) * padding_mask + if not reverse: + log_determinant = torch.sum(log_abs_det * padding_mask, [1, 2]) + return outputs, log_determinant + else: + return outputs, None + + +class VitsElementwiseAffine(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.channels = config.depth_separable_channels + self.translate = nn.Parameter(torch.zeros(self.channels, 1)) + self.log_scale = nn.Parameter(torch.zeros(self.channels, 1)) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + if not reverse: + outputs = self.translate + torch.exp(self.log_scale) * inputs + outputs = outputs * padding_mask + log_determinant = torch.sum(self.log_scale * padding_mask, [1, 2]) + return outputs, log_determinant + else: + outputs = (inputs - self.translate) * torch.exp(-self.log_scale) * padding_mask + return outputs, None + + +class VitsStochasticDurationPredictor(nn.Module): + def __init__(self, config): + super().__init__() + embed_dim = config.speaker_embedding_size + filter_channels = config.hidden_size + + self.conv_pre = nn.Conv1d(filter_channels, filter_channels, 1) + self.conv_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.conv_dds = VitsDilatedDepthSeparableConv( + config, + dropout_rate=config.duration_predictor_dropout, + ) + + if embed_dim != 0: + self.cond 
= nn.Conv1d(embed_dim, filter_channels, 1) + + self.flows = nn.ModuleList() + self.flows.append(VitsElementwiseAffine(config)) + for _ in range(config.duration_predictor_num_flows): + self.flows.append(VitsConvFlow(config)) + + self.post_conv_pre = nn.Conv1d(1, filter_channels, 1) + self.post_conv_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_conv_dds = VitsDilatedDepthSeparableConv( + config, + dropout_rate=config.duration_predictor_dropout, + ) + + self.post_flows = nn.ModuleList() + self.post_flows.append(VitsElementwiseAffine(config)) + for _ in range(config.duration_predictor_num_flows): + self.post_flows.append(VitsConvFlow(config)) + + def forward(self, inputs, padding_mask, global_conditioning=None, durations=None, reverse=False, noise_scale=1.0): + inputs = torch.detach(inputs) + inputs = self.conv_pre(inputs) + + if global_conditioning is not None: + global_conditioning = torch.detach(global_conditioning) + inputs = inputs + self.cond(global_conditioning) + + inputs = self.conv_dds(inputs, padding_mask) + inputs = self.conv_proj(inputs) * padding_mask + + if not reverse: + hidden_states = self.post_conv_pre(durations) + hidden_states = self.post_conv_dds(hidden_states, padding_mask) + hidden_states = self.post_conv_proj(hidden_states) * padding_mask + + random_posterior = ( + torch.randn(durations.size(0), 2, durations.size(2)).to(device=inputs.device, dtype=inputs.dtype) + * padding_mask + ) + log_determinant_posterior_sum = 0 + latents_posterior = random_posterior + for flow in self.post_flows: + latents_posterior, log_determinant = flow( + latents_posterior, padding_mask, global_conditioning=inputs + hidden_states + ) + latents_posterior = torch.flip(latents_posterior, [1]) + log_determinant_posterior_sum += log_determinant + + first_half, second_half = torch.split(latents_posterior, [1, 1], dim=1) + + log_determinant_posterior_sum += torch.sum( + (nn.functional.logsigmoid(first_half) + nn.functional.logsigmoid(-first_half)) * padding_mask, [1, 2] + ) + logq = ( + torch.sum(-0.5 * (math.log(2 * math.pi) + (random_posterior**2)) * padding_mask, [1, 2]) + - log_determinant_posterior_sum + ) + + first_half = (durations - torch.sigmoid(first_half)) * padding_mask + first_half = torch.log(torch.clamp_min(first_half, 1e-5)) * padding_mask + log_determinant_sum = torch.sum(-first_half, [1, 2]) + + latents = torch.cat([first_half, second_half], dim=1) + for flow in self.flows: + latents, log_determinant = flow(latents, padding_mask, global_conditioning=inputs) + latents = torch.flip(latents, [1]) + log_determinant_sum += log_determinant + + nll = torch.sum(0.5 * (math.log(2 * math.pi) + (latents**2)) * padding_mask, [1, 2]) - log_determinant_sum + return nll + logq + else: + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + + randn_like_latents = np.load(randn_like_latents_path) + randn_like_latents = torch.from_numpy(randn_like_latents).to(device=inputs.device, dtype=inputs.dtype) + latents = ( + randn_like_latents + * noise_scale + ) + + for flow in flows: + latents = torch.flip(latents, [1]) + latents, _ = flow(latents, padding_mask, global_conditioning=inputs, reverse=True) + + log_duration, _ = torch.split(latents, [1, 1], dim=1) + return log_duration + + +class VitsDurationPredictor(nn.Module): + def __init__(self, config): + super().__init__() + kernel_size = config.duration_predictor_kernel_size + filter_channels = config.duration_predictor_filter_channels + + self.dropout = 
nn.Dropout(config.duration_predictor_dropout) + self.conv_1 = nn.Conv1d(config.hidden_size, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_1 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_2 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if config.speaker_embedding_size != 0: + self.cond = nn.Conv1d(config.speaker_embedding_size, config.hidden_size, 1) + + def forward(self, inputs, padding_mask, global_conditioning=None): + inputs = torch.detach(inputs) + + if global_conditioning is not None: + global_conditioning = torch.detach(global_conditioning) + inputs = inputs + self.cond(global_conditioning) + + inputs = self.conv_1(inputs * padding_mask) + inputs = torch.relu(inputs) + inputs = self.norm_1(inputs.transpose(1, -1)).transpose(1, -1) + inputs = self.dropout(inputs) + + inputs = self.conv_2(inputs * padding_mask) + inputs = torch.relu(inputs) + inputs = self.norm_2(inputs.transpose(1, -1)).transpose(1, -1) + inputs = self.dropout(inputs) + + inputs = self.proj(inputs * padding_mask) + return inputs * padding_mask + + +class VitsAttention(nn.Module): + """Multi-headed attention with relative positional representation.""" + + def __init__(self, config: VitsConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.dropout = config.attention_dropout + self.window_size = config.window_size + + self.head_dim = self.embed_dim // self.num_heads + self.scaling = self.head_dim**-0.5 + + if (self.head_dim * self.num_heads) != self.embed_dim: + raise ValueError( + f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.embed_dim}" + f" and `num_attention_heads`: {self.num_heads})." 
+ ) + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + + if self.window_size: + self.emb_rel_k = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling) + self.emb_rel_v = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if self.window_size is not None: + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len) + relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1)) + rel_pos_bias = self._relative_position_to_absolute_position(relative_logits) + attn_weights += rel_pos_bias + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + if self.window_size is not None: + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len) + relative_weights = self._absolute_position_to_relative_position(attn_probs) + rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings) + attn_output += rel_pos_bias + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + def _get_relative_embeddings(self, relative_embeddings, length): + pad_length = max(length - (self.window_size + 1), 0) + if pad_length > 0: + relative_embeddings = nn.functional.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0]) + + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + return relative_embeddings[:, slice_start_position:slice_end_position] + + def _relative_position_to_absolute_position(self, x): + batch_heads, length, _ = x.size() + + # Concat columns of pad to shift from relative to absolute indexing. + x = nn.functional.pad(x, [0, 1, 0, 0, 0, 0]) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch_heads, length * 2 * length]) + x_flat = nn.functional.pad(x_flat, [0, length - 1, 0, 0]) + + # Reshape and slice out the padded elements. 
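# [Shapes for reference.] x enters as (batch*heads, length, 2*length - 1)
# relative-position logits; after the pad / flatten / pad / view / slice steps it
# leaves as (batch*heads, length, length) scores indexed by absolute position.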
+ x_final = x_flat.view([batch_heads, length + 1, 2 * length - 1]) + x_final = x_final[:, :length, length - 1 :] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch_heads, length, _ = x.size() + + # Pad along column + x = nn.functional.pad(x, [0, length - 1, 0, 0, 0, 0]) + x_flat = x.view([batch_heads, length * (2 * length - 1)]) + + # Add 0's in the beginning that will skew the elements after reshape + x_flat = nn.functional.pad(x_flat, [length, 0, 0, 0]) + x_final = x_flat.view([batch_heads, length, 2 * length])[:, :, 1:] + return x_final + + +class VitsFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size) + self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size) + self.dropout = nn.Dropout(config.activation_dropout) + + if isinstance(config.hidden_act, str): + self.act_fn = ACT2FN[config.hidden_act] + else: + self.act_fn = config.hidden_act + + if config.ffn_kernel_size > 1: + pad_left = (config.ffn_kernel_size - 1) // 2 + pad_right = config.ffn_kernel_size // 2 + self.padding = [pad_left, pad_right, 0, 0, 0, 0] + else: + self.padding = None + + def forward(self, hidden_states, padding_mask): + hidden_states = hidden_states.permute(0, 2, 1) + padding_mask = padding_mask.permute(0, 2, 1) + + hidden_states = hidden_states * padding_mask + if self.padding is not None: + hidden_states = nn.functional.pad(hidden_states, self.padding) + + hidden_states = self.conv_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states * padding_mask + if self.padding is not None: + hidden_states = nn.functional.pad(hidden_states, self.padding) + + hidden_states = self.conv_2(hidden_states) + hidden_states = hidden_states * padding_mask + + hidden_states = hidden_states.permute(0, 2, 1) + return hidden_states + + +class VitsEncoderLayer(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.attention = VitsAttention(config) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = VitsFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + padding_mask: torch.FloatTensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states, attn_weights = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(residual + hidden_states) + + residual = hidden_states + hidden_states = self.feed_forward(hidden_states, padding_mask) + hidden_states = self.dropout(hidden_states) + hidden_states = self.final_layer_norm(residual + hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class VitsEncoder(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + self.layerdrop = config.layerdrop + + def forward( + self, + hidden_states: torch.FloatTensor, + padding_mask: torch.FloatTensor, + 
attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + hidden_states = hidden_states * padding_mask + + deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = self.training and (dropout_probability < self.layerdrop) + if not skip_the_layer or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + padding_mask, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask=attention_mask, + padding_mask=padding_mask, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = hidden_states * padding_mask + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class VitsTextEncoder(nn.Module): + """ + Transformer encoder that uses relative positional representation instead of absolute positional encoding. 
+ """ + + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.encoder = VitsEncoder(config) + self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.Tensor, + padding_mask: torch.FloatTensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], VitsTextEncoderOutput]: + hidden_states = self.embed_tokens(input_ids) * math.sqrt(self.config.hidden_size) + + encoder_outputs = self.encoder( + hidden_states=hidden_states, + padding_mask=padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] if not return_dict else encoder_outputs.last_hidden_state + + stats = self.project(last_hidden_state.transpose(1, 2)).transpose(1, 2) * padding_mask + prior_means, prior_log_variances = torch.split(stats, self.config.flow_size, dim=2) + + if not return_dict: + outputs = (last_hidden_state, prior_means, prior_log_variances) + encoder_outputs[1:] + return outputs + + return VitsTextEncoderOutput( + last_hidden_state=last_hidden_state, + prior_means=prior_means, + prior_log_variances=prior_log_variances, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class VitsPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VitsConfig + base_model_prefix = "vits" + main_input_name = "input_ids" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +VITS_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`VitsConfig`]): + Model configuration class with all the parameters of the model. 
Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +VITS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + speaker_id (`int`, *optional*): + Which speaker embedding to use. Only used for multispeaker models. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The complete VITS model, for text-to-speech synthesis.", + VITS_START_DOCSTRING, +) +class VitsModel(VitsPreTrainedModel): + def __init__(self, config: VitsConfig): + super().__init__(config) + self.config = config + self.text_encoder = VitsTextEncoder(config) + self.flow = VitsResidualCouplingBlock(config) + self.decoder = VitsHifiGan(config) + + if config.use_stochastic_duration_prediction: + self.duration_predictor = VitsStochasticDurationPredictor(config) + else: + self.duration_predictor = VitsDurationPredictor(config) + + if config.num_speakers > 1: + self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size) + + # This is used only for training. 
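# [Inference note.] The posterior encoder maps ground-truth spectrograms to flow
# latents for the VAE objective during training; neither forward_encoder nor
# forward_decoder below touches it.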
+ self.posterior_encoder = VitsPosteriorEncoder(config) + + # These parameters control the synthesised speech properties + self.speaking_rate = config.speaking_rate + self.noise_scale = config.noise_scale + self.noise_scale_duration = config.noise_scale_duration + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.text_encoder + + def forward_encoder( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + speaker_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is not None: + input_padding_mask = attention_mask.unsqueeze(-1).float() + else: + input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float() + + if self.config.num_speakers > 1 and speaker_id is not None: + # if not 0 <= speaker_id < self.config.num_speakers: + # raise ValueError(f"Set `speaker_id` in the range 0-{self.config.num_speakers - 1}.") + if isinstance(speaker_id, int): + speaker_id = torch.full(size=(1,), fill_value=speaker_id, device=self.device) + speaker_embeddings = self.embed_speaker(speaker_id).unsqueeze(-1) + else: + speaker_embeddings = None + speaker_embeddings = None + + if labels is not None: + raise NotImplementedError("Training of VITS is not supported yet.") + + text_encoder_output = self.text_encoder( + input_ids=input_ids, + padding_mask=input_padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state + hidden_states = hidden_states.transpose(1, 2) + input_padding_mask = input_padding_mask.transpose(1, 2) + prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means + prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances + + if self.config.use_stochastic_duration_prediction: + log_duration = self.duration_predictor( + hidden_states, + input_padding_mask, + speaker_embeddings, + reverse=True, + noise_scale=self.noise_scale_duration, + ) + else: + log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings) + + return log_duration, input_padding_mask, prior_means, prior_log_variances + + def forward_decoder( + self, + attn: Optional[torch.Tensor] = None, + output_padding_mask: Optional[torch.Tensor] = None, + prior_means: Optional[torch.Tensor] = None, + prior_log_variances: Optional[torch.Tensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + + # Expand prior distribution + speaker_embeddings = None + prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2) + prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2) + + prior_latents = prior_means + torch.randn(prior_means.shape) * torch.exp(prior_log_variances) * self.noise_scale + latents = self.flow(prior_latents, 
output_padding_mask, speaker_embeddings, reverse=True) + + spectrogram = latents * output_padding_mask + + waveform = self.decoder(spectrogram, speaker_embeddings) + waveform = waveform.squeeze(1) + return waveform + + def forward(self, *args, **kwargs) -> Union[Tuple[Any], VitsModelOutput]: + if len(args) == 2: + return self.forward_encoder(*args, **kwargs) + else: + return self.forward_decoder(*args, **kwargs) diff --git a/examples/wav2vec2/cpp/main.cc b/examples/wav2vec2/cpp/main.cc index 1dfc29b1..dcfbf766 100644 --- a/examples/wav2vec2/cpp/main.cc +++ b/examples/wav2vec2/cpp/main.cc @@ -56,8 +56,28 @@ int main(int argc, char **argv) printf("read audio fail! ret=%d audio_path=%s\n", ret, audio_path); goto out; } + + if (audio.num_channels == 2) + { + ret = convert_channels(&audio); + if (ret != 0) + { + printf("convert channels fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + if (audio.sample_rate != SAMPLE_RATE) + { + ret = resample_audio(&audio, audio.sample_rate, SAMPLE_RATE); + if (ret != 0) + { + printf("resample audio fail! ret=%d\n", ret, audio_path); + goto out; + } + } timer.tok(); - timer.print_time("read_audio"); + timer.print_time("read_audio & convert_channels & resample_audio"); timer.tik(); audio_preprocess(&audio, audio_data); diff --git a/examples/wav2vec2/python/wav2vec2.py b/examples/wav2vec2/python/wav2vec2.py index a58b40a3..4546073b 100644 --- a/examples/wav2vec2/python/wav2vec2.py +++ b/examples/wav2vec2/python/wav2vec2.py @@ -3,6 +3,7 @@ import argparse import soundfile as sf import onnxruntime +import scipy CHUNK_LENGTH = 20 # 20 seconds MAX_N_SAMPLES = CHUNK_LENGTH * 16000 @@ -11,6 +12,19 @@ 11: "H", 12: "S", 13: "R", 14: "D", 15: "L", 16: "U", 17: "M", 18: "W", 19: "C", 20: "F", 21: "G", 22: "Y", 23: "P", 24: "B", 25: "V", 26: "K", 27: "'", 28: "X", 29: "J", 30: "Q", 31: "Z"} +def ensure_sample_rate(waveform, original_sample_rate, desired_sample_rate=16000): + if original_sample_rate != desired_sample_rate: + print("resample_audio: {} HZ -> {} HZ".format(original_sample_rate, desired_sample_rate)) + desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate)) + waveform = scipy.signal.resample(waveform, desired_length) + return waveform, desired_sample_rate + +def ensure_channels(waveform, original_channels, desired_channels=1): + if original_channels != desired_channels: + print("convert_channels: {} -> {}".format(original_channels, desired_channels)) + waveform = np.mean(waveform, axis=1) + return waveform, desired_channels + def init_model(model_path, target=None, device_id=None): if model_path.endswith(".rknn"): # Create RKNN object @@ -95,16 +109,16 @@ def post_process(output): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Wav2vec2 Python Demo', add_help=True) # basic params - parser.add_argument('--model_path', type=str, required=True, - help='model path, could be .rknn file') - parser.add_argument('--target', type=str, - default='rk3588', help='target RKNPU platform') - parser.add_argument('--device_id', type=str, - default=None, help='device id') + parser.add_argument('--model_path', type=str, required=True, help='model path, could be .rknn file') + parser.add_argument('--target', type=str, default='rk3588', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, default=None, help='device id') args = parser.parse_args() # Set inputs audio_data, sample_rate = sf.read("../model/test.wav") + channels = audio_data.ndim + audio_data, channels = 
ensure_channels(audio_data, channels) + audio_data, sample_rate = ensure_sample_rate(audio_data, sample_rate) audio_array = np.array(audio_data, dtype=np.float32) audio_array = pre_process(audio_array, MAX_N_SAMPLES) audio_array = np.expand_dims(audio_array, axis=0) diff --git a/examples/whisper/cpp/main.cc b/examples/whisper/cpp/main.cc index 76c74231..2d3ead40 100644 --- a/examples/whisper/cpp/main.cc +++ b/examples/whisper/cpp/main.cc @@ -76,24 +76,34 @@ int main(int argc, char **argv) memset(&audio, 0, sizeof(audio_buffer_t)); timer.tik(); - ret = init_whisper_model(encoder_path, &rknn_app_ctx.encoder_context); + ret = read_audio(audio_path, &audio); if (ret != 0) { - printf("init_whisper_model fail! ret=%d encoder_path=%s\n", ret, encoder_path); + printf("read audio fail! ret=%d audio_path=%s\n", ret, audio_path); goto out; } - timer.tok(); - timer.print_time("init_whisper_encoder_model"); - timer.tik(); - ret = init_whisper_model(decoder_path, &rknn_app_ctx.decoder_context); - if (ret != 0) + if (audio.num_channels == 2) { - printf("init_whisper_model fail! ret=%d decoder_path=%s\n", ret, decoder_path); - goto out; + ret = convert_channels(&audio); + if (ret != 0) + { + printf("convert channels fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + if (audio.sample_rate != SAMPLE_RATE) + { + ret = resample_audio(&audio, audio.sample_rate, SAMPLE_RATE); + if (ret != 0) + { + printf("resample audio fail! ret=%d\n", ret, audio_path); + goto out; + } } timer.tok(); - timer.print_time("init_whisper_decoder_model"); + timer.print_time("read_audio & convert_channels & resample_audio"); timer.tik(); ret = read_mel_filters(MEL_FILTERS_PATH, mel_filters, N_MELS * MELS_FILTERS_SIZE); @@ -113,14 +123,24 @@ int main(int argc, char **argv) timer.print_time("read_mel_filters & read_vocab"); timer.tik(); - ret = read_audio(audio_path, &audio); + ret = init_whisper_model(encoder_path, &rknn_app_ctx.encoder_context); if (ret != 0) { - printf("read audio fail! ret=%d audio_path=%s\n", ret, audio_path); + printf("init_whisper_model fail! ret=%d encoder_path=%s\n", ret, encoder_path); goto out; } timer.tok(); - timer.print_time("read_audio"); + timer.print_time("init_whisper_encoder_model"); + + timer.tik(); + ret = init_whisper_model(decoder_path, &rknn_app_ctx.decoder_context); + if (ret != 0) + { + printf("init_whisper_model fail! 
ret=%d decoder_path=%s\n", ret, decoder_path); + goto out; + } + timer.tok(); + timer.print_time("init_whisper_decoder_model"); timer.tik(); audio_preprocess(&audio, mel_filters, audio_data); diff --git a/examples/whisper/python/whisper.py b/examples/whisper/python/whisper.py index 54516404..43d5c9d9 100644 --- a/examples/whisper/python/whisper.py +++ b/examples/whisper/python/whisper.py @@ -5,6 +5,7 @@ import onnxruntime import torch import torch.nn.functional as F +import scipy SAMPLE_RATE = 16000 N_FFT = 400 @@ -14,6 +15,20 @@ MAX_LENGTH = CHUNK_LENGTH * 100 N_MELS = 80 + +def ensure_sample_rate(waveform, original_sample_rate, desired_sample_rate=16000): + if original_sample_rate != desired_sample_rate: + print("resample_audio: {} HZ -> {} HZ".format(original_sample_rate, desired_sample_rate)) + desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate)) + waveform = scipy.signal.resample(waveform, desired_length) + return waveform, desired_sample_rate + +def ensure_channels(waveform, original_channels, desired_channels=1): + if original_channels != desired_channels: + print("convert_channels: {} -> {}".format(original_channels, desired_channels)) + waveform = np.mean(waveform, axis=1) + return waveform, desired_channels + def get_char_index(c): if 'A' <= c <= 'Z': return ord(c) - ord('A') @@ -229,6 +244,9 @@ def load_array_from_file(filename): exit(1) vocab = read_vocab(vocab_path) audio_data, sample_rate = sf.read(args.audio_path) + channels = audio_data.ndim + audio_data, channels = ensure_channels(audio_data, channels) + audio_data, sample_rate = ensure_sample_rate(audio_data, sample_rate) audio_array = np.array(audio_data, dtype=np.float32) audio_array= log_mel_spectrogram(audio_array, N_MELS).numpy() x_mel = pad_or_trim(audio_array) diff --git a/examples/yamnet/cpp/main.cc b/examples/yamnet/cpp/main.cc index d8acc423..29da19b0 100644 --- a/examples/yamnet/cpp/main.cc +++ b/examples/yamnet/cpp/main.cc @@ -50,13 +50,6 @@ int main(int argc, char **argv) memset(&result, 0, sizeof(ResultEntry)); memset(label, 0, sizeof(label)); - ret = init_yamnet_model(model_path, &rknn_app_ctx); - if (ret != 0) - { - printf("init_yamnet_model fail! ret=%d model_path=%s\n", ret, model_path); - goto out; - } - // set data ret = read_label(label); if (ret != 0) @@ -72,6 +65,33 @@ int main(int argc, char **argv) goto out; } + if (audio.num_channels == 2) + { + ret = convert_channels(&audio); + if (ret != 0) + { + printf("convert channels fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + if (audio.sample_rate != SAMPLE_RATE) + { + ret = resample_audio(&audio, audio.sample_rate, SAMPLE_RATE); + if (ret != 0) + { + printf("resample audio fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + ret = init_yamnet_model(model_path, &rknn_app_ctx); + if (ret != 0) + { + printf("init_yamnet_model fail! 
ret=%d model_path=%s\n", ret, model_path); + goto out; + } + timer.tik(); ret = inference_yamnet_model(&rknn_app_ctx, &audio, label, result); if (ret != 0) diff --git a/examples/yamnet/python/yamnet.py b/examples/yamnet/python/yamnet.py index 586daedb..b57b5221 100644 --- a/examples/yamnet/python/yamnet.py +++ b/examples/yamnet/python/yamnet.py @@ -3,10 +3,25 @@ import argparse import soundfile as sf import onnxruntime +import scipy CHUNK_LENGTH = 3 # 3 seconds MAX_N_SAMPLES = CHUNK_LENGTH * 16000 + +def ensure_sample_rate(waveform, original_sample_rate, desired_sample_rate=16000): + if original_sample_rate != desired_sample_rate: + print("resample_audio: {} HZ -> {} HZ".format(original_sample_rate, desired_sample_rate)) + desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate)) + waveform = scipy.signal.resample(waveform, desired_length) + return waveform, desired_sample_rate + +def ensure_channels(waveform, original_channels, desired_channels=1): + if original_channels != desired_channels: + print("convert_channels: {} -> {}".format(original_channels, desired_channels)) + waveform = np.mean(waveform, axis=1) + return waveform, desired_channels + def init_model(model_path, target=None, device_id=None): if model_path.endswith(".rknn"): # Create RKNN object @@ -69,23 +84,23 @@ def read_txt_to_dict(filename): for line in txtfile: line = line.strip().split(' ') key = line[0] - value = ''.join(line[1:]) + value = ' '.join(line[1:]) data_dict[key] = value return data_dict if __name__ == '__main__': parser = argparse.ArgumentParser(description='Yamnet Python Demo', add_help=True) # basic params - parser.add_argument('--model_path', type=str, required=True, - help='model path, could be .rknn/.onnx file') - parser.add_argument('--target', type=str, - default='rk3588', help='target RKNPU platform') - parser.add_argument('--device_id', type=str, - default=None, help='device id') + parser.add_argument('--model_path', type=str, required=True, help='model path, could be .rknn/.onnx file') + parser.add_argument('--target', type=str, default='rk3588', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, default=None, help='device id') args = parser.parse_args() # Set inputs audio_data, sample_rate = sf.read("../model/test.wav") + channels = audio_data.ndim + audio_data, channels = ensure_channels(audio_data, channels) + audio_data, sample_rate = ensure_sample_rate(audio_data, sample_rate) audio_array = np.array(audio_data, dtype=np.float32) audio = pad_or_trim(audio_array.flatten(), MAX_N_SAMPLES) audio = np.expand_dims(audio, 0) diff --git a/examples/yolo11/README.md b/examples/yolo11/README.md new file mode 100644 index 00000000..d1a1de70 --- /dev/null +++ b/examples/yolo11/README.md @@ -0,0 +1,205 @@ +# yolo11 + +## Table of contents + +- [1. Description](#1-description) +- [2. Current Support Platform](#2-current-support-platform) +- [3. Pretrained Model](#3-pretrained-model) +- [4. Convert to RKNN](#4-convert-to-rknn) +- [5. Python Demo](#5-python-demo) +- [6. Android Demo](#6-android-demo) + - [6.1 Compile and Build](#61-compile-and-build) + - [6.2 Push demo files to device](#62-push-demo-files-to-device) + - [6.3 Run demo](#63-run-demo) +- [7. Linux Demo](#7-linux-demo) + - [7.1 Compile and Build](#71-compile-and-build) + - [7.2 Push demo files to device](#72-push-demo-files-to-device) + - [7.3 Run demo](#73-run-demo) +- [8. Expected Results](#8-expected-results) + + + +## 1. 
Description + +The model used in this example comes from the following open source projects: + +https://github.com/airockchip/ultralytics_yolo11 + + + +## 2. Current Support Platform + +RV1103, RV1106, RK3562, RK3566, RK3568, RK3576, RK3588, RV1109, RV1126, RK1808, RK3399PRO + + + +## 3. Pretrained Model + +Download link: + +[./yolo11n.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11n.onnx)
[./yolo11s.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11s.onnx)
[./yolo11m.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11m.onnx) + +Download with shell command: + +``` +cd model +./download_model.sh +``` + +**Note**: For exporting yolo11 onnx models, please refer to [RKOPT_README.zh-CN.md](https://github.com/airockchip/ultralytics_yolo11/blob/main/RKOPT_README.zh-CN.md) / [RKOPT_README.md](https://github.com/airockchip/ultralytics_yolo11/blob/main/RKOPT_README.md) + + +**Note**: The model provided here is an optimized model, which is different from the official original model. Take yolo11n.onnx as an example to show the differences between them. +1. The comparison of their output information is as follows. The left is the official original model, and the right is the optimized model. As shown in the figure, the outputs of the original model are divided into three groups. For example, in the set of outputs ([1,64,80,80],[1,80,80,80],[1,1,80,80]), [1,64,80,80] contains the box coordinates, [1,80,80,80] contains the confidences of the box for each of the 80 categories, and [1,1,80,80] is the sum of the confidences over the 80 categories. + +
+ [Figure: output comparison, official original model (left) vs. optimized model (right)] +
+ +2. Taking the set of outputs ([1,64,80,80],[1,80,80,80],[1,1,80,80]) as an example, we remove the subgraphs behind the two convolution nodes in the model, keep the outputs of these two convolutions ([1,64,80,80],[1,80,80,80]), and add a reducesum+clip branch for calculating the sum of the confidences of the 80 categories ([1,1,80,80]); see the decoding sketch after the figure below. + +
+ [Figure: optimized output head, keeping the two convolution outputs and adding a reducesum+clip branch] +
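**Note**: The sketch below is not part of the demo code; it is a minimal NumPy illustration of how the three per-branch outputs can be decoded, assuming `dfl_len = 16` (so the box tensor has 4x16 = 64 channels) and score tensors that are already sigmoid-activated. The `[1,1,H,W]` reducesum+clip output is only used to reject grid cells early; the box distances are then recovered from the DFL logits by a softmax-weighted expectation, mirroring what `compute_dfl` does in this example's `postprocess.cc`. The function name `decode_branch` is illustrative only.

```python
import numpy as np

def decode_branch(box, score, score_sum, stride, conf_thres=0.25, dfl_len=16):
    """Decode one output branch of the optimized model (illustrative sketch).

    box:       (4*dfl_len, H, W) raw DFL logits
    score:     (num_classes, H, W) per-class confidences in [0, 1]
    score_sum: (1, H, W) reducesum+clip branch, used only as a fast filter
    """
    _, H, W = score.shape
    boxes, confs, class_ids = [], [], []
    for i in range(H):
        for j in range(W):
            # Fast rejection: if the summed confidence is already below the
            # threshold, no single class score can exceed it either.
            if score_sum is not None and score_sum[0, i, j] < conf_thres:
                continue
            cls_conf = score[:, i, j]
            c = int(np.argmax(cls_conf))
            if cls_conf[c] < conf_thres:
                continue
            # DFL: softmax over dfl_len bins per box side, expectation = distance.
            logits = box[:, i, j].reshape(4, dfl_len)
            prob = np.exp(logits - logits.max(axis=1, keepdims=True))
            prob /= prob.sum(axis=1, keepdims=True)
            d = prob @ np.arange(dfl_len)   # distances: left, top, right, bottom
            cx, cy = j + 0.5, i + 0.5       # grid-cell center
            boxes.append([(cx - d[0]) * stride, (cy - d[1]) * stride,
                          (cx + d[2]) * stride, (cy + d[3]) * stride])
            confs.append(float(cls_conf[c]))
            class_ids.append(c)
    return np.array(boxes), np.array(confs), np.array(class_ids)
```

The surviving boxes from the three branches (strides 8, 16, and 32 for a 640x640 input) are concatenated and passed through class-wise NMS, which is what the C++ post-processing in this example does with quantized thresholds.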
+ + +## 4. Convert to RKNN + +*Usage:* + +```shell +cd python +python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> + +# such as: +python convert.py ../model/yolo11n.onnx rk3588 +# output model will be saved as ../model/yolo11.rknn +``` + +*Description:* + +- `<onnx_model>`: Specify ONNX model path. +- `<TARGET_PLATFORM>`: Specify NPU platform name. Such as 'rk3588'. +- `<dtype>(optional)`: Specify as `i8`, `u8` or `fp`. `i8`/`u8` for doing quantization, `fp` for no quantization. Default is `i8`/`u8`. +- `<output_rknn_path>(optional)`: Specify save path for the RKNN model, default save in the same directory as ONNX model with name `yolo11.rknn` + + + +## 5. Python Demo + +*Usage:* + +```shell +cd python +# Inference with PyTorch model or ONNX model +python yolo11.py --model_path <pt_model/onnx_model> --img_show + +# Inference with RKNN model +python yolo11.py --model_path <rknn_model> --target <TARGET_PLATFORM> --img_show +``` + +*Description:* + +- `<TARGET_PLATFORM>`: Specify NPU platform name. Such as 'rk3588'. + +- `<pt_model / onnx_model / rknn_model>`: Specify the model path. + + + +## 6. Android Demo + +**Note: RK1808, RV1109, RV1126 do not support Android.** + +#### 6.1 Compile and Build + +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#android-platform) document to set up a cross-compilation environment and complete the compilation of the C/C++ Demo. +**Note: Please replace the model name with `yolo11`.** + +#### 6.2 Push demo files to device + +With the device connected via USB, push the demo files to the device: + +```shell +adb root +adb remount +adb push install/<TARGET_PLATFORM>_android_<ARCH>/rknn_yolo11_demo/ /data/ +``` + +#### 6.3 Run demo + +```sh +adb shell +cd /data/rknn_yolo11_demo + +export LD_LIBRARY_PATH=./lib +./rknn_yolo11_demo model/yolo11.rknn model/bus.jpg +``` + +- After running, the result is saved as `out.png`. To check the result on the host PC, pull it back with the following command: + + ```sh + adb pull /data/rknn_yolo11_demo/out.png + ``` + +- For the output, refer to [Expected Results](#8-expected-results). + + + +## 7. Linux Demo + +#### 7.1 Compile and Build + +Please refer to the [Compilation_Environment_Setup_Guide](../../docs/Compilation_Environment_Setup_Guide.md#linux-platform) document to set up a cross-compilation environment and complete the compilation of the C/C++ Demo. +**Note: Please replace the model name with `yolo11`.** + +#### 7.2 Push demo files to device + +- If the device is connected via USB, push the demo files to the device: + +```shell +adb push install/<TARGET_PLATFORM>_linux_<ARCH>/rknn_yolo11_demo/ /userdata/ +``` + +- For other boards, use `scp` or other approaches to push all files under `install/<TARGET_PLATFORM>_linux_<ARCH>/rknn_yolo11_demo/` to `userdata`. + +#### 7.3 Run demo + +```sh +adb shell +cd /userdata/rknn_yolo11_demo + +export LD_LIBRARY_PATH=./lib +./rknn_yolo11_demo model/yolo11.rknn model/bus.jpg +``` + +- On RV1106/1103, `LD_LIBRARY_PATH` must be specified as an absolute path. Such as + + ```sh + export LD_LIBRARY_PATH=/userdata/rknn_yolo11_demo/lib + ``` + +- After running, the result is saved as `out.png`. To check the result on the host PC, pull it back with the following command: + + ``` + adb pull /userdata/rknn_yolo11_demo/out.png + ``` + +- For the output, refer to [Expected Results](#8-expected-results). + + + +## 8. 
Expected Results + +This example will print the labels and corresponding scores of the test image detect results, as follows: + +``` +person @ (108 236 224 535) 0.898 +person @ (212 240 284 509) 0.847 +person @ (476 229 559 520) 0.827 +person @ (79 358 118 516) 0.396 +bus @ (91 136 554 440) 0.948 +``` + + + +- Note: Different platforms, different versions of tools and drivers may have slightly different results. diff --git a/examples/yolo11/cpp/CMakeLists.txt b/examples/yolo11/cpp/CMakeLists.txt new file mode 100644 index 00000000..afc65720 --- /dev/null +++ b/examples/yolo11/cpp/CMakeLists.txt @@ -0,0 +1,108 @@ +cmake_minimum_required(VERSION 3.10) + +project(rknn_yolo11_demo) + +if (ENABLE_ASAN) + message(STATUS "BUILD WITH ADDRESS SANITIZER") + set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") +endif () + +set(rknpu_yolo11_file rknpu2/yolo11.cc) + +if (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103") + add_definitions(-DRV1106_1103) + set(rknpu_yolo11_file rknpu2/yolo11_rv1106_1103.cc) + #dma + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/allocator/dma) +endif() + +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + add_definitions(-DRKNPU1) + set(rknpu_yolo11_file rknpu1/yolo11.cc) +endif() + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) + +set(CMAKE_INSTALL_RPATH "$ORIGIN/../lib") + +file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) + +add_executable(${PROJECT_NAME} + main.cc + postprocess.cc + ${rknpu_yolo11_file} +) + +target_link_libraries(${PROJECT_NAME} + imageutils + fileutils + imagedrawing + ${LIBRKNNRT} + dl +) + +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_libraries(${PROJECT_NAME} + log +) +endif() + +message(STATUS "!!!!!!!!!!!CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${PROJECT_NAME} Threads::Threads) +endif() + +target_include_directories(${PROJECT_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${LIBRKNNRT_INCLUDES} +) + + +# Currently zero copy only supports rknpu2, v1103/rv1103b/rv1106 supports zero copy by default +if (NOT (TARGET_SOC STREQUAL "rv1106" OR TARGET_SOC STREQUAL "rv1103" OR TARGET_SOC STREQUAL "rk1808" + OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126" OR TARGET_SOC STREQUAL "rv1103b")) + add_executable(${PROJECT_NAME}_zero_copy + main.cc + postprocess.cc + rknpu2/yolo11_zero_copy.cc + ) + + target_compile_definitions(${PROJECT_NAME}_zero_copy PRIVATE ZERO_COPY) + + target_link_libraries(${PROJECT_NAME}_zero_copy + imageutils + fileutils + imagedrawing + ${LIBRKNNRT} + dl + ) + + if (CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_libraries(${PROJECT_NAME}_zero_copy + log + ) + endif() + + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${PROJECT_NAME}_zero_copy Threads::Threads) + endif() + + target_include_directories(${PROJECT_NAME}_zero_copy PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${LIBRKNNRT_INCLUDES} + ) + install(TARGETS ${PROJECT_NAME}_zero_copy DESTINATION .) 
+endif() + +install(TARGETS ${PROJECT_NAME} DESTINATION .) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/bus.jpg DESTINATION model) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/coco_80_labels_list.txt DESTINATION model) +file(GLOB RKNN_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../model/*.rknn") +install(FILES ${RKNN_FILES} DESTINATION model) \ No newline at end of file diff --git a/examples/yolo11/cpp/main.cc b/examples/yolo11/cpp/main.cc new file mode 100644 index 00000000..0b82b6ac --- /dev/null +++ b/examples/yolo11/cpp/main.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include +#include +#include +#include + +#include "yolo11.h" +#include "image_utils.h" +#include "file_utils.h" +#include "image_drawing.h" + +#if defined(RV1106_1103) + #include "dma_alloc.hpp" +#endif + +/*------------------------------------------- + Main Function +-------------------------------------------*/ +int main(int argc, char **argv) +{ + if (argc != 3) + { + printf("%s \n", argv[0]); + return -1; + } + + const char *model_path = argv[1]; + const char *image_path = argv[2]; + + int ret; + rknn_app_context_t rknn_app_ctx; + memset(&rknn_app_ctx, 0, sizeof(rknn_app_context_t)); + + init_post_process(); + + ret = init_yolo11_model(model_path, &rknn_app_ctx); + if (ret != 0) + { + printf("init_yolo11_model fail! ret=%d model_path=%s\n", ret, model_path); + goto out; + } + + image_buffer_t src_image; + memset(&src_image, 0, sizeof(image_buffer_t)); + ret = read_image(image_path, &src_image); + +#if defined(RV1106_1103) + //RV1106 rga requires that input and output bufs are memory allocated by dma + ret = dma_buf_alloc(RV1106_CMA_HEAP_PATH, src_image.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + (void **) & (rknn_app_ctx.img_dma_buf.dma_buf_virt_addr)); + memcpy(rknn_app_ctx.img_dma_buf.dma_buf_virt_addr, src_image.virt_addr, src_image.size); + dma_sync_cpu_to_device(rknn_app_ctx.img_dma_buf.dma_buf_fd); + free(src_image.virt_addr); + src_image.virt_addr = (unsigned char *)rknn_app_ctx.img_dma_buf.dma_buf_virt_addr; + src_image.fd = rknn_app_ctx.img_dma_buf.dma_buf_fd; + rknn_app_ctx.img_dma_buf.size = src_image.size; +#endif + + if (ret != 0) + { + printf("read image fail! ret=%d image_path=%s\n", ret, image_path); + goto out; + } + + object_detect_result_list od_results; + + ret = inference_yolo11_model(&rknn_app_ctx, &src_image, &od_results); + if (ret != 0) + { + printf("init_yolo11_model fail! 
ret=%d\n", ret); + goto out; + } + + // 画框和概率 + char text[256]; + for (int i = 0; i < od_results.count; i++) + { + object_detect_result *det_result = &(od_results.results[i]); + printf("%s @ (%d %d %d %d) %.3f\n", coco_cls_to_name(det_result->cls_id), + det_result->box.left, det_result->box.top, + det_result->box.right, det_result->box.bottom, + det_result->prop); + int x1 = det_result->box.left; + int y1 = det_result->box.top; + int x2 = det_result->box.right; + int y2 = det_result->box.bottom; + + draw_rectangle(&src_image, x1, y1, x2 - x1, y2 - y1, COLOR_BLUE, 3); + + sprintf(text, "%s %.1f%%", coco_cls_to_name(det_result->cls_id), det_result->prop * 100); + draw_text(&src_image, text, x1, y1 - 20, COLOR_RED, 10); + } + + write_image("out.png", &src_image); + +out: + deinit_post_process(); + + ret = release_yolo11_model(&rknn_app_ctx); + if (ret != 0) + { + printf("release_yolo11_model fail! ret=%d\n", ret); + } + + if (src_image.virt_addr != NULL) + { +#if defined(RV1106_1103) + dma_buf_free(rknn_app_ctx.img_dma_buf.size, &rknn_app_ctx.img_dma_buf.dma_buf_fd, + rknn_app_ctx.img_dma_buf.dma_buf_virt_addr); +#else + free(src_image.virt_addr); +#endif + } + + return 0; +} diff --git a/examples/yolo11/cpp/postprocess.cc b/examples/yolo11/cpp/postprocess.cc new file mode 100644 index 00000000..f8ab8247 --- /dev/null +++ b/examples/yolo11/cpp/postprocess.cc @@ -0,0 +1,696 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "yolo11.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#define LABEL_NALE_TXT_PATH "./model/coco_80_labels_list.txt" + +static char *labels[OBJ_CLASS_NUM]; + +inline static int clamp(float val, int min, int max) { return val > min ? (val < max ? 
val : max) : min; } + +static char *readLine(FILE *fp, char *buffer, int *len) +{ + int ch; + int i = 0; + size_t buff_len = 0; + + buffer = (char *)malloc(buff_len + 1); + if (!buffer) + return NULL; // Out of memory + + while ((ch = fgetc(fp)) != '\n' && ch != EOF) + { + buff_len++; + void *tmp = realloc(buffer, buff_len + 1); + if (tmp == NULL) + { + free(buffer); + return NULL; // Out of memory + } + buffer = (char *)tmp; + + buffer[i] = (char)ch; + i++; + } + buffer[i] = '\0'; + + *len = buff_len; + + // Detect end + if (ch == EOF && (i == 0 || ferror(fp))) + { + free(buffer); + return NULL; + } + return buffer; +} + +static int readLines(const char *fileName, char *lines[], int max_line) +{ + FILE *file = fopen(fileName, "r"); + char *s; + int i = 0; + int n = 0; + + if (file == NULL) + { + printf("Open %s fail!\n", fileName); + return -1; + } + + while ((s = readLine(file, s, &n)) != NULL) + { + lines[i++] = s; + if (i >= max_line) + break; + } + fclose(file); + return i; +} + +static int loadLabelName(const char *locationFilename, char *label[]) +{ + printf("load lable %s\n", locationFilename); + readLines(locationFilename, label, OBJ_CLASS_NUM); + return 0; +} + +static float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, + float ymax1) +{ + float w = fmax(0.f, fmin(xmax0, xmax1) - fmax(xmin0, xmin1) + 1.0); + float h = fmax(0.f, fmin(ymax0, ymax1) - fmax(ymin0, ymin1) + 1.0); + float i = w * h; + float u = (xmax0 - xmin0 + 1.0) * (ymax0 - ymin0 + 1.0) + (xmax1 - xmin1 + 1.0) * (ymax1 - ymin1 + 1.0) - i; + return u <= 0.f ? 0.f : (i / u); +} + +static int nms(int validCount, std::vector &outputLocations, std::vector classIds, std::vector &order, + int filterId, float threshold) +{ + for (int i = 0; i < validCount; ++i) + { + int n = order[i]; + if (n == -1 || classIds[n] != filterId) + { + continue; + } + for (int j = i + 1; j < validCount; ++j) + { + int m = order[j]; + if (m == -1 || classIds[m] != filterId) + { + continue; + } + float xmin0 = outputLocations[n * 4 + 0]; + float ymin0 = outputLocations[n * 4 + 1]; + float xmax0 = outputLocations[n * 4 + 0] + outputLocations[n * 4 + 2]; + float ymax0 = outputLocations[n * 4 + 1] + outputLocations[n * 4 + 3]; + + float xmin1 = outputLocations[m * 4 + 0]; + float ymin1 = outputLocations[m * 4 + 1]; + float xmax1 = outputLocations[m * 4 + 0] + outputLocations[m * 4 + 2]; + float ymax1 = outputLocations[m * 4 + 1] + outputLocations[m * 4 + 3]; + + float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0, xmin1, ymin1, xmax1, ymax1); + + if (iou > threshold) + { + order[j] = -1; + } + } + } + return 0; +} + +static int quick_sort_indice_inverse(std::vector &input, int left, int right, std::vector &indices) +{ + float key; + int key_index; + int low = left; + int high = right; + if (left < right) + { + key_index = indices[left]; + key = input[left]; + while (low < high) + { + while (low < high && input[high] <= key) + { + high--; + } + input[low] = input[high]; + indices[low] = indices[high]; + while (low < high && input[low] >= key) + { + low++; + } + input[high] = input[low]; + indices[high] = indices[low]; + } + input[low] = key; + indices[low] = key_index; + quick_sort_indice_inverse(input, left, low - 1, indices); + quick_sort_indice_inverse(input, low + 1, right, indices); + } + return low; +} + +static float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } + +static float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } + +inline static int32_t 
__clip(float val, float min, float max) +{ + float f = val <= min ? min : (val >= max ? max : val); + return f; +} + +static int8_t qnt_f32_to_affine(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + int8_t res = (int8_t)__clip(dst_val, -128, 127); + return res; +} + +static uint8_t qnt_f32_to_affine_u8(float f32, int32_t zp, float scale) +{ + float dst_val = (f32 / scale) + zp; + uint8_t res = (uint8_t)__clip(dst_val, 0, 255); + return res; +} + +static float deqnt_affine_to_f32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } + +static float deqnt_affine_u8_to_f32(uint8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } + +static void compute_dfl(float* tensor, int dfl_len, float* box){ + for (int b=0; b<4; b++){ + float exp_t[dfl_len]; + float exp_sum=0; + float acc_sum=0; + for (int i=0; i< dfl_len; i++){ + exp_t[i] = exp(tensor[i+b*dfl_len]); + exp_sum += exp_t[i]; + } + + for (int i=0; i< dfl_len; i++){ + acc_sum += exp_t[i]/exp_sum *i; + } + box[b] = acc_sum; + } +} + +static int process_u8(uint8_t *box_tensor, int32_t box_zp, float box_scale, + uint8_t *score_tensor, int32_t score_zp, float score_scale, + uint8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + uint8_t score_thres_u8 = qnt_f32_to_affine_u8(threshold, score_zp, score_scale); + uint8_t score_sum_thres_u8 = qnt_f32_to_affine_u8(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i * grid_w + j; + int max_class_id = -1; + + // Use score sum to quickly filter + if (score_sum_tensor != nullptr) + { + if (score_sum_tensor[offset] < score_sum_thres_u8) + { + continue; + } + } + + uint8_t max_score = -score_zp; + for (int c = 0; c < OBJ_CLASS_NUM; c++) + { + if ((score_tensor[offset] > score_thres_u8) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score > score_thres_u8) + { + offset = i * grid_w + j; + float box[4]; + float before_dfl[dfl_len * 4]; + for (int k = 0; k < dfl_len * 4; k++) + { + before_dfl[k] = deqnt_affine_u8_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_u8_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount++; + } + } + } + return validCount; +} + +static int process_i8(int8_t *box_tensor, int32_t box_zp, float box_scale, + int8_t *score_tensor, int32_t score_zp, float score_scale, + int8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + int8_t score_thres_i8 = qnt_f32_to_affine(threshold, score_zp, score_scale); + int8_t score_sum_thres_i8 = 
qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i* grid_w + j; + int max_class_id = -1; + + // 通过 score sum 起到快速过滤的作用 + if (score_sum_tensor != nullptr){ + if (score_sum_tensor[offset] < score_sum_thres_i8){ + continue; + } + } + + int8_t max_score = -score_zp; + for (int c= 0; c< OBJ_CLASS_NUM; c++){ + if ((score_tensor[offset] > score_thres_i8) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score> score_thres_i8){ + offset = i* grid_w + j; + float box[4]; + float before_dfl[dfl_len*4]; + for (int k=0; k< dfl_len*4; k++){ + before_dfl[k] = deqnt_affine_to_f32(box_tensor[offset], box_zp, box_scale); + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1,y1,x2,y2,w,h; + x1 = (-box[0] + j + 0.5)*stride; + y1 = (-box[1] + i + 0.5)*stride; + x2 = (box[2] + j + 0.5)*stride; + y2 = (box[3] + i + 0.5)*stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount ++; + } + } + } + return validCount; +} + +static int process_fp32(float *box_tensor, float *score_tensor, float *score_sum_tensor, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) +{ + int validCount = 0; + int grid_len = grid_h * grid_w; + for (int i = 0; i < grid_h; i++) + { + for (int j = 0; j < grid_w; j++) + { + int offset = i* grid_w + j; + int max_class_id = -1; + + // 通过 score sum 起到快速过滤的作用 + if (score_sum_tensor != nullptr){ + if (score_sum_tensor[offset] < threshold){ + continue; + } + } + + float max_score = 0; + for (int c= 0; c< OBJ_CLASS_NUM; c++){ + if ((score_tensor[offset] > threshold) && (score_tensor[offset] > max_score)) + { + max_score = score_tensor[offset]; + max_class_id = c; + } + offset += grid_len; + } + + // compute box + if (max_score> threshold){ + offset = i* grid_w + j; + float box[4]; + float before_dfl[dfl_len*4]; + for (int k=0; k< dfl_len*4; k++){ + before_dfl[k] = box_tensor[offset]; + offset += grid_len; + } + compute_dfl(before_dfl, dfl_len, box); + + float x1,y1,x2,y2,w,h; + x1 = (-box[0] + j + 0.5)*stride; + y1 = (-box[1] + i + 0.5)*stride; + x2 = (box[2] + j + 0.5)*stride; + y2 = (box[3] + i + 0.5)*stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(max_score); + classId.push_back(max_class_id); + validCount ++; + } + } + } + return validCount; +} + + +#if defined(RV1106_1103) +static int process_i8_rv1106(int8_t *box_tensor, int32_t box_zp, float box_scale, + int8_t *score_tensor, int32_t score_zp, float score_scale, + int8_t *score_sum_tensor, int32_t score_sum_zp, float score_sum_scale, + int grid_h, int grid_w, int stride, int dfl_len, + std::vector &boxes, + std::vector &objProbs, + std::vector &classId, + float threshold) { + int validCount = 0; + int grid_len = grid_h * grid_w; + int8_t score_thres_i8 = qnt_f32_to_affine(threshold, score_zp, score_scale); + int8_t score_sum_thres_i8 = qnt_f32_to_affine(threshold, score_sum_zp, score_sum_scale); + + for (int i = 0; i < grid_h; i++) { + for (int j = 0; j < grid_w; j++) { + int offset = i * grid_w + j; + int max_class_id = -1; 
+ + // 通过 score sum 起到快速过滤的作用 + if (score_sum_tensor != nullptr) { + //score_sum_tensor [1, 1, 80, 80] + if (score_sum_tensor[offset] < score_sum_thres_i8) { + continue; + } + } + + int8_t max_score = -score_zp; + offset = offset * OBJ_CLASS_NUM; + for (int c = 0; c < OBJ_CLASS_NUM; c++) { + if ((score_tensor[offset + c] > score_thres_i8) && (score_tensor[offset + c] > max_score)) { + max_score = score_tensor[offset + c]; //80类 [1, 80, 80, 80] 3588NCHW 1106NHWC + max_class_id = c; + } + } + + // compute box + if (max_score > score_thres_i8) { + offset = (i * grid_w + j) * 4 * dfl_len; + float box[4]; + float before_dfl[dfl_len*4]; + for (int k=0; k< dfl_len*4; k++){ + before_dfl[k] = deqnt_affine_to_f32(box_tensor[offset + k], box_zp, box_scale); + } + compute_dfl(before_dfl, dfl_len, box); + + float x1, y1, x2, y2, w, h; + x1 = (-box[0] + j + 0.5) * stride; + y1 = (-box[1] + i + 0.5) * stride; + x2 = (box[2] + j + 0.5) * stride; + y2 = (box[3] + i + 0.5) * stride; + w = x2 - x1; + h = y2 - y1; + boxes.push_back(x1); + boxes.push_back(y1); + boxes.push_back(w); + boxes.push_back(h); + + objProbs.push_back(deqnt_affine_to_f32(max_score, score_zp, score_scale)); + classId.push_back(max_class_id); + validCount ++; + } + } + } + printf("validCount=%d\n", validCount); + printf("grid h-%d, w-%d, stride %d\n", grid_h, grid_w, stride); + return validCount; +} +#endif + +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results) +{ +#if defined(RV1106_1103) + rknn_tensor_mem **_outputs = (rknn_tensor_mem **)outputs; +#else + rknn_output *_outputs = (rknn_output *)outputs; +#endif + std::vector filterBoxes; + std::vector objProbs; + std::vector classId; + int validCount = 0; + int stride = 0; + int grid_h = 0; + int grid_w = 0; + int model_in_w = app_ctx->model_width; + int model_in_h = app_ctx->model_height; + + memset(od_results, 0, sizeof(object_detect_result_list)); + + // default 3 branch +#ifdef RKNPU1 + int dfl_len = app_ctx->output_attrs[0].dims[2] / 4; +#else + int dfl_len = app_ctx->output_attrs[0].dims[1] /4; +#endif + int output_per_branch = app_ctx->io_num.n_output / 3; + for (int i = 0; i < 3; i++) + { +#if defined(RV1106_1103) + dfl_len = app_ctx->output_attrs[0].dims[3] /4; + void *score_sum = nullptr; + int32_t score_sum_zp = 0; + float score_sum_scale = 1.0; + if (output_per_branch == 3) { + score_sum = _outputs[i * output_per_branch + 2]->virt_addr; + score_sum_zp = app_ctx->output_attrs[i * output_per_branch + 2].zp; + score_sum_scale = app_ctx->output_attrs[i * output_per_branch + 2].scale; + } + int box_idx = i * output_per_branch; + int score_idx = i * output_per_branch + 1; + grid_h = app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[2]; + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) { + validCount += process_i8_rv1106((int8_t *)_outputs[box_idx]->virt_addr, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx]->virt_addr, app_ctx->output_attrs[score_idx].zp, + app_ctx->output_attrs[score_idx].scale, (int8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, filterBoxes, objProbs, classId, conf_threshold); + } + else + { + printf("RV1106/1103 only support quantization mode\n", LABEL_NALE_TXT_PATH); + return -1; + } + +#else + void *score_sum = nullptr; + int32_t score_sum_zp = 0; + float score_sum_scale = 1.0; + if (output_per_branch == 3){ + 
score_sum = _outputs[i*output_per_branch + 2].buf; + score_sum_zp = app_ctx->output_attrs[i*output_per_branch + 2].zp; + score_sum_scale = app_ctx->output_attrs[i*output_per_branch + 2].scale; + } + int box_idx = i*output_per_branch; + int score_idx = i*output_per_branch + 1; + +#ifdef RKNPU1 + grid_h = app_ctx->output_attrs[box_idx].dims[1]; + grid_w = app_ctx->output_attrs[box_idx].dims[0]; +#else + grid_h = app_ctx->output_attrs[box_idx].dims[2]; + grid_w = app_ctx->output_attrs[box_idx].dims[3]; +#endif + stride = model_in_h / grid_h; + + if (app_ctx->is_quant) + { +#ifdef RKNPU1 + validCount += process_u8((uint8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (uint8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, + (uint8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, + filterBoxes, objProbs, classId, conf_threshold); +#else + validCount += process_i8((int8_t *)_outputs[box_idx].buf, app_ctx->output_attrs[box_idx].zp, app_ctx->output_attrs[box_idx].scale, + (int8_t *)_outputs[score_idx].buf, app_ctx->output_attrs[score_idx].zp, app_ctx->output_attrs[score_idx].scale, + (int8_t *)score_sum, score_sum_zp, score_sum_scale, + grid_h, grid_w, stride, dfl_len, + filterBoxes, objProbs, classId, conf_threshold); +#endif + } + else + { + validCount += process_fp32((float *)_outputs[box_idx].buf, (float *)_outputs[score_idx].buf, (float *)score_sum, + grid_h, grid_w, stride, dfl_len, + filterBoxes, objProbs, classId, conf_threshold); + } +#endif + } + + // no object detect + if (validCount <= 0) + { + return 0; + } + std::vector indexArray; + for (int i = 0; i < validCount; ++i) + { + indexArray.push_back(i); + } + quick_sort_indice_inverse(objProbs, 0, validCount - 1, indexArray); + + std::set class_set(std::begin(classId), std::end(classId)); + + for (auto c : class_set) + { + nms(validCount, filterBoxes, classId, indexArray, c, nms_threshold); + } + + int last_count = 0; + od_results->count = 0; + + /* box valid detect target */ + for (int i = 0; i < validCount; ++i) + { + if (indexArray[i] == -1 || last_count >= OBJ_NUMB_MAX_SIZE) + { + continue; + } + int n = indexArray[i]; + + float x1 = filterBoxes[n * 4 + 0] - letter_box->x_pad; + float y1 = filterBoxes[n * 4 + 1] - letter_box->y_pad; + float x2 = x1 + filterBoxes[n * 4 + 2]; + float y2 = y1 + filterBoxes[n * 4 + 3]; + int id = classId[n]; + float obj_conf = objProbs[i]; + + od_results->results[last_count].box.left = (int)(clamp(x1, 0, model_in_w) / letter_box->scale); + od_results->results[last_count].box.top = (int)(clamp(y1, 0, model_in_h) / letter_box->scale); + od_results->results[last_count].box.right = (int)(clamp(x2, 0, model_in_w) / letter_box->scale); + od_results->results[last_count].box.bottom = (int)(clamp(y2, 0, model_in_h) / letter_box->scale); + od_results->results[last_count].prop = obj_conf; + od_results->results[last_count].cls_id = id; + last_count++; + } + od_results->count = last_count; + return 0; +} + +int init_post_process() +{ + int ret = 0; + ret = loadLabelName(LABEL_NALE_TXT_PATH, labels); + if (ret < 0) + { + printf("Load %s failed!\n", LABEL_NALE_TXT_PATH); + return -1; + } + return 0; +} + +char *coco_cls_to_name(int cls_id) +{ + + if (cls_id >= OBJ_CLASS_NUM) + { + return "null"; + } + + if (labels[cls_id]) + { + return labels[cls_id]; + } + + return "null"; +} + +void deinit_post_process() +{ + for (int i = 0; i < OBJ_CLASS_NUM; i++) + { + if (labels[i] != 
nullptr) + { + free(labels[i]); + labels[i] = nullptr; + } + } +} diff --git a/examples/yolo11/cpp/postprocess.h b/examples/yolo11/cpp/postprocess.h new file mode 100644 index 00000000..9eb7628b --- /dev/null +++ b/examples/yolo11/cpp/postprocess.h @@ -0,0 +1,36 @@ +#ifndef _RKNN_YOLO11_DEMO_POSTPROCESS_H_ +#define _RKNN_YOLO11_DEMO_POSTPROCESS_H_ + +#include +#include +#include "rknn_api.h" +#include "common.h" +#include "image_utils.h" + +#define OBJ_NAME_MAX_SIZE 64 +#define OBJ_NUMB_MAX_SIZE 128 +#define OBJ_CLASS_NUM 80 +#define NMS_THRESH 0.45 +#define BOX_THRESH 0.25 + +// class rknn_app_context_t; + +typedef struct { + image_rect_t box; + float prop; + int cls_id; +} object_detect_result; + +typedef struct { + int id; + int count; + object_detect_result results[OBJ_NUMB_MAX_SIZE]; +} object_detect_result_list; + +int init_post_process(); +void deinit_post_process(); +char *coco_cls_to_name(int cls_id); +int post_process(rknn_app_context_t *app_ctx, void *outputs, letterbox_t *letter_box, float conf_threshold, float nms_threshold, object_detect_result_list *od_results); + +void deinitPostProcess(); +#endif //_RKNN_YOLO11_DEMO_POSTPROCESS_H_ diff --git a/examples/yolo11/cpp/rknpu1/yolo11.cc b/examples/yolo11/cpp/rknpu1/yolo11.cc new file mode 100644 index 00000000..9e5e6d6c --- /dev/null +++ b/examples/yolo11/cpp/rknpu1/yolo11.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolo11.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolo11_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0); + free(model); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_UINT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[2]; + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[0]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[1]; + app_ctx->model_channel = input_attrs[0].dims[0]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolo11_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolo11_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // Default NMS threshold + const float box_conf_threshold = BOX_THRESH; // Default box threshold + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + 
dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolo11/cpp/rknpu2/yolo11.cc b/examples/yolo11/cpp/rknpu2/yolo11.cc new file mode 100644 index 00000000..2b38e657 --- /dev/null +++ b/examples/yolo11/cpp/rknpu2/yolo11.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolo11.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolo11_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) + { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0, NULL); + free(model); + if (ret < 0) + { + printf("rknn_init fail! 
ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type == RKNN_TENSOR_INT8) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } + else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolo11_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolo11_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + rknn_input inputs[app_ctx->io_num.n_input]; + rknn_output outputs[app_ctx->io_num.n_output]; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + memset(inputs, 0, 
sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.virt_addr = (unsigned char *)malloc(dst_img.size); + if (dst_img.virt_addr == NULL) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Set Input Data + inputs[0].index = 0; + inputs[0].type = RKNN_TENSOR_UINT8; + inputs[0].fmt = RKNN_TENSOR_NHWC; + inputs[0].size = app_ctx->model_width * app_ctx->model_height * app_ctx->model_channel; + inputs[0].buf = dst_img.virt_addr; + + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + // Get Output + memset(outputs, 0, sizeof(outputs)); + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + outputs[i].index = i; + outputs[i].want_float = (!app_ctx->is_quant); + } + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + // Remeber to release rknn output + rknn_outputs_release(app_ctx->rknn_ctx, app_ctx->io_num.n_output, outputs); + +out: + if (dst_img.virt_addr != NULL) + { + free(dst_img.virt_addr); + } + + return ret; +} \ No newline at end of file diff --git a/examples/yolo11/cpp/rknpu2/yolo11_rv1106_1103.cc b/examples/yolo11/cpp/rknpu2/yolo11_rv1106_1103.cc new file mode 100644 index 00000000..92b19842 --- /dev/null +++ b/examples/yolo11/cpp/rknpu2/yolo11_rv1106_1103.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "yolo11.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], + attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), + get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +int init_yolo11_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + ret = rknn_init(&ctx, (char *)model_path, 0, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + //When using the zero-copy API interface, query the native output tensor attribute + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_attrs[0].type = RKNN_TENSOR_UINT8; + // default fmt is NHWC,1106 npu only support NHWC in zero copy mode + input_attrs[0].fmt = RKNN_TENSOR_NHWC; + printf("input_attrs[0].size_with_stride=%d\n", input_attrs[0].size_with_stride); + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_attrs[0]); + if (ret < 0) { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) { + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_attrs[i]); + if (ret < 0) { + printf("output_mems rknn_set_io_mem fail! 
ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC) + { + app_ctx->is_quant = true; + } + else + { + app_ctx->is_quant = false; + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) + { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } else + { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int release_yolo11_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + for (int i = 0; i < app_ctx->io_num.n_input; i++) { + if (app_ctx->input_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + if (app_ctx->output_mems[i] != NULL) { + rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + } + } + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + return 0; +} + +int inference_yolo11_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) +{ + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) + { + return -1; + } + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.fd = app_ctx->input_mems[0]->fd; + if (dst_img.virt_addr == NULL && dst_img.fd == 0) + { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) + { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! 
ret=%d\n", ret); + return -1; + } + + // Post Process + post_process(app_ctx, app_ctx->output_mems, &letter_box, box_conf_threshold, nms_threshold, od_results); +out: + return ret; +} \ No newline at end of file diff --git a/examples/yolo11/cpp/rknpu2/yolo11_zero_copy.cc b/examples/yolo11/cpp/rknpu2/yolo11_zero_copy.cc new file mode 100644 index 00000000..825613b0 --- /dev/null +++ b/examples/yolo11/cpp/rknpu2/yolo11_zero_copy.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "yolo11.h" +#include "common.h" +#include "file_utils.h" +#include "image_utils.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) { + char dims[128] = {0}; + for (int i = 0; i < attr->n_dims; ++i) { + int idx = strlen(dims); + sprintf(&dims[idx], "%d%s", attr->dims[i], (i == attr->n_dims - 1) ? "" : ", "); + } + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, w_stride = %d, size_with_stride = %d, " + "fmt=%s, type=%s, qnt_type=%s, " + "zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, dims, attr->n_elems, attr->size, attr->w_stride, attr->size_with_stride, + get_format_string(attr->fmt), get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, + attr->scale); +} + +int init_yolo11_model(const char *model_path, rknn_app_context_t *app_ctx) { + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + model_len = read_data_from_file(model_path, &model); + if (model == NULL) { + printf("load_model fail!\n"); + return -1; + } + + ret = rknn_init(&ctx, model, model_len, 0, NULL); + free(model); + if (ret < 0) { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_native_attrs[io_num.n_input]; + memset(input_native_attrs, 0, sizeof(input_native_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_native_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_INPUT_ATTR, &(input_native_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_native_attrs[i])); + } + + // default input type is int8 (normalize and quantize need compute in outside) + // if set uint8, will fuse normalize and quantize to npu + input_native_attrs[0].type = RKNN_TENSOR_UINT8; + app_ctx->input_mems[0] = rknn_create_mem(ctx, input_native_attrs[0].size_with_stride); + + // Set input tensor memory + ret = rknn_set_io_mem(ctx, app_ctx->input_mems[0], &input_native_attrs[0]); + if (ret < 0) { + printf("input_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_native_attrs[io_num.n_output]; + memset(output_native_attrs, 0, sizeof(output_native_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_native_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_NATIVE_OUTPUT_ATTR, &(output_native_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_native_attrs[i])); + } + + // Set output tensor memory + for (uint32_t i = 0; i < io_num.n_output; ++i) { + app_ctx->output_mems[i] = rknn_create_mem(ctx, output_native_attrs[i].size_with_stride); + ret = rknn_set_io_mem(ctx, app_ctx->output_mems[i], &output_native_attrs[i]); + if (ret < 0) { + printf("output_mems rknn_set_io_mem fail! ret=%d\n", ret); + return -1; + } + } + + // Set to context + app_ctx->rknn_ctx = ctx; + + // TODO + if (output_native_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_native_attrs[0].type == RKNN_TENSOR_INT8) { + app_ctx->is_quant = true; + } else { + app_ctx->is_quant = false; + } + + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + } + + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) { + printf("rknn_query fail! 
ret=%d\n", ret); + return -1; + } + } + + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + app_ctx->input_native_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_native_attrs, input_native_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_native_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_native_attrs, output_native_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + + if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) { + printf("model is NCHW input fmt\n"); + app_ctx->model_channel = input_attrs[0].dims[1]; + app_ctx->model_height = input_attrs[0].dims[2]; + app_ctx->model_width = input_attrs[0].dims[3]; + } else { + printf("model is NHWC input fmt\n"); + app_ctx->model_height = input_attrs[0].dims[1]; + app_ctx->model_width = input_attrs[0].dims[2]; + app_ctx->model_channel = input_attrs[0].dims[3]; + } + printf("model input height=%d, width=%d, channel=%d\n", + app_ctx->model_height, app_ctx->model_width, app_ctx->model_channel); + + return 0; +} + +int NC1HWC2_i8_to_NCHW_i8(const int8_t *src, int8_t *dst, int *dims, int channel, int h, int w, int zp, float scale) { + int batch = dims[0]; + int C1 = dims[1]; + int C2 = dims[4]; + int hw_src = dims[2] * dims[3]; + int hw_dst = h * w; + for (int i = 0; i < batch; i++) { + const int8_t *src_b = src + i * C1 * hw_src * C2; + int8_t *dst_b = dst + i * channel * hw_dst; + for (int c = 0; c < channel; ++c) { + int plane = c / C2; + const int8_t *src_bc = plane * hw_src * C2 + src_b; + int offset = c % C2; + for (int cur_h = 0; cur_h < h; ++cur_h) + for (int cur_w = 0; cur_w < w; ++cur_w) { + int cur_hw = cur_h * w + cur_w; + dst_b[c * hw_dst + cur_hw] = src_bc[C2 * cur_hw + offset] ; // int8-->int8 + } + } + } + + return 0; +} + +int release_yolo11_model(rknn_app_context_t *app_ctx) { + int ret; + if (app_ctx->input_attrs != NULL) { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + if (app_ctx->output_attrs != NULL) { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + if (app_ctx->input_native_attrs != NULL) { + free(app_ctx->input_native_attrs); + app_ctx->input_native_attrs = NULL; + } + if (app_ctx->output_native_attrs != NULL) { + free(app_ctx->output_native_attrs); + app_ctx->output_native_attrs = NULL; + } + + for (int i = 0; i < app_ctx->io_num.n_input; i++) { + if (app_ctx->input_mems[i] != NULL) { + ret = rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->input_mems[i]); + if (ret != RKNN_SUCC) { + printf("rknn_destroy_mem fail! ret=%d\n", ret); + return -1; + } + } + } + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + if (app_ctx->output_mems[i] != NULL) { + ret = rknn_destroy_mem(app_ctx->rknn_ctx, app_ctx->output_mems[i]); + if (ret != RKNN_SUCC) { + printf("rknn_destroy_mem fail! ret=%d\n", ret); + return -1; + } + } + } + if (app_ctx->rknn_ctx != 0) { + ret = rknn_destroy(app_ctx->rknn_ctx); + if (ret != RKNN_SUCC) { + printf("rknn_destroy fail! 
ret=%d\n", ret); + return -1; + } + app_ctx->rknn_ctx = 0; + + } + return 0; +} + +int inference_yolo11_model(rknn_app_context_t *app_ctx, image_buffer_t *img, object_detect_result_list *od_results) { + int ret; + image_buffer_t dst_img; + letterbox_t letter_box; + const float nms_threshold = NMS_THRESH; // 默认的NMS阈值 + const float box_conf_threshold = BOX_THRESH; // 默认的置信度阈值 + int bg_color = 114; + + if ((!app_ctx) || !(img) || (!od_results)) { + return -1; + } + + memset(od_results, 0x00, sizeof(*od_results)); + memset(&letter_box, 0, sizeof(letterbox_t)); + memset(&dst_img, 0, sizeof(image_buffer_t)); + + // Pre Process + dst_img.width = app_ctx->model_width; + dst_img.height = app_ctx->model_height; + dst_img.format = IMAGE_FORMAT_RGB888; + dst_img.size = get_image_size(&dst_img); + dst_img.fd = app_ctx->input_mems[0]->fd; + dst_img.virt_addr = (unsigned char*)app_ctx->input_mems[0]->virt_addr; + + if (dst_img.virt_addr == NULL && dst_img.fd == 0) { + printf("malloc buffer size:%d fail!\n", dst_img.size); + return -1; + } + + // letterbox + ret = convert_image_with_letterbox(img, &dst_img, &letter_box, bg_color); + if (ret < 0) { + printf("convert_image_with_letterbox fail! ret=%d\n", ret); + return -1; + } + + // Run + printf("rknn_run\n"); + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) { + printf("rknn_run fail! ret=%d\n", ret); + return -1; + } + + //NC1HWC2 to NCHW + rknn_output outputs[app_ctx->io_num.n_output]; + memset(outputs, 0, sizeof(outputs)); + for (uint32_t i = 0; i < app_ctx->io_num.n_output; i++) { + int channel = app_ctx->output_attrs[i].dims[1]; + int h = app_ctx->output_attrs[i].n_dims > 2 ? app_ctx->output_attrs[i].dims[2] : 1; + int w = app_ctx->output_attrs[i].n_dims > 3 ? app_ctx->output_attrs[i].dims[3] : 1; + int hw = h * w; + int zp = app_ctx->output_native_attrs[i].zp; + float scale = app_ctx->output_native_attrs[i].scale; + if (app_ctx->is_quant) { + outputs[i].size = app_ctx->output_native_attrs[i].n_elems * sizeof(int8_t); + outputs[i].buf = (int8_t *)malloc(outputs[i].size); + if (app_ctx->output_native_attrs[i].fmt == RKNN_TENSOR_NC1HWC2) { + NC1HWC2_i8_to_NCHW_i8((int8_t *)app_ctx->output_mems[i]->virt_addr, (int8_t *)outputs[i].buf, + (int *)app_ctx->output_native_attrs[i].dims, channel, h, w, zp, scale); + } else { + memcpy(outputs[i].buf, app_ctx->output_mems[i]->virt_addr, outputs[i].size); + } + } else { + printf("Currently zero copy does not support fp16!\n"); + goto out; + } + } + + // Post Process + post_process(app_ctx, outputs, &letter_box, box_conf_threshold, nms_threshold, od_results); + + for (int i = 0; i < app_ctx->io_num.n_output; i++) { + free(outputs[i].buf); + } + +out: + return ret; +} \ No newline at end of file diff --git a/examples/yolo11/cpp/yolo11.h b/examples/yolo11/cpp/yolo11.h new file mode 100644 index 00000000..6fd1c4be --- /dev/null +++ b/examples/yolo11/cpp/yolo11.h @@ -0,0 +1,61 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef _RKNN_DEMO_YOLO11_H_ +#define _RKNN_DEMO_YOLO11_H_ + +#include "rknn_api.h" +#include "common.h" + +#if defined(RV1106_1103) + typedef struct { + char *dma_buf_virt_addr; + int dma_buf_fd; + int size; + }rknn_dma_buf; +#endif + +typedef struct { + rknn_context rknn_ctx; + rknn_input_output_num io_num; + rknn_tensor_attr* input_attrs; + rknn_tensor_attr* output_attrs; +#if defined(RV1106_1103) + rknn_tensor_mem* input_mems[1]; + rknn_tensor_mem* output_mems[9]; + rknn_dma_buf img_dma_buf; +#endif +#if defined(ZERO_COPY) + rknn_tensor_mem* input_mems[1]; + rknn_tensor_mem* output_mems[9]; + rknn_tensor_attr* input_native_attrs; + rknn_tensor_attr* output_native_attrs; +#endif + int model_channel; + int model_width; + int model_height; + bool is_quant; +} rknn_app_context_t; + +#include "postprocess.h" + + +int init_yolo11_model(const char* model_path, rknn_app_context_t* app_ctx); + +int release_yolo11_model(rknn_app_context_t* app_ctx); + +int inference_yolo11_model(rknn_app_context_t* app_ctx, image_buffer_t* img, object_detect_result_list* od_results); + +#endif //_RKNN_DEMO_YOLO11_H_ \ No newline at end of file diff --git a/examples/yolo11/model/bus.jpg b/examples/yolo11/model/bus.jpg new file mode 100644 index 00000000..d8ef30bd Binary files /dev/null and b/examples/yolo11/model/bus.jpg differ diff --git a/examples/yolo11/model/coco_80_labels_list.txt b/examples/yolo11/model/coco_80_labels_list.txt new file mode 100644 index 00000000..941cb4e1 --- /dev/null +++ b/examples/yolo11/model/coco_80_labels_list.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/examples/yolo11/model/dataset.txt b/examples/yolo11/model/dataset.txt new file mode 100644 index 00000000..aeeb6ad2 --- /dev/null +++ b/examples/yolo11/model/dataset.txt @@ -0,0 +1 @@ +bus.jpg \ No newline at end of file diff --git a/examples/yolo11/model/download_model.sh b/examples/yolo11/model/download_model.sh new file mode 100644 index 00000000..e8d89c7d --- /dev/null +++ b/examples/yolo11/model/download_model.sh @@ -0,0 +1 @@ +wget -O ./yolo11n.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/yolo11/yolo11n.onnx \ No newline at end of file diff --git a/examples/yolo11/model_comparison/yolo11_graph_comparison.jpg b/examples/yolo11/model_comparison/yolo11_graph_comparison.jpg new file mode 100644 index 00000000..4ef1e1cf Binary files /dev/null and b/examples/yolo11/model_comparison/yolo11_graph_comparison.jpg differ diff --git a/examples/yolo11/model_comparison/yolo11_output_comparison.jpg b/examples/yolo11/model_comparison/yolo11_output_comparison.jpg new file mode 100644 index 00000000..ce66297f Binary files /dev/null and b/examples/yolo11/model_comparison/yolo11_output_comparison.jpg differ diff 
--git a/examples/yolo11/python/convert.py b/examples/yolo11/python/convert.py new file mode 100644 index 00000000..d0170ef7 --- /dev/null +++ b/examples/yolo11/python/convert.py @@ -0,0 +1,73 @@ +import sys +from rknn.api import RKNN + +DATASET_PATH = '../../../datasets/COCO/coco_subset_20.txt' +DEFAULT_RKNN_PATH = '../model/yolo11.rknn' +DEFAULT_QUANT = True + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3588,rk3576,rk1808,rv1109,rv1126]") + print(" dtype choose from [i8, fp] for [rk3562,rk3566,rk3568,rk3588,rk3576]") + print(" dtype choose from [u8, fp] for [rk1808,rv1109,rv1126]") + exit(1) + + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = DEFAULT_RKNN_PATH + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + rknn.config(mean_values=[[0, 0, 0]], std_values=[[255, 255, 255]], target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant, dataset=DATASET_PATH) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') + + # Release + rknn.release() diff --git a/examples/yolo11/python/yolo11.py b/examples/yolo11/python/yolo11.py new file mode 100644 index 00000000..0f8f19c6 --- /dev/null +++ b/examples/yolo11/python/yolo11.py @@ -0,0 +1,303 @@ +import os +import cv2 +import sys +import argparse + +# add path +realpath = os.path.abspath(__file__) +_sep = os.path.sep +realpath = realpath.split(_sep) +sys.path.append(os.path.join(realpath[0]+_sep, *realpath[1:realpath.index('rknn_model_zoo')+1])) + +from py_utils.coco_utils import COCO_test_helper +import numpy as np + + +OBJ_THRESH = 0.25 +NMS_THRESH = 0.45 + +# The follew two param is for map test +# OBJ_THRESH = 0.001 +# NMS_THRESH = 0.65 + +IMG_SIZE = (640, 640) # (width, height), such as (1280, 736) + +CLASSES = ("person", "bicycle", "car","motorbike ","aeroplane ","bus ","train","truck ","boat","traffic light", + "fire hydrant","stop sign ","parking meter","bench","bird","cat","dog ","horse ","sheep","cow","elephant", + "bear","zebra ","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite", + "baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife ", + "spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza ","donut","cake","chair","sofa", + "pottedplant","bed","diningtable","toilet ","tvmonitor","laptop ","mouse ","remote ","keyboard ","cell phone","microwave ", 
+ "oven ","toaster","sink","refrigerator ","book","clock","vase","scissors ","teddy bear ","hair drier", "toothbrush ") + +coco_id_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] + + +def filter_boxes(boxes, box_confidences, box_class_probs): + """Filter boxes with object threshold. + """ + box_confidences = box_confidences.reshape(-1) + candidate, class_num = box_class_probs.shape + + class_max_score = np.max(box_class_probs, axis=-1) + classes = np.argmax(box_class_probs, axis=-1) + + _class_pos = np.where(class_max_score* box_confidences >= OBJ_THRESH) + scores = (class_max_score* box_confidences)[_class_pos] + + boxes = boxes[_class_pos] + classes = classes[_class_pos] + + return boxes, classes, scores + +def nms_boxes(boxes, scores): + """Suppress non-maximal boxes. + # Returns + keep: ndarray, index of effective boxes. + """ + x = boxes[:, 0] + y = boxes[:, 1] + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + + areas = w * h + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + + xx1 = np.maximum(x[i], x[order[1:]]) + yy1 = np.maximum(y[i], y[order[1:]]) + xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]]) + yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]]) + + w1 = np.maximum(0.0, xx2 - xx1 + 0.00001) + h1 = np.maximum(0.0, yy2 - yy1 + 0.00001) + inter = w1 * h1 + + ovr = inter / (areas[i] + areas[order[1:]] - inter) + inds = np.where(ovr <= NMS_THRESH)[0] + order = order[inds + 1] + keep = np.array(keep) + return keep + +def dfl(position): + # Distribution Focal Loss (DFL) + import torch + x = torch.tensor(position) + n,c,h,w = x.shape + p_num = 4 + mc = c//p_num + y = x.reshape(n,p_num,mc,h,w) + y = y.softmax(2) + acc_metrix = torch.tensor(range(mc)).float().reshape(1,1,mc,1,1) + y = (y*acc_metrix).sum(2) + return y.numpy() + + +def box_process(position): + grid_h, grid_w = position.shape[2:4] + col, row = np.meshgrid(np.arange(0, grid_w), np.arange(0, grid_h)) + col = col.reshape(1, 1, grid_h, grid_w) + row = row.reshape(1, 1, grid_h, grid_w) + grid = np.concatenate((col, row), axis=1) + stride = np.array([IMG_SIZE[1]//grid_h, IMG_SIZE[0]//grid_w]).reshape(1,2,1,1) + + position = dfl(position) + box_xy = grid +0.5 -position[:,0:2,:,:] + box_xy2 = grid +0.5 +position[:,2:4,:,:] + xyxy = np.concatenate((box_xy*stride, box_xy2*stride), axis=1) + + return xyxy + +def post_process(input_data): + boxes, scores, classes_conf = [], [], [] + defualt_branch=3 + pair_per_branch = len(input_data)//defualt_branch + # Python 忽略 score_sum 输出 + for i in range(defualt_branch): + boxes.append(box_process(input_data[pair_per_branch*i])) + classes_conf.append(input_data[pair_per_branch*i+1]) + scores.append(np.ones_like(input_data[pair_per_branch*i+1][:,:1,:,:], dtype=np.float32)) + + def sp_flatten(_in): + ch = _in.shape[1] + _in = _in.transpose(0,2,3,1) + return _in.reshape(-1, ch) + + boxes = [sp_flatten(_v) for _v in boxes] + classes_conf = [sp_flatten(_v) for _v in classes_conf] + scores = [sp_flatten(_v) for _v in scores] + + boxes = np.concatenate(boxes) + classes_conf = np.concatenate(classes_conf) + scores = np.concatenate(scores) + + # filter according to threshold + boxes, classes, scores = filter_boxes(boxes, scores, classes_conf) + + 
# nms + nboxes, nclasses, nscores = [], [], [] + for c in set(classes): + inds = np.where(classes == c) + b = boxes[inds] + c = classes[inds] + s = scores[inds] + keep = nms_boxes(b, s) + + if len(keep) != 0: + nboxes.append(b[keep]) + nclasses.append(c[keep]) + nscores.append(s[keep]) + + if not nclasses and not nscores: + return None, None, None + + boxes = np.concatenate(nboxes) + classes = np.concatenate(nclasses) + scores = np.concatenate(nscores) + + return boxes, classes, scores + + +def draw(image, boxes, scores, classes): + for box, score, cl in zip(boxes, scores, classes): + top, left, right, bottom = [int(_b) for _b in box] + print("%s @ (%d %d %d %d) %.3f" % (CLASSES[cl], top, left, right, bottom, score)) + cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 2) + cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score), + (top, left - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2) + +def setup_model(args): + model_path = args.model_path + if model_path.endswith('.pt') or model_path.endswith('.torchscript'): + platform = 'pytorch' + from py_utils.pytorch_executor import Torch_model_container + model = Torch_model_container(args.model_path) + elif model_path.endswith('.rknn'): + platform = 'rknn' + from py_utils.rknn_executor import RKNN_model_container + model = RKNN_model_container(args.model_path, args.target, args.device_id) + elif model_path.endswith('onnx'): + platform = 'onnx' + from py_utils.onnx_executor import ONNX_model_container + model = ONNX_model_container(args.model_path) + else: + assert False, "{} is not rknn/pytorch/onnx model".format(model_path) + print('Model-{} is {} model, starting val'.format(model_path, platform)) + return model, platform + +def img_check(path): + img_type = ['.jpg', '.jpeg', '.png', '.bmp'] + for _type in img_type: + if path.endswith(_type) or path.endswith(_type.upper()): + return True + return False + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Process some integers.') + # basic params + parser.add_argument('--model_path', type=str, required= True, help='model path, could be .pt or .rknn file') + parser.add_argument('--target', type=str, default='rk3566', help='target RKNPU platform') + parser.add_argument('--device_id', type=str, default=None, help='device id') + + parser.add_argument('--img_show', action='store_true', default=False, help='draw the result and show') + parser.add_argument('--img_save', action='store_true', default=False, help='save the result') + + # data params + parser.add_argument('--anno_json', type=str, default='../../../datasets/COCO/annotations/instances_val2017.json', help='coco annotation path') + # coco val folder: '../../../datasets/COCO//val2017' + parser.add_argument('--img_folder', type=str, default='../model', help='img folder path') + parser.add_argument('--coco_map_test', action='store_true', help='enable coco map test') + + args = parser.parse_args() + + # init model + model, platform = setup_model(args) + + file_list = sorted(os.listdir(args.img_folder)) + img_list = [] + for path in file_list: + if img_check(path): + img_list.append(path) + co_helper = COCO_test_helper(enable_letter_box=True) + + # run test + for i in range(len(img_list)): + print('infer {}/{}'.format(i+1, len(img_list)), end='\r') + + img_name = img_list[i] + img_path = os.path.join(args.img_folder, img_name) + if not os.path.exists(img_path): + print("{} is not found", img_name) + continue + + img_src = cv2.imread(img_path) + if img_src is None: + continue + + ''' + # using for 
test input dumped by C.demo + img_src = np.fromfile('./input_b/demo_c_input_hwc_rgb.txt', dtype=np.uint8).reshape(640,640,3) + img_src = cv2.cvtColor(img_src, cv2.COLOR_RGB2BGR) + ''' + + # Due to rga init with (0,0,0), we using pad_color (0,0,0) instead of (114, 114, 114) + pad_color = (0,0,0) + img = co_helper.letter_box(im= img_src.copy(), new_shape=(IMG_SIZE[1], IMG_SIZE[0]), pad_color=(0,0,0)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + # preprocee if not rknn model + if platform in ['pytorch', 'onnx']: + input_data = img.transpose((2,0,1)) + input_data = input_data.reshape(1,*input_data.shape).astype(np.float32) + input_data = input_data/255. + else: + input_data = img + + outputs = model.run([input_data]) + boxes, classes, scores = post_process(outputs) + + if args.img_show or args.img_save: + print('\n\nIMG: {}'.format(img_name)) + img_p = img_src.copy() + if boxes is not None: + draw(img_p, co_helper.get_real_box(boxes), scores, classes) + + if args.img_save: + if not os.path.exists('./result'): + os.mkdir('./result') + result_path = os.path.join('./result', img_name) + cv2.imwrite(result_path, img_p) + print('Detection result save to {}'.format(result_path)) + + if args.img_show: + cv2.imshow("full post process result", img_p) + cv2.waitKeyEx(0) + + # record maps + if args.coco_map_test is True: + if boxes is not None: + for i in range(boxes.shape[0]): + co_helper.add_single_record(image_id = int(img_name.split('.')[0]), + category_id = coco_id_list[int(classes[i])], + bbox = boxes[i], + score = round(scores[i], 5).item() + ) + + # calculate maps + if args.coco_map_test is True: + pred_json = args.model_path.split('.')[-2]+ '_{}'.format(platform) +'.json' + pred_json = pred_json.split('/')[-1] + pred_json = os.path.join('./', pred_json) + co_helper.export_to_json(pred_json) + + from py_utils.coco_utils import coco_eval_with_json + coco_eval_with_json(args.anno_json, pred_json) + + # release + model.release() diff --git a/examples/yolo11/result.png b/examples/yolo11/result.png new file mode 100644 index 00000000..97779e5a Binary files /dev/null and b/examples/yolo11/result.png differ diff --git a/examples/yolov10/python/yolov10.py b/examples/yolov10/python/yolov10.py index a232e04c..a5d43a30 100644 --- a/examples/yolov10/python/yolov10.py +++ b/examples/yolov10/python/yolov10.py @@ -344,4 +344,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() diff --git a/examples/yolov5/python/yolov5.py b/examples/yolov5/python/yolov5.py index cf335c1f..b60710fd 100644 --- a/examples/yolov5/python/yolov5.py +++ b/examples/yolov5/python/yolov5.py @@ -299,4 +299,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() diff --git a/examples/yolov5_seg/python/yolov5_seg.py b/examples/yolov5_seg/python/yolov5_seg.py index 5b54b11b..67894f65 100644 --- a/examples/yolov5_seg/python/yolov5_seg.py +++ b/examples/yolov5_seg/python/yolov5_seg.py @@ -371,4 +371,5 @@ def img_check(path): results.extend(eval.stats[:2]) # update results (mAP@0.5:0.95, mAP@0.5) print(results) - + # release + model.release() diff --git a/examples/yolov6/python/yolov6.py b/examples/yolov6/python/yolov6.py index 1d21b0a6..c2db2f19 100644 --- a/examples/yolov6/python/yolov6.py +++ b/examples/yolov6/python/yolov6.py @@ -305,4 +305,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json 
coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() diff --git a/examples/yolov7/python/yolov7.py b/examples/yolov7/python/yolov7.py index b3f41a7a..62494b70 100644 --- a/examples/yolov7/python/yolov7.py +++ b/examples/yolov7/python/yolov7.py @@ -310,4 +310,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() \ No newline at end of file diff --git a/examples/yolov8/python/yolov8.py b/examples/yolov8/python/yolov8.py index 48713194..0f8f19c6 100644 --- a/examples/yolov8/python/yolov8.py +++ b/examples/yolov8/python/yolov8.py @@ -299,4 +299,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() diff --git a/examples/yolov8_obb/python/yolov8_obb.py b/examples/yolov8_obb/python/yolov8_obb.py index f74e086e..4762e2f0 100644 --- a/examples/yolov8_obb/python/yolov8_obb.py +++ b/examples/yolov8_obb/python/yolov8_obb.py @@ -198,7 +198,7 @@ def process(out,model_w,model_h,stride,angle_feature,index,scale_w=1,scale_h=1): # Init runtime environment print('--> Init runtime environment') - ret = rknn.init_runtime(target=args.target) + ret = rknn.init_runtime(target=args.target, device_id=args.device_id) if ret != 0: print('Init runtime environment failed!') exit(ret) diff --git a/examples/yolov8_pose/python/yolov8_pose.py b/examples/yolov8_pose/python/yolov8_pose.py index 5feee783..b3ce246c 100644 --- a/examples/yolov8_pose/python/yolov8_pose.py +++ b/examples/yolov8_pose/python/yolov8_pose.py @@ -181,7 +181,7 @@ def process(out,keypoints,index,model_w,model_h,stride,scale_w=1,scale_h=1): # Init runtime environment print('--> Init runtime environment') - ret = rknn.init_runtime(target=args.target) + ret = rknn.init_runtime(target=args.target, device_id=args.device_id) if ret != 0: print('Init runtime environment failed!') exit(ret) diff --git a/examples/yolov8_seg/python/yolov8_seg.py b/examples/yolov8_seg/python/yolov8_seg.py index 53232598..75217b9e 100644 --- a/examples/yolov8_seg/python/yolov8_seg.py +++ b/examples/yolov8_seg/python/yolov8_seg.py @@ -363,4 +363,5 @@ def img_check(path): results.extend(eval.stats[:2]) # update results (mAP@0.5:0.95, mAP@0.5) print(results) - + # release + model.release() diff --git a/examples/yolox/python/yolox.py b/examples/yolox/python/yolox.py index fa3f8c59..d326af26 100644 --- a/examples/yolox/python/yolox.py +++ b/examples/yolox/python/yolox.py @@ -287,4 +287,5 @@ def img_check(path): from py_utils.coco_utils import coco_eval_with_json coco_eval_with_json(args.anno_json, pred_json) - + # release + model.release() diff --git a/examples/zipformer/README.md b/examples/zipformer/README.md new file mode 100644 index 00000000..e7492fec --- /dev/null +++ b/examples/zipformer/README.md @@ -0,0 +1,210 @@ +# zipformer + +## Table of contents + +- [1. Description](#1-description) +- [2. Current Support Platform](#2-current-support-platform) +- [3. Pretrained Model](#3-pretrained-model) +- [4. Convert to RKNN](#4-convert-to-rknn) +- [5. Python Demo](#5-python-demo) +- [6. Android Demo](#6-android-demo) + - [6.1 Compile and Build](#61-compile-and-build) + - [6.2 Push demo files to device](#62-push-demo-files-to-device) + - [6.3 Run demo](#63-run-demo) +- [7. Linux Demo](#7-linux-demo) + - [7.1 Compile \&\& Build](#71-compile-and-build) + - [7.2 Push demo files to device](#72-push-demo-files-to-device) + - [7.3 Run demo](#73-run-demo) +- [8. 
Expected Results](#8-expected-results) + + + +## 1. Description + +A Chinese-English streaming ASR (speech recognition) model based on the k2 zipformer. + +The model used in this example comes from the following open source project: + +https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t + + + +## 2. Current Support Platform + +RK3566, RK3568, RK3588, RK3562, RK3576 + + + +## 3. Pretrained Model + +Download link: + +[encoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/encoder-epoch-99-avg-1.onnx)
[decoder-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/decoder-epoch-99-avg-1.onnx)
[joiner-epoch-99-avg-1.onnx](https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/joiner-epoch-99-avg-1.onnx) + +Download with shell command: + +```sh +cd model +./download_model.sh +``` + +**Note: For exporting zipformer onnx models, please refer to [export-for-onnx.sh](https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t/blob/main/exp/96/export-for-onnx.sh)** + + +## 4. Convert to RKNN + +*Usage:* + +```shell +cd python +python convert.py <onnx_model> <TARGET_PLATFORM> <dtype(optional)> <output_rknn_path(optional)> + +# such as: +python convert.py ../model/encoder-epoch-99-avg-1.onnx rk3588 +# output model will be saved as ../model/encoder-epoch-99-avg-1.rknn + +python convert.py ../model/decoder-epoch-99-avg-1.onnx rk3588 +# output model will be saved as ../model/decoder-epoch-99-avg-1.rknn + +python convert.py ../model/joiner-epoch-99-avg-1.onnx rk3588 +# output model will be saved as ../model/joiner-epoch-99-avg-1.rknn +``` + +*Description:* + +- `<onnx_model>`: Specify ONNX model path. +- `<TARGET_PLATFORM>`: Specify NPU platform name. Supported platforms are listed [here](#2-current-support-platform). +- `<dtype>(optional)`: Specify as `i8` or `fp`. `i8` for doing quantization, `fp` for no quantization. Default is `fp`. +- `<output_rknn_path>(optional)`: Specify save path for the RKNN model; by default it is saved in the same directory as the ONNX model. + + + +## 5. Python Demo + +*Usage:* + +```shell +# Install kaldifeat +# Refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html for installation. +# This python demo is tested under version: kaldifeat-1.25.4.dev20240223 + +cd python +# Inference with ONNX model +python zipformer.py --encoder_model_path <encoder_onnx_model> --decoder_model_path <decoder_onnx_model> --joiner_model_path <joiner_onnx_model> + +# Inference with RKNN model +python zipformer.py --encoder_model_path <encoder_rknn_model> --decoder_model_path <decoder_rknn_model> --joiner_model_path <joiner_rknn_model> --target <TARGET_PLATFORM> +``` +*Description:* +- `<TARGET_PLATFORM>`: Specify NPU platform name. Supported platforms are listed [here](#2-current-support-platform). +- `<encoder_model_path>`, `<decoder_model_path>`, `<joiner_model_path>`: Specify model path. + + + +## 6. Android Demo + +#### 6.1 Compile and Build + +*Usage:* + +```sh +# go back to the rknn_model_zoo root directory +cd ../../ +export ANDROID_NDK_PATH=<android_ndk_path> + +./build-android.sh -t <TARGET_PLATFORM> -a <ARCH> -d zipformer + +# such as +./build-android.sh -t rk3588 -a arm64-v8a -d zipformer +``` + +*Description:* +- `<android_ndk_path>`: Specify Android NDK path. +- `<TARGET_PLATFORM>`: Specify NPU platform name. Supported platforms are listed [here](#2-current-support-platform). +- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command: + ```shell + # Query architecture. For Android, ['arm64-v8a' or 'armeabi-v7a'] should be shown in the log. + adb shell cat /proc/version + ``` + +#### 6.2 Push demo files to device + +With the device connected via USB, push the demo files to the device: + +```shell +adb root +adb remount +adb push install/<TARGET_PLATFORM>_android_<ARCH>/rknn_zipformer_demo/ /data/ +``` + +#### 6.3 Run demo + +```sh +adb shell +cd /data/rknn_zipformer_demo + +export LD_LIBRARY_PATH=./lib +./rknn_zipformer_demo model/encoder-epoch-99-avg-1.rknn model/decoder-epoch-99-avg-1.rknn model/joiner-epoch-99-avg-1.rknn model/test.wav +``` + + + +## 7. Linux Demo + +#### 7.1 Compile and Build + +*Usage:* + +```shell +# go back to the rknn_model_zoo root directory +cd ../../ + +# if GCC_COMPILER not found while building, please set GCC_COMPILER path +(optional)export GCC_COMPILER=<GCC_COMPILER_PATH> + +./build-linux.sh -t <TARGET_PLATFORM> -a <ARCH> -d zipformer + +# such as +./build-linux.sh -t rk3588 -a aarch64 -d zipformer +``` + +*Description:* + +- `<GCC_COMPILER_PATH>`: Specify GCC compiler path. +- `<TARGET_PLATFORM>`: Specify NPU platform name. Supported platforms are listed [here](#2-current-support-platform).
+- `<ARCH>`: Specify device system architecture. To query device architecture, refer to the following command: + + ```shell + # Query architecture. For Linux, ['aarch64' or 'armhf'] should be shown in the log. + adb shell cat /proc/version + ``` + +#### 7.2 Push demo files to device + +- If the device is connected via USB, push the demo files to the device: + +```shell +adb push install/<TARGET_PLATFORM>_linux_<ARCH>/rknn_zipformer_demo/ /data/ +``` + +- For other boards, use `scp` or other approaches to push all files under `install/<TARGET_PLATFORM>_linux_<ARCH>/rknn_zipformer_demo/` to `/data`. + +#### 7.3 Run demo + +```sh +adb shell +cd /data/rknn_zipformer_demo + +export LD_LIBRARY_PATH=./lib +./rknn_zipformer_demo model/encoder-epoch-99-avg-1.rknn model/decoder-epoch-99-avg-1.rknn model/joiner-epoch-99-avg-1.rknn model/test.wav +``` + + +## 8. Expected Results + +This example will print the recognized text, as follows: +``` +Zipformer output: 对我做了介绍那么我想说的是大家如果对我的研究感兴趣呢 +``` + +- Note: Different platforms and different versions of tools and drivers may produce slightly different results. \ No newline at end of file diff --git a/examples/zipformer/cpp/CMakeLists.txt b/examples/zipformer/cpp/CMakeLists.txt new file mode 100644 index 00000000..a981476c --- /dev/null +++ b/examples/zipformer/cpp/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.10) + +project(rknn_zipformer_demo) + +if (ENABLE_ASAN) + message(STATUS "BUILD WITH ADDRESS SANITIZER") + set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") + set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address") +endif () + +set(rknpu_zipformer_file rknpu2/zipformer.cc) +if(TARGET_SOC STREQUAL "rk1808" OR TARGET_SOC STREQUAL "rv1109" OR TARGET_SOC STREQUAL "rv1126") + set(rknpu_zipformer_file rknpu1/zipformer.cc) +endif() + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/ 3rdparty.out) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../utils/ utils.out) + +set(CMAKE_INSTALL_RPATH "$ORIGIN/lib") + +file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) + +add_executable(${PROJECT_NAME} + main.cc + process.cc + ${rknpu_zipformer_file} +) + +target_link_libraries(${PROJECT_NAME} + fileutils + audioutils + ${LIBRKNNRT} + ${LIBKALDI_NATIVE_FBANK} +) + +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + target_link_libraries(${PROJECT_NAME} + log +) +endif() + +if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(${PROJECT_NAME} Threads::Threads) +endif() + +target_include_directories(${PROJECT_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${LIBRKNNRT_INCLUDES} + ${LIBKALDI_NATIVE_FBANK_INCLUDES} + ${LIBTIMER_INCLUDES} +) + +install(TARGETS ${PROJECT_NAME} DESTINATION .) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/test.wav DESTINATION ./model) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../model/vocab.txt DESTINATION ./model) +file(GLOB RKNN_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../model/*.rknn") +install(FILES ${RKNN_FILES} DESTINATION model) \ No newline at end of file diff --git a/examples/zipformer/cpp/main.cc b/examples/zipformer/cpp/main.cc new file mode 100644 index 00000000..fee765be --- /dev/null +++ b/examples/zipformer/cpp/main.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include +#include +#include +#include +#include "zipformer.h" +#include "audio_utils.h" +#include +#include +#include +#include "process.h" +#include + +/*------------------------------------------- + Main Function +-------------------------------------------*/ + +int main(int argc, char **argv) +{ + if (argc != 5) + { + printf("%s \n", argv[0]); + return -1; + } + + const char *encoder_path = argv[1]; + const char *decoder_path = argv[2]; + const char *joiner_path = argv[3]; + const char *audio_path = argv[4]; + + int ret; + TIMER timer; + float infer_time = 0.0; + float audio_length = 0.0; + float rtf = 0.0; + int frame_shift_ms = 10; + int subsampling_factor = 4; + float frame_shift_s = frame_shift_ms / 1000.0 * subsampling_factor; + std::vector recognized_text; + std::vector timestamp; + rknn_zipformer_context_t rknn_app_ctx; + VocabEntry vocab[VOCAB_NUM]; + audio_buffer_t audio; + memset(&rknn_app_ctx, 0, sizeof(rknn_zipformer_context_t)); + memset(vocab, 0, sizeof(vocab)); + memset(&audio, 0, sizeof(audio_buffer_t)); + + timer.tik(); + ret = read_audio(audio_path, &audio); + if (ret != 0) + { + printf("read audio fail! ret=%d audio_path=%s\n", ret, audio_path); + goto out; + } + + if (audio.num_channels == 2) + { + ret = convert_channels(&audio); + if (ret != 0) + { + printf("convert channels fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + if (audio.sample_rate != SAMPLE_RATE) + { + ret = resample_audio(&audio, audio.sample_rate, SAMPLE_RATE); + if (ret != 0) + { + printf("resample audio fail! ret=%d\n", ret, audio_path); + goto out; + } + } + + ret = read_vocab(VOCAB_PATH, vocab); + if (ret != 0) + { + printf("read vocab fail! ret=%d vocab_path=%s\n", ret, VOCAB_PATH); + goto out; + } + timer.tok(); + timer.print_time("read_audio & convert_channels & resample_audio & read_vocab"); + + timer.tik(); + ret = init_zipformer_model(encoder_path, &rknn_app_ctx.encoder_context); + if (ret != 0) + { + printf("init_zipformer_model fail! ret=%d encoder_path=%s\n", ret, encoder_path); + goto out; + } + build_input_output(&rknn_app_ctx.encoder_context); + timer.tok(); + timer.print_time("init_zipformer_encoder_model"); + + timer.tik(); + ret = init_zipformer_model(decoder_path, &rknn_app_ctx.decoder_context); + if (ret != 0) + { + printf("init_zipformer_model fail! ret=%d decoder_path=%s\n", ret, decoder_path); + goto out; + } + build_input_output(&rknn_app_ctx.decoder_context); + timer.tok(); + timer.print_time("init_zipformer_decoder_model"); + + timer.tik(); + ret = init_zipformer_model(joiner_path, &rknn_app_ctx.joiner_context); + if (ret != 0) + { + printf("init_zipformer_model fail! 
ret=%d oiner_path=%s\n", ret, joiner_path); + goto out; + } + build_input_output(&rknn_app_ctx.joiner_context); + timer.tok(); + timer.print_time("init_zipformer_joiner_model"); + + timer.tik(); + ret = inference_zipformer_model(&rknn_app_ctx, audio, vocab, recognized_text, timestamp, audio_length); + if (ret != 0) + { + printf("inference_zipformer_model fail! ret=%d\n", ret); + goto out; + } + timer.tok(); + timer.print_time("inference_zipformer_model"); + + infer_time = timer.get_time() / 1000.0; // sec + rtf = infer_time / audio_length; + printf("\nReal Time Factor (RTF): %.3f / %.3f = %.3f\n", infer_time, audio_length, rtf); + + // print result + std::cout << "\nTimestamp (s): "; + std::cout << std::fixed << std::setprecision(2); + for (size_t i = 0; i < timestamp.size(); ++i) + { + std::cout << timestamp[i] * frame_shift_s; + if (i < timestamp.size() - 1) + { + std::cout << ", "; + } + } + std::cout << std::endl; + + std::cout << "\nZipformer output: "; + for (const auto &str : recognized_text) + { + std::cout << str; + } + std::cout << std::endl; + +out: + + if (audio.data) + { + free(audio.data); + } + + for (int i = 0; i < VOCAB_NUM; i++) + { + if (vocab[i].token) + { + free(vocab[i].token); + vocab[i].token = NULL; + } + } + + ret = release_zipformer_model(&rknn_app_ctx.encoder_context); + if (ret != 0) + { + printf("release_zipformer_model encoder_context fail! ret=%d\n", ret); + } + + ret = release_zipformer_model(&rknn_app_ctx.decoder_context); + if (ret != 0) + { + printf("release_zipformer_model decoder_context fail! ret=%d\n", ret); + } + + ret = release_zipformer_model(&rknn_app_ctx.joiner_context); + if (ret != 0) + { + printf("release_zipformer_model joiner_context fail! ret=%d\n", ret); + } + + return 0; +} diff --git a/examples/zipformer/cpp/process.cc b/examples/zipformer/cpp/process.cc new file mode 100644 index 00000000..cc917707 --- /dev/null +++ b/examples/zipformer/cpp/process.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
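+ +// Helper routines shared by the zipformer demo: NCHW to NHWC layout conversion, fetching fbank +// frames from kaldi-native-fbank, argmax over the joiner output logits, in-place substring +// replacement, and loading the token vocabulary from vocab.txt.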
+ +#include "zipformer.h" +#include +#include +#include +#include +#include + +void convert_nchw_to_nhwc(float *src, float *dst, int N, int channels, int height, int width) +{ + for (int n = 0; n < N; ++n) + { + for (int c = 0; c < channels; ++c) + { + for (int h = 0; h < height; ++h) + { + for (int w = 0; w < width; ++w) + { + dst[n * height * width * channels + h * width * channels + w * channels + c] = src[n * channels * height * width + c * height * width + h * width + w]; + } + } + } + } +} + +int get_kbank_frames(knf::OnlineFbank *fbank, int frame_index, int segment, float *frames) +{ + if (frame_index + segment > fbank->NumFramesReady()) + { + return -1; + } + + for (int i = 0; i < segment; ++i) + { + const float *frame = fbank->GetFrame(i + frame_index); + memcpy(frames + i * N_MELS, frame, N_MELS * sizeof(float)); + } + + return 0; +} + +int argmax(float *array) +{ + int start_index = 0; + int max_index = start_index; + float max_value = array[max_index]; + for (int i = start_index + 1; i < start_index + JOINER_OUTPUT_SIZE; i++) + { + if (array[i] > max_value) + { + max_value = array[i]; + max_index = i; + } + } + int relative_index = max_index - start_index; + return relative_index; +} + +void replace_substr(std::string &str, const std::string &from, const std::string &to) +{ + if (from.empty()) + return; // Prevent infinite loop if 'from' is empty + size_t start_pos = 0; + while ((start_pos = str.find(from, start_pos)) != std::string::npos) + { + str.replace(start_pos, from.length(), to); + start_pos += to.length(); // Advance position by length of the replacement + } +} + +int read_vocab(const char *fileName, VocabEntry *vocab) +{ + FILE *fp; + char line[512]; + + fp = fopen(fileName, "r"); + if (fp == NULL) + { + perror("Error opening file"); + return -1; + } + + int count = 0; + while (fgets(line, sizeof(line), fp)) + { + vocab[count].index = atoi(strchr(line, ' ') + 1); // get token before the first space + char *token = strtok(line, " "); + vocab[count].token = strdup(token); // Get index after the first space + + count++; + } + + fclose(fp); + + return 0; +} \ No newline at end of file diff --git a/examples/zipformer/cpp/process.h b/examples/zipformer/cpp/process.h new file mode 100644 index 00000000..cae11d5f --- /dev/null +++ b/examples/zipformer/cpp/process.h @@ -0,0 +1,36 @@ +#ifndef _RKNN_ZIPFORMER_DEMO_PROCESS_H_ +#define _RKNN_ZIPFORMER_DEMO_PROCESS_H_ + +// #define TIMING_DISABLED // if you don't need to print the time used, uncomment this line of code + +#include "rknn_api.h" +#include "easy_timer.h" +#include "kaldi-native-fbank/csrc/online-feature.h" + +#define VOCAB_NUM 6257 +#define SAMPLE_RATE 16000 +#define N_MELS 80 +#define N_SEGMENT 103 +#define ENCODER_OUTPUT_T 24 +#define DECODER_DIM 512 +#define ENCODER_INPUT_SIZE N_MELS *N_SEGMENT +#define ENCODER_OUTPUT_SIZE ENCODER_OUTPUT_T *DECODER_DIM +#define JOINER_OUTPUT_SIZE 6254 +#define N_OFFSET 96 +#define CONTEXT_SIZE 2 + +#define VOCAB_PATH "./model/vocab.txt" + +typedef struct +{ + int index; + char *token; +} VocabEntry; + +int get_kbank_frames(knf::OnlineFbank *fbank, int frame_index, int segment, float *frames); +void convert_nchw_to_nhwc(float *src, float *dst, int N, int channels, int height, int width); +int argmax(float *array); +void replace_substr(std::string &str, const std::string &from, const std::string &to); +int read_vocab(const char *fileName, VocabEntry *vocab); + +#endif //_RKNN_ZIPFORMER_DEMO_PROCESS_H_ diff --git a/examples/zipformer/cpp/rknpu2/zipformer.cc 
b/examples/zipformer/cpp/rknpu2/zipformer.cc new file mode 100644 index 00000000..0502dfcc --- /dev/null +++ b/examples/zipformer/cpp/rknpu2/zipformer.cc @@ -0,0 +1,464 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "zipformer.h" +#include "process.h" + +static void dump_tensor_attr(rknn_tensor_attr *attr) +{ + char dims_str[100]; + char temp_str[100]; + memset(dims_str, 0, sizeof(dims_str)); + for (int i = 0; i < attr->n_dims; i++) + { + strcpy(temp_str, dims_str); + if (i == attr->n_dims - 1) + { + sprintf(dims_str, "%s%d", temp_str, attr->dims[i]); + } + else + { + sprintf(dims_str, "%s%d, ", temp_str, attr->dims[i]); + } + } + + printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, zp=%d, scale=%f\n", + attr->index, attr->name, attr->n_dims, dims_str, attr->n_elems, attr->size, get_format_string(attr->fmt), + get_type_string(attr->type), get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); +} + +void build_input_output(rknn_app_context_t *app_ctx) +{ + app_ctx->inputs = (rknn_input *)malloc(app_ctx->io_num.n_input * sizeof(rknn_input)); + memset(app_ctx->inputs, 0, app_ctx->io_num.n_input * sizeof(rknn_input)); + + for (int i = 0; i < app_ctx->io_num.n_input; i++) + { + app_ctx->inputs[i].index = i; + + if (app_ctx->input_attrs[i].type == RKNN_TENSOR_FLOAT16) + { + app_ctx->inputs[i].size = app_ctx->input_attrs[i].n_elems * sizeof(float); + app_ctx->inputs[i].type = RKNN_TENSOR_FLOAT32; + app_ctx->inputs[i].fmt = app_ctx->input_attrs[i].fmt; + app_ctx->inputs[i].buf = (float *)malloc(app_ctx->inputs[i].size); + memset(app_ctx->inputs[i].buf, 0, app_ctx->inputs[i].size); + } + else if (app_ctx->input_attrs[i].type == RKNN_TENSOR_INT64) + { + app_ctx->inputs[i].size = app_ctx->input_attrs[i].n_elems * sizeof(int64_t); + app_ctx->inputs[i].type = RKNN_TENSOR_INT64; + app_ctx->inputs[i].fmt = app_ctx->input_attrs[i].fmt; + app_ctx->inputs[i].buf = (int64_t *)malloc(app_ctx->inputs[i].size); + memset(app_ctx->inputs[i].buf, 0, app_ctx->inputs[i].size); + } + } + + app_ctx->outputs = (rknn_output *)malloc(app_ctx->io_num.n_output * sizeof(rknn_output)); + memset(app_ctx->outputs, 0, app_ctx->io_num.n_output * sizeof(rknn_output)); + + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + app_ctx->outputs[i].index = i; + + if (app_ctx->output_attrs[i].type == RKNN_TENSOR_FLOAT16) + { + app_ctx->outputs[i].size = app_ctx->output_attrs[i].n_elems * sizeof(float); + app_ctx->outputs[i].is_prealloc = true; + app_ctx->outputs[i].want_float = 1; + app_ctx->outputs[i].buf = (float *)malloc(app_ctx->outputs[i].size); + memset(app_ctx->outputs[i].buf, 0, app_ctx->outputs[i].size); + } + else if (app_ctx->output_attrs[i].type == RKNN_TENSOR_INT64) + { + app_ctx->outputs[i].size = app_ctx->output_attrs[i].n_elems * sizeof(int64_t); + app_ctx->outputs[i].is_prealloc = true; + app_ctx->outputs[i].want_float = 
0; + app_ctx->outputs[i].buf = (int64_t *)malloc(app_ctx->outputs[i].size); + memset(app_ctx->outputs[i].buf, 0, app_ctx->outputs[i].size); + } + } +} + +int init_zipformer_model(const char *model_path, rknn_app_context_t *app_ctx) +{ + int ret; + int model_len = 0; + char *model; + rknn_context ctx = 0; + + // Load RKNN Model + ret = rknn_init(&ctx, (void *)model_path, model_len, 0, NULL); + if (ret < 0) + { + printf("rknn_init fail! ret=%d\n", ret); + return -1; + } + + // Get Model Input Output Number + rknn_input_output_num io_num; + ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); + + // Get Model Input Info + printf("input tensors:\n"); + rknn_tensor_attr input_attrs[io_num.n_input]; + memset(input_attrs, 0, sizeof(input_attrs)); + for (int i = 0; i < io_num.n_input; i++) + { + input_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(input_attrs[i])); + } + + // Get Model Output Info + printf("output tensors:\n"); + rknn_tensor_attr output_attrs[io_num.n_output]; + memset(output_attrs, 0, sizeof(output_attrs)); + for (int i = 0; i < io_num.n_output; i++) + { + output_attrs[i].index = i; + ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); + if (ret != RKNN_SUCC) + { + printf("rknn_query fail! ret=%d\n", ret); + return -1; + } + dump_tensor_attr(&(output_attrs[i])); + } + + // Set to context + app_ctx->rknn_ctx = ctx; + app_ctx->io_num = io_num; + app_ctx->input_attrs = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->input_attrs, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); + app_ctx->output_attrs = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); + memcpy(app_ctx->output_attrs, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); + + return 0; +} + +static void release_input_output(rknn_app_context_t *app_ctx) +{ + for (int i = 0; i < app_ctx->io_num.n_input; i++) + { + if (app_ctx->inputs[i].buf != NULL) + { + free(app_ctx->inputs[i].buf); + app_ctx->inputs[i].buf = NULL; + } + } + + for (int i = 0; i < app_ctx->io_num.n_output; i++) + { + if (app_ctx->outputs[i].buf != NULL) + { + free(app_ctx->outputs[i].buf); + app_ctx->outputs[i].buf = NULL; + } + } + + if (app_ctx->inputs != NULL) + { + free(app_ctx->inputs); + app_ctx->inputs = NULL; + } + + if (app_ctx->outputs != NULL) + { + free(app_ctx->outputs); + app_ctx->outputs = NULL; + } +} + +int release_zipformer_model(rknn_app_context_t *app_ctx) +{ + if (app_ctx->input_attrs != NULL) + { + free(app_ctx->input_attrs); + app_ctx->input_attrs = NULL; + } + + if (app_ctx->output_attrs != NULL) + { + free(app_ctx->output_attrs); + app_ctx->output_attrs = NULL; + } + + if (app_ctx->rknn_ctx != 0) + { + rknn_destroy(app_ctx->rknn_ctx); + app_ctx->rknn_ctx = 0; + } + + if (app_ctx != NULL) + { + release_input_output(app_ctx); + app_ctx = NULL; + } + + return 0; +} + +static int inference_encoder_model(rknn_app_context_t *app_ctx) +{ + int ret = 0; + + // Set Input Data + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, app_ctx->inputs); + if (ret < 0) + { + printf("rknn_input_set fail! 
ret=%d\n", ret); + goto out; + } + + // Run + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, app_ctx->outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + + for (int i = 1; i < app_ctx->io_num.n_input; i++) + { + if (app_ctx->input_attrs[i].fmt == RKNN_TENSOR_NHWC) + { + int N = app_ctx->input_attrs[i].dims[0]; + int H = app_ctx->input_attrs[i].dims[1]; + int W = app_ctx->input_attrs[i].dims[2]; + int C = app_ctx->input_attrs[i].dims[3]; + convert_nchw_to_nhwc((float *)app_ctx->outputs[i].buf, (float *)app_ctx->inputs[i].buf, N, C, H, W); + } + else + { + memcpy(app_ctx->inputs[i].buf, app_ctx->outputs[i].buf, app_ctx->inputs[i].size); + } + } + +out: + + return ret; +} + +static int inference_decoder_model(rknn_app_context_t *app_ctx) +{ + int ret = 0; + + // Set Input Data + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, app_ctx->inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + goto out; + } + + // Run + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, app_ctx->outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + +out: + + return ret; +} + +static int inference_joiner_model(rknn_app_context_t *app_ctx, float *cur_encoder_output, float *decoder_output) +{ + int ret = 0; + + // Set Input Data + memcpy(app_ctx->inputs[0].buf, cur_encoder_output, app_ctx->input_attrs[0].n_elems * sizeof(float)); + memcpy(app_ctx->inputs[1].buf, decoder_output, app_ctx->input_attrs[1].n_elems * sizeof(float)); + ret = rknn_inputs_set(app_ctx->rknn_ctx, app_ctx->io_num.n_input, app_ctx->inputs); + if (ret < 0) + { + printf("rknn_input_set fail! ret=%d\n", ret); + goto out; + } + + // Run + ret = rknn_run(app_ctx->rknn_ctx, nullptr); + if (ret < 0) + { + printf("rknn_run fail! ret=%d\n", ret); + goto out; + } + + // Get Output + ret = rknn_outputs_get(app_ctx->rknn_ctx, app_ctx->io_num.n_output, app_ctx->outputs, NULL); + if (ret < 0) + { + printf("rknn_outputs_get fail! ret=%d\n", ret); + goto out; + } + +out: + + return ret; +} + +static int greedy_search(rknn_zipformer_context_t *app_ctx, float *encoder_input, float *encoder_output, float *decoder_output, int64_t *hyp, + float *joiner_output, VocabEntry *vocab, std::vector &recognized_text, std::vector &timestamp, int num_processed_frames, int &frame_offset) +{ + int ret = 0; + + ret = inference_encoder_model(&app_ctx->encoder_context); + if (ret < 0) + { + printf("inference_encoder_model fail! ret=%d\n", ret); + return ret; + } + + if (num_processed_frames == 0) + { + ret = inference_decoder_model(&app_ctx->decoder_context); + if (ret < 0) + { + printf("inference_decoder_model fail! ret=%d\n", ret); + return ret; + } + } + + for (int i = 0; i < ENCODER_OUTPUT_T; i++) + { + float *cur_encoder_output = encoder_output + i * DECODER_DIM; + ret = inference_joiner_model(&app_ctx->joiner_context, cur_encoder_output, decoder_output); + if (ret < 0) + { + printf("inference_joiner_model fail!
 ret=%d\n", ret); + return ret; + } + + int next_token = argmax(joiner_output); + if (next_token != BLANK_ID && next_token != UNK_ID) + { + timestamp.push_back(frame_offset + i); + + for (int j = 0; j < CONTEXT_SIZE - 1; j++) + { + hyp[j] = hyp[j + 1]; + } + + hyp[CONTEXT_SIZE - 1] = (int64_t)next_token; + std::string next_token_str = vocab[next_token].token; + replace_substr(next_token_str, "▁", " "); + recognized_text.push_back(next_token_str); + ret = inference_decoder_model(&app_ctx->decoder_context); + if (ret < 0) + { + printf("inference_decoder_model fail! ret=%d\n", ret); + return ret; + } + } + } + + frame_offset += ENCODER_OUTPUT_T; + + return ret; +} + +int inference_zipformer_model(rknn_zipformer_context_t *app_ctx, audio_buffer_t audio, VocabEntry *vocab, std::vector &recognized_text, + std::vector &timestamp, float &audio_length) +{ + int ret; + recognized_text.clear(); + timestamp.clear(); + + float *encoder_input = (float *)app_ctx->encoder_context.inputs[0].buf; + float *encoder_output = (float *)app_ctx->encoder_context.outputs[0].buf; + int64_t *hyp = (int64_t *)app_ctx->decoder_context.inputs[0].buf; + float *decoder_output = (float *)app_ctx->decoder_context.outputs[0].buf; + float *joiner_output = (float *)app_ctx->joiner_context.outputs[0].buf; + + knf::FbankOptions fbank_opts; + fbank_opts.frame_opts.samp_freq = 16000; + fbank_opts.mel_opts.num_bins = 80; + fbank_opts.mel_opts.high_freq = -400; + fbank_opts.frame_opts.dither = 0; + fbank_opts.frame_opts.snip_edges = false; + knf::OnlineFbank fbank(fbank_opts); + + int num_frames = 0; + int num_processed_frames = 0; + int offset = N_OFFSET; + int segment = N_SEGMENT; + float tail_pad_length = 0.0; // sec + fbank.AcceptWaveform(SAMPLE_RATE, audio.data, audio.num_frames); + num_frames = fbank.NumFramesReady(); + int frame_offset = 0; + + while ((num_frames - num_processed_frames) > 0) + { + if ((num_frames - num_processed_frames) < segment) + { + tail_pad_length = (segment - (num_frames - num_processed_frames)) / 100.0f; // sec + std::vector tail_paddings(int(tail_pad_length * SAMPLE_RATE)); + fbank.AcceptWaveform(SAMPLE_RATE, tail_paddings.data(), tail_paddings.size()); + fbank.InputFinished(); + } + ret = get_kbank_frames(&fbank, num_processed_frames, segment, encoder_input); + if (ret < 0) + { + break; + } + + ret = greedy_search(app_ctx, encoder_input, encoder_output, decoder_output, hyp, joiner_output, vocab, recognized_text, timestamp, num_processed_frames, frame_offset); + if (ret < 0) + { + printf("greedy_search fail! ret=%d\n", ret); + goto out; + } + num_processed_frames += offset; + } + + audio_length = (float)audio.num_frames / audio.sample_rate + tail_pad_length; + +out: + + return ret; +} \ No newline at end of file diff --git a/examples/zipformer/cpp/zipformer.h b/examples/zipformer/cpp/zipformer.h new file mode 100644 index 00000000..1365aecd --- /dev/null +++ b/examples/zipformer/cpp/zipformer.h @@ -0,0 +1,51 @@ +// Copyright (c) 2024 by Rockchip Electronics Co., Ltd. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _RKNN_DEMO_ZIPFORMER_H_ +#define _RKNN_DEMO_ZIPFORMER_H_ + +#include "rknn_api.h" +#include "audio_utils.h" +#include +#include +#include +#include "process.h" + +#define BLANK_ID 0 +#define UNK_ID 2 + +typedef struct +{ + rknn_context rknn_ctx; + rknn_input_output_num io_num; + rknn_tensor_attr *input_attrs; + rknn_tensor_attr *output_attrs; + rknn_input *inputs; + rknn_output *outputs; +} rknn_app_context_t; + +typedef struct +{ + rknn_app_context_t encoder_context; + rknn_app_context_t decoder_context; + rknn_app_context_t joiner_context; +} rknn_zipformer_context_t; + +int init_zipformer_model(const char *model_path, rknn_app_context_t *app_ctx); +int inference_zipformer_model(rknn_zipformer_context_t *app_ctx, audio_buffer_t audio, VocabEntry *vocab, std::vector &recognized_text, + std::vector ×tamp, float &audio_length); +int release_zipformer_model(rknn_app_context_t *app_ctx); +void build_input_output(rknn_app_context_t *app_ctx); + +#endif //_RKNN_DEMO_ZIPFORMER_H_ \ No newline at end of file diff --git a/examples/zipformer/model/download_model.sh b/examples/zipformer/model/download_model.sh new file mode 100644 index 00000000..8845eae3 --- /dev/null +++ b/examples/zipformer/model/download_model.sh @@ -0,0 +1,3 @@ +wget -O encoder-epoch-99-avg-1.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/encoder-epoch-99-avg-1.onnx +wget -O decoder-epoch-99-avg-1.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/decoder-epoch-99-avg-1.onnx +wget -O joiner-epoch-99-avg-1.onnx https://ftrg.zbox.filez.com/v2/delivery/data/95f00b0fc900458ba134f8b180b3f7a1/examples/zipformer/joiner-epoch-99-avg-1.onnx \ No newline at end of file diff --git a/examples/zipformer/model/test.wav b/examples/zipformer/model/test.wav new file mode 100644 index 00000000..d4407c1a Binary files /dev/null and b/examples/zipformer/model/test.wav differ diff --git a/examples/zipformer/model/vocab.txt b/examples/zipformer/model/vocab.txt new file mode 100644 index 00000000..c2cc3d6e --- /dev/null +++ b/examples/zipformer/model/vocab.txt @@ -0,0 +1,6257 @@ + 0 + 1 + 2 +▁AS 3 +▁ONE 4 +▁OF 5 +▁A 6 +▁COMP 7 +AN 8 +Y 9 +▁SOME 10 +▁HUNDRED 11 +▁AND 12 +▁FI 13 +F 14 +T 15 +▁ME 16 +N 17 +▁WO 18 +M 19 +EN 20 +▁NOT 21 +▁LA 22 +BO 23 +UR 24 +ING 25 +▁UNDER 26 +▁ANY 27 +▁SU 28 +S 29 +P 30 +IC 31 +ION 32 +▁ 33 +LU 34 +AC 35 +▁THAT 36 +▁THE 37 +▁CO 38 +▁CHA 39 +CE 40 +RY 41 +▁THOUGH 42 +▁SH 43 +IN 44 +B 45 +J 46 +E 47 +C 48 +▁MUCH 49 +▁PO 50 +UL 51 +AR 52 +▁PRE 53 +U 54 +DI 55 +▁AT 56 +▁WHICH 57 +▁POINT 58 +▁I 59 +▁JU 60 +D 61 +GE 62 +▁E 63 +▁HAD 64 +▁CA 65 +▁IN 66 +▁MY 67 +▁DI 68 +RE 69 +CTION 70 +▁WAS 71 +L 72 +MO 73 +ATE 74 +▁THERE 75 +▁BEEN 76 +▁HE 77 +IT 78 +TED 79 +▁T 80 +RI 81 +VI 82 +AL 83 +▁B 84 +LE 85 +ISH 86 +▁OR 87 +▁SO 88 +▁IT 89 +▁RA 90 +TE 91 +▁PRO 92 +G 93 +▁BUT 94 +▁THIS 95 +▁EX 96 +A 97 +ER 98 +ATED 99 +▁EN 100 +TI 101 +LY 102 +OW 103 +▁TO 104 +▁PA 105 +R 106 +IM 107 +ON 108 +▁PUBLIC 109 +▁GU 110 +IL 111 +▁APP 112 +ED 113 +▁UN 114 +ENT 115 +▁MOST 116 +▁DE 117 +TER 118 +▁MAN 119 +▁ON 120 +▁BY 121 +▁NO 122 +▁MEAN 123 +▁NUMBER 124 +O 125 +▁BELIEVE 126 +▁RI 127 +CH 128 +▁SECOND 129 +▁OTHER 130 +▁K 131 +▁WILL 132 +▁DO 133 +▁WELL 134 +▁SEE 135 +UND 136 +▁JO 137 +KE 138 +▁BE 139 +▁BO 140 +OK 141 +▁COULD 142 +▁THEM 143 +▁THESE 144 +▁P 145 +AGE 146 +▁SHA 147 +ME 148 +IOUS 149 +▁IS 150 +▁ONLY 151 +▁WORD 152 +▁RE 
153 +K 154 +I 155 +▁SHOULD 156 +▁HAVE 157 +OR 158 +ITY 159 +▁WHAT 160 +LL 161 +▁SP 162 +TA 163 +NE 164 +OUS 165 +▁COM 166 +US 167 +TION 168 +▁HAS 169 +▁S 170 +TH 171 +▁MI 172 +RO 173 +▁GOOD 174 +▁FRIEND 175 +▁LE 176 +W 177 +ES 178 +▁WITH 179 +▁WHO 180 +▁THINK 181 +▁MU 182 +IG 183 +▁MO 184 +▁MIGHT 185 +▁AN 186 +QU 187 +ATION 188 +▁FROM 189 +▁NA 190 +TURE 191 +▁ALL 192 +IES 193 +▁LET 194 +▁TIME 195 +▁WHEN 196 +▁EVEN 197 +▁NEED 198 +▁O 199 +VE 200 +▁WI 201 +FUL 202 +▁NE 203 +LI 204 +AD 205 +ERS 206 +▁BEFORE 207 +▁WORK 208 +▁LIKE 209 +▁HAND 210 +▁THEN 211 +SH 212 +▁WERE 213 +OM 214 +▁KNOW 215 +EL 216 +MA 217 +▁OVER 218 +▁LO 219 +TING 220 +▁LI 221 +OL 222 +▁HA 223 +MP 224 +LA 225 +ABLE 226 +MB 227 +▁WE 228 +▁ST 229 +ET 230 +▁IF 231 +▁WA 232 +▁NEW 233 +IR 234 +▁FA 235 +▁G 236 +IAN 237 +ND 238 +VER 239 +IS 240 +▁DIS 241 +CO 242 +UN 243 +▁SEVEN 244 +▁TH 245 +▁AFTER 246 +WARD 247 +▁RO 248 +▁WOULD 249 +▁SA 250 +▁FOR 251 +▁FE 252 +▁LONG 253 +▁STILL 254 +▁CON 255 +X 256 +▁HERE 257 +▁EVERYTHING 258 +▁SE 259 +ANT 260 +ALLY 261 +RU 262 +▁THAN 263 +ANCE 264 +▁ARE 265 +▁NEXT 266 +▁DOWN 267 +▁CHI 268 +▁MA 269 +CK 270 +▁DR 271 +Z 272 +▁F 273 +ROW 274 +▁GO 275 +▁INTO 276 +INE 277 +▁SIX 278 +▁YEAR 279 +▁HIS 280 +CI 281 +ENCE 282 +▁CLOSE 283 +▁LAST 284 +▁C 285 +▁MORE 286 +▁THOUSAND 287 +LO 288 +▁UP 289 +▁WANT 290 +▁JA 291 +▁GA 292 +▁THROUGH 293 +▁PLACE 294 +▁HU 295 +MAN 296 +UGH 297 +IST 298 +▁HO 299 +▁BETTER 300 +▁THEIR 301 +▁VERY 302 +NG 303 +▁ANOTHER 304 +▁UM 305 +AS 306 +▁TE 307 +V 308 +▁HER 309 +▁HIGH 310 +QUI 311 +▁BECAUSE 312 +▁SHOW 313 +▁WHERE 314 +▁DAY 315 +▁BRO 316 +▁OLD 317 +MENT 318 +▁HARD 319 +▁THOSE 320 +CU 321 +AT 322 +▁INTEREST 323 +▁EVERY 324 +HE 325 +▁GIVE 326 +▁FACT 327 +▁FOUND 328 +▁HEAR 329 +▁NEVER 330 +▁CAN 331 +▁COME 332 +▁SORT 333 +ITION 334 +EST 335 +IGHT 336 +SHIP 337 +SIDE 338 +▁GREAT 339 +▁MAR 340 +▁GET 341 +▁OUT 342 +▁STA 343 +OP 344 +AIN 345 +▁PI 346 +RA 347 +ID 348 +▁DA 349 +▁LOOK 350 +END 351 +▁BA 352 +VO 353 +H 354 +▁LITTLE 355 +▁THEY 356 +▁ALWAYS 357 +CA 358 +▁HOUSE 359 +TIC 360 +IVE 361 +▁RU 362 +HO 363 +▁RIGHT 364 +ICAL 365 +HA 366 +▁AGAIN 367 +▁EXP 368 +IBLE 369 +▁CERTAIN 370 +▁SHE 371 +▁PRI 372 +▁PLAY 373 +▁VA 374 +LIC 375 +▁TWO 376 +▁THREE 377 +▁DOES 378 +▁YOU 379 +▁TALK 380 +▁ABOUT 381 +UT 382 +▁WITHOUT 383 +▁PEOPLE 384 +FF 385 +▁DON 386 +▁HAPPEN 387 +▁PERSON 388 +▁MADE 389 +▁PART 390 +▁HOW 391 +▁WHY 392 +▁MAKE 393 +▁HIM 394 +▁STATE 395 +▁GRA 396 +▁TRA 397 +FOR 398 +▁LIFE 399 +▁PER 400 +AM 401 +▁DU 402 +PORT 403 +IA 404 +▁BUSINESS 405 +▁UNDERSTAND 406 +▁PLAN 407 +▁KEEP 408 +▁GOT 409 +▁MONEY 410 +▁OWN 411 +▁MANY 412 +ONE 413 +▁BU 414 +▁REALLY 415 +▁COURSE 416 +▁THINGS 417 +▁SAID 418 +▁US 419 +▁WAY 420 +▁TAKE 421 +▁WORLD 422 +▁THING 423 +▁SAY 424 +▁TA 425 +▁SOMETHING 426 +▁LEARN 427 +DE 428 +▁QUESTION 429 +DAY 430 +▁NOW 431 +▁BACK 432 +▁YOUR 433 +ACH 434 +▁SPEAK 435 +▁TURN 436 +ATIVE 437 +▁OUR 438 +▁JE 439 +▁VI 440 +▁HI 441 +▁OP 442 +▁WEEK 443 +▁IDEA 444 +▁AWAY 445 +▁COUNT 446 +NESS 447 +▁REASON 448 +PH 449 +▁TWENTY 450 +▁QUITE 451 +▁CHANGE 452 +▁LOVE 453 +DUC 454 +▁SAME 455 +▁ENOUGH 456 +▁YES 457 +▁FEEL 458 +▁FIRST 459 +▁WHILE 460 +IZE 461 +▁ANYTHING 462 +▁DID 463 +▁KIND 464 +▁POWER 465 +▁JUST 466 +IF 467 +▁PH 468 +▁ANSWER 469 +▁FIND 470 +▁THANK 471 +▁BUILD 472 +▁GOING 473 +▁CAME 474 +▁TOGETHER 475 +▁IMPORTANT 476 +▁HELP 477 +▁FOUR 478 +▁DIFFERENT 479 +▁AROUND 480 +▁AMERICA 481 +▁ALSO 482 +▁NINE 483 +▁AH 484 +▁LOT 485 +▁BETWEEN 486 +▁START 487 +▁SCHOOL 488 +▁SYSTEM 489 +▁MAYBE 490 +▁ACTUALLY 491 +▁PROBLEM 492 +Q 493 +▁MR 494 +▁YEAH 495 +▁OKAY 496 +以 497 +后 498 +你 499 +是 
500 +男 501 +孩 502 +子 503 +曾 504 +丽 505 +婷 506 +妈 507 +很 508 +想 509 +兰 510 +州 511 +哪 512 +有 513 +买 514 +路 515 +虎 516 +汽 517 +车 518 +的 519 +我 520 +家 521 +狗 522 +叫 523 +什 524 +么 525 +名 526 +字 527 +现 528 +在 529 +网 530 +络 531 +怎 532 +样 533 +鞍 534 +山 535 +到 536 +郑 537 +大 538 +巴 539 +上 540 +没 541 +厕 542 +所 543 +英 544 +文 545 +给 546 +一 547 +个 548 +惊 549 +喜 550 +看 551 +日 552 +程 553 +表 554 +形 555 +容 556 +人 557 +强 558 +势 559 +面 560 +前 561 +不 562 +敢 563 +吭 564 +声 565 +词 566 +儿 567 +些 568 +办 569 +法 570 +说 571 +话 572 +唱 573 +首 574 +老 575 +与 576 +海 577 +婆 578 +笨 579 +蛋 580 +为 581 +欢 582 +女 583 +呢 584 +杭 585 +里 586 +可 587 +打 588 +炮 589 +朋 590 +友 591 +啊 592 +对 593 +练 594 +语 595 +听 596 +力 597 +帮 598 +助 599 +歌 600 +请 601 +推 602 +荐 603 +几 604 +谢 605 +明 606 +天 607 +早 608 +七 609 +点 610 +起 611 +床 612 +时 613 +候 614 +睡 615 +觉 616 +会 617 +了 618 +吗 619 +查 620 +本 621 +地 622 +气 623 +公 624 +快 625 +吧 626 +注 627 +意 628 +安 629 +全 630 +要 631 +错 632 +过 633 +四 634 +川 635 +再 636 +讲 637 +笑 638 +好 639 +午 640 +连 641 +烟 642 +台 643 +轮 644 +船 645 +史 646 +记 647 +商 648 +君 649 +列 650 +传 651 +孙 652 +鞅 653 +乃 654 +遂 655 +西 656 +入 657 +秦 658 +翻 659 +译 660 +做 661 +下 662 +提 663 +醒 664 +发 665 +送 666 +排 667 +班 668 +邮 669 +件 670 +刘 671 +俊 672 +峰 673 +电 674 +播 675 +放 676 +曲 677 +最 678 +炫 679 +民 680 +族 681 +风 682 +还 683 +年 684 +中 685 +泰 686 +拉 687 +石 688 +光 689 +剑 690 +任 691 +务 692 +玛 693 +雅 694 +预 695 +言 696 +真 697 +视 698 +频 699 +爱 700 +范 701 +冰 702 +出 703 +演 704 +剧 705 +六 706 +三 707 +十 708 +分 709 +退 710 +当 711 +模 712 +式 713 +附 714 +近 715 +洗 716 +浴 717 +方 718 +交 719 +和 720 +玩 721 +见 722 +工 723 +作 724 +干 725 +就 726 +能 727 +疖 728 +吃 729 +饭 730 +或 731 +者 732 +其 733 +他 734 +东 735 +国 736 +移 737 +动 738 +通 739 +信 740 +限 741 +司 742 +介 743 +绍 744 +讯 745 +录 746 +载 747 +装 748 +跟 749 +位 750 +置 751 +离 752 +莞 753 +长 754 +站 755 +多 756 +远 757 +福 758 +建 759 +高 760 +速 761 +服 762 +少 763 +把 764 +这 765 +定 766 +成 767 +彩 768 +铃 769 +手 770 +机 771 +别 772 +忘 773 +盒 774 +拿 775 +回 776 +校 777 +区 778 +万 779 +口 780 +坐 781 +冷 782 +漠 783 +醉 784 +红 785 +颜 786 +来 787 +猪 788 +张 789 +波 790 +炎 791 +亚 792 +纶 793 +媳 794 +妇 795 +马 796 +志 797 +华 798 +短 799 +清 800 +图 801 +片 802 +生 803 +五 804 +钟 805 +开 806 +启 807 +蓝 808 +牙 809 +锂 810 +池 811 +初 812 +次 813 +充 814 +让 815 +今 816 +号 817 +顷 818 +等 819 +于 820 +平 821 +也 822 +呀 823 +聊 824 +问 825 +主 826 +结 827 +婚 828 +恭 829 +情 830 +流 831 +返 832 +洒 833 +热 834 +泪 835 +诗 836 +那 837 +去 838 +眠 839 +药 840 +功 841 +使 842 +用 843 +象 844 +间 845 +估 846 +计 847 +厚 848 +德 849 +物 850 +思 851 +搞 852 +们 853 +只 854 +知 855 +道 856 +奥 857 +特 858 +曼 859 +越 860 +野 861 +比 862 +较 863 +重 864 +新 865 +陈 866 +奕 867 +迅 868 +泉 869 +湾 870 +票 871 +呵 872 +活 873 +经 874 +历 875 +摇 876 +痴 877 +此 878 +刻 879 +呈 880 +报 881 +脑 882 +总 883 +着 884 +她 885 +阿 886 +斯 887 +顿 888 +丁 889 +影 890 +终 891 +幻 892 +雨 893 +否 894 +带 895 +伞 896 +休 897 +息 898 +值 899 +百 900 +每 901 +月 902 +找 903 +扎 904 +屯 905 +线 906 +仙 907 +奇 908 +侠 909 +业 910 +写 911 +完 912 +姐 913 +稍 914 +烤 915 +鹅 916 +肠 917 +团 918 +狩 919 +猎 920 +美 921 +忍 922 +向 923 +冲 924 +常 925 +熟 926 +度 927 +折 928 +扣 929 +二 930 +乘 931 +教 932 +实 933 +认 934 +证 935 +考 936 +试 937 +答 938 +案 939 +费 940 +脸 941 +自 942 +恋 943 +广 944 +府 945 +待 946 +赶 947 +集 948 +告 949 +诉 950 +太 951 +懂 952 +坏 953 +泡 954 +妞 955 +伊 956 +更 957 +健 958 +康 959 +检 960 +乙 961 +肝 962 +厦 963 +门 964 +急 965 +控 966 +毒 967 +产 968 +头 969 +脚 970 +轻 971 +感 972 +空 973 +订 974 +从 975 +碑 976 +店 977 +北 978 +京 979 +火 980 +鸡 981 +白 982 +态 983 +啥 984 +幺 985 +哈 986 +尔 987 +滨 988 +至 989 +硬 990 +座 991 +换 992 +姿 993 +亲 994 +级 995 +紫 996 +秋 997 +叶 998 +刀 999 +爆 1000 +率 1001 +馨 1002 +予 1003 +帅 1004 +汇 1005 +询 1006 
+小 1007 +唉 1008 +性 1009 +挺 1010 +厉 1011 +害 1012 +八 1013 +米 1014 +跑 1015 +步 1016 +达 1017 +标 1018 +规 1019 +准 1020 +码 1021 +音 1022 +拨 1023 +器 1024 +油 1025 +价 1026 +卡 1027 +尿 1028 +防 1029 +杀 1030 +死 1031 +倍 1032 +晋 1033 +映 1034 +部 1035 +韩 1036 +己 1037 +右 1038 +键 1039 +编 1040 +辑 1041 +您 1042 +房 1043 +租 1044 +具 1045 +体 1046 +牌 1047 +瑞 1048 +士 1049 +浪 1050 +琴 1051 +简 1052 +永 1053 +拥 1054 +灿 1055 +烂 1056 +朝 1057 +阳 1058 +解 1059 +赢 1060 +事 1061 +滚 1062 +周 1063 +吕 1064 +忠 1065 +页 1066 +心 1067 +份 1068 +被 1069 +龙 1070 +肖 1071 +半 1072 +喊 1073 +改 1074 +堵 1075 +饿 1076 +瞌 1077 +叔 1078 +戏 1079 +兴 1080 +化 1081 +备 1082 +萍 1083 +乡 1084 +钓 1085 +鱼 1086 +岛 1087 +消 1088 +忻 1089 +襄 1090 +县 1091 +况 1092 +得 1093 +王 1094 +属 1095 +假 1096 +期 1097 +像 1098 +脾 1099 +零 1100 +九 1101 +关 1102 +晚 1103 +陪 1104 +危 1105 +苦 1106 +难 1107 +例 1108 +顺 1109 +序 1110 +盖 1111 +茨 1112 +邦 1113 +故 1114 +警 1115 +戒 1116 +搜 1117 +索 1118 +黄 1119 +照 1120 +底 1121 +识 1122 +武 1123 +汉 1124 +第 1125 +博 1126 +基 1127 +尼 1128 +删 1129 +除 1130 +沃 1131 +狐 1132 +场 1133 +卖 1134 +便 1135 +宜 1136 +营 1137 +厅 1138 +资 1139 +料 1140 +哥 1141 +句 1142 +员 1143 +随 1144 +导 1145 +航 1146 +学 1147 +星 1148 +脱 1149 +毛 1150 +膏 1151 +种 1152 +乐 1153 +贵 1154 +妃 1155 +酒 1156 +内 1157 +条 1158 +联 1159 +系 1160 +希 1161 +望 1162 +非 1163 +苹 1164 +果 1165 +银 1166 +登 1167 +户 1168 +密 1169 +师 1170 +沟 1171 +莫 1172 +才 1173 +喝 1174 +累 1175 +彭 1176 +裘 1177 +莹 1178 +珠 1179 +徐 1180 +管 1181 +爸 1182 +敏 1183 +腰 1184 +闪 1185 +舟 1186 +凯 1187 +肯 1188 +水 1189 +济 1190 +闻 1191 +约 1192 +伦 1193 +乱 1194 +舞 1195 +春 1196 +叮 1197 +咚 1198 +昨 1199 +又 1200 +宵 1201 +省 1202 +途 1203 +无 1204 +院 1205 +吉 1206 +农 1207 +股 1208 +胶 1209 +布 1210 +谁 1211 +免 1212 +疫 1213 +逗 1214 +闹 1215 +闭 1216 +青 1217 +景 1218 +花 1219 +园 1220 +富 1221 +桥 1222 +哭 1223 +节 1224 +树 1225 +茂 1226 +盛 1227 +边 1228 +余 1229 +姚 1230 +走 1231 +原 1232 +行 1233 +都 1234 +旅 1235 +馆 1236 +宾 1237 +根 1238 +修 1239 +理 1240 +厂 1241 +板 1242 +诊 1243 +专 1244 +复 1245 +鹰 1246 +队 1247 +桌 1248 +财 1249 +港 1250 +色 1251 +诚 1252 +勿 1253 +扰 1254 +持 1255 +孟 1256 +古 1257 +医 1258 +研 1259 +究 1260 +取 1261 +卫 1262 +目 1263 +铁 1264 +麻 1265 +将 1266 +浏 1267 +览 1268 +湿 1269 +朱 1270 +沱 1271 +合 1272 +江 1273 +何 1274 +祈 1275 +祷 1276 +义 1277 +酷 1278 +派 1279 +如 1280 +咩 1281 +正 1282 +算 1283 +楼 1284 +距 1285 +震 1286 +借 1287 +政 1288 +策 1289 +温 1290 +宝 1291 +沙 1292 +角 1293 +肚 1294 +疼 1295 +林 1296 +宿 1297 +舍 1298 +阴 1299 +晴 1300 +圆 1301 +缺 1302 +微 1303 +辽 1304 +刚 1305 +牛 1306 +偷 1307 +吓 1308 +跳 1309 +城 1310 +吴 1311 +称 1312 +呼 1313 +爷 1314 +埋 1315 +怨 1316 +缠 1317 +柴 1318 +钱 1319 +极 1320 +先 1321 +辞 1322 +职 1323 +哦 1324 +啦 1325 +售 1326 +保 1327 +黑 1328 +客 1329 +立 1330 +足 1331 +之 1332 +鬼 1333 +留 1334 +辟 1335 +邪 1336 +谱 1337 +减 1338 +肥 1339 +皇 1340 +萨 1341 +舅 1342 +幽 1343 +默 1344 +细 1345 +胞 1346 +溪 1347 +懒 1348 +书 1349 +杨 1350 +慧 1351 +芝 1352 +屁 1353 +画 1354 +晨 1355 +宁 1356 +侦 1357 +探 1358 +柯 1359 +南 1360 +涛 1361 +应 1362 +该 1363 +弑 1364 +神 1365 +魔 1366 +创 1367 +世 1368 +游 1369 +犯 1370 +已 1371 +泽 1372 +村 1373 +变 1374 +奏 1375 +杰 1376 +偶 1377 +命 1378 +乔 1379 +恩 1380 +并 1381 +阅 1382 +读 1383 +左 1384 +般 1385 +低 1386 +调 1387 +阔 1388 +烦 1389 +球 1390 +峡 1391 +界 1392 +霆 1393 +锋 1394 +柏 1395 +需 1396 +艺 1397 +术 1398 +弄 1399 +章 1400 +悲 1401 +咒 1402 +版 1403 +统 1404 +羊 1405 +肉 1406 +斤 1407 +嘿 1408 +郁 1409 +闷 1410 +进 1411 +姨 1412 +庆 1413 +威 1414 +两 1415 +岸 1416 +普 1417 +琪 1418 +玫 1419 +瑰 1420 +香 1421 +碟 1422 +眼 1423 +收 1424 +湖 1425 +禽 1426 +涮 1427 +汪 1428 +盘 1429 +夹 1430 +存 1431 +枕 1432 +指 1433 +针 1434 +仓 1435 +库 1436 +餐 1437 +块 1438 +咪 1439 +毕 1440 +薪 1441 +赛 1442 +纪 1443 +末 1444 +典 1445 +娱 1446 +傻 1447 +嘲 1448 +彪 1449 +升 1450 +润 
1451 +核 1452 +遍 1453 +接 1454 +元 1455 +鼓 1456 +屿 1457 +爹 1458 +胡 1459 +雪 1460 +招 1461 +喂 1462 +齿 1463 +侣 1464 +土 1465 +豆 1466 +铿 1467 +锵 1468 +聪 1469 +但 1470 +飞 1471 +鹤 1472 +壁 1473 +摞 1474 +久 1475 +冬 1476 +骏 1477 +然 1478 +讨 1479 +论 1480 +腾 1481 +易 1482 +筋 1483 +转 1484 +弯 1485 +耳 1486 +齐 1487 +阶 1488 +革 1489 +代 1490 +许 1491 +圣 1492 +诞 1493 +吸 1494 +血 1495 +燕 1496 +松 1497 +鼠 1498 +确 1499 +凤 1500 +凰 1501 +由 1502 +翔 1503 +段 1504 +超 1505 +精 1506 +支 1507 +扶 1508 +室 1509 +包 1510 +菜 1511 +田 1512 +骂 1513 +洋 1514 +舒 1515 +衡 1516 +款 1517 +嘴 1518 +菲 1519 +嘛 1520 +嗨 1521 +鸟 1522 +玉 1523 +数 1524 +贝 1525 +郝 1526 +漫 1527 +诺 1528 +衣 1529 +嫖 1530 +娼 1531 +穿 1532 +骨 1533 +拆 1534 +伤 1535 +获 1536 +奖 1537 +稿 1538 +锅 1539 +购 1540 +治 1541 +痘 1542 +遗 1543 +疤 1544 +痕 1545 +饱 1546 +寻 1547 +瓷 1548 +妹 1549 +徽 1550 +参 1551 +格 1552 +题 1553 +凭 1554 +住 1555 +剖 1556 +腹 1557 +祝 1558 +贺 1559 +加 1560 +跪 1561 +潞 1562 +云 1563 +端 1564 +木 1565 +烁 1566 +朵 1567 +赵 1568 +潘 1569 +津 1570 +滋 1571 +燥 1572 +利 1573 +负 1574 +昆 1575 +因 1576 +森 1577 +及 1578 +病 1579 +固 1580 +市 1581 +烧 1582 +番 1583 +茄 1584 +炒 1585 +而 1586 +凉 1587 +冒 1588 +量 1589 +夸 1590 +尾 1591 +崔 1592 +另 1593 +处 1594 +铺 1595 +沈 1596 +哎 1597 +身 1598 +哟 1599 +习 1600 +虞 1601 +瞧 1602 +烈 1603 +皮 1604 +鞋 1605 +深 1606 +圳 1607 +委 1608 +胖 1609 +猴 1610 +军 1611 +素 1612 +楷 1613 +补 1614 +偿 1615 +屏 1616 +散 1617 +效 1618 +丹 1619 +念 1620 +绝 1621 +艳 1622 +够 1623 +狼 1624 +且 1625 +龄 1626 +乌 1627 +蓉 1628 +厘 1629 +含 1630 +庚 1631 +澈 1632 +犬 1633 +致 1634 +运 1635 +慢 1636 +钻 1637 +李 1638 +轩 1639 +育 1640 +项 1641 +咱 1642 +误 1643 +弟 1644 +依 1645 +尽 1646 +河 1647 +夫 1648 +沁 1649 +始 1650 +芳 1651 +禺 1652 +旧 1653 +坑 1654 +胜 1655 +酱 1656 +漂 1657 +亮 1658 +战 1659 +斗 1660 +严 1661 +娟 1662 +逼 1663 +添 1664 +盈 1665 +萝 1666 +莉 1667 +肌 1668 +唐 1669 +兵 1670 +辆 1671 +双 1672 +佛 1673 +傅 1674 +劲 1675 +直 1676 +测 1677 +苏 1678 +迁 1679 +沭 1680 +祥 1681 +婴 1682 +品 1683 +销 1684 +禹 1685 +哲 1686 +嗯 1687 +趟 1688 +拐 1689 +金 1690 +满 1691 +套 1692 +倒 1693 +千 1694 +迎 1695 +淇 1696 +驾 1697 +拟 1698 +良 1699 +揭 1700 +杯 1701 +淄 1702 +睛 1703 +制 1704 +枪 1705 +抢 1706 +狸 1707 +泥 1708 +造 1709 +哇 1710 +羯 1711 +庙 1712 +逃 1713 +朗 1714 +领 1715 +悟 1716 +湛 1717 +贸 1718 +垃 1719 +圾 1720 +软 1721 +莲 1722 +味 1723 +旺 1724 +旦 1725 +潮 1726 +奶 1727 +央 1728 +惜 1729 +续 1730 +咨 1731 +茅 1732 +父 1733 +母 1734 +笔 1735 +封 1736 +同 1737 +黎 1738 +共 1739 +科 1740 +相 1741 +镇 1742 +贤 1743 +宏 1744 +洲 1745 +瓦 1746 +寨 1747 +受 1748 +梦 1749 +呗 1750 +苍 1751 +丘 1752 +避 1753 +孕 1754 +灰 1755 +尤 1756 +击 1757 +腮 1758 +腺 1759 +兆 1760 +坨 1761 +屎 1762 +忧 1763 +草 1764 +赠 1765 +外 1766 +险 1767 +某 1768 +磁 1769 +贷 1770 +反 1771 +罚 1772 +昌 1773 +饰 1774 +辉 1775 +权 1776 +材 1777 +炯 1778 +签 1779 +追 1780 +求 1781 +催 1782 +痛 1783 +盱 1784 +眙 1785 +围 1786 +淘 1787 +幸 1788 +扫 1789 +旭 1790 +切 1791 +磋 1792 +冠 1793 +征 1794 +郎 1795 +骗 1796 +哄 1797 +释 1798 +蒙 1799 +涯 1800 +碧 1801 +斜 1802 +嫁 1803 +幕 1804 +哼 1805 +钢 1806 +碳 1807 +纤 1808 +维 1809 +决 1810 +妻 1811 +未 1812 +绳 1813 +断 1814 +寺 1815 +妙 1816 +伯 1817 +蹲 1818 +宅 1819 +吵 1820 +娃 1821 +兄 1822 +廷 1823 +夺 1824 +社 1825 +示 1826 +按 1827 +饺 1828 +甜 1829 +蜜 1830 +咖 1831 +啡 1832 +优 1833 +逛 1834 +街 1835 +著 1836 +杂 1837 +址 1838 +荷 1839 +塘 1840 +拜 1841 +币 1842 +迟 1843 +货 1844 +粉 1845 +刮 1846 +破 1847 +射 1848 +狂 1849 +苗 1850 +罗 1851 +设 1852 +困 1853 +湘 1854 +潭 1855 +评 1856 +娘 1857 +涉 1858 +采 1859 +芙 1860 +夜 1861 +捷 1862 +斩 1863 +摩 1864 +托 1865 +泳 1866 +琳 1867 +律 1868 +官 1869 +劫 1870 +蛮 1871 +替 1872 +架 1873 +悬 1874 +浮 1875 +窗 1876 +顶 1877 +敦 1878 +善 1879 +哉 1880 +桂 1881 +勇 1882 +荆 1883 +镜 1884 +监 1885 +怕 1886 +呐 1887 +劳 1888 +莱 1889 +狮 1890 +宽 1891 +袋 1892 +囊 1893 +秀 1894 +卸 
1895 +链 1896 +嵩 1897 +韭 1898 +葆 1899 +额 1900 +翼 1901 +忙 1902 +瓶 1903 +梅 1904 +堰 1905 +粒 1906 +汤 1907 +谋 1908 +樊 1909 +恨 1910 +愿 1911 +锡 1912 +申 1913 +护 1914 +庄 1915 +临 1916 +源 1917 +环 1918 +境 1919 +礼 1920 +恐 1921 +晒 1922 +虫 1923 +划 1924 +鸣 1925 +怖 1926 +伍 1927 +佰 1928 +岁 1929 +组 1930 +响 1931 +类 1932 +韶 1933 +克 1934 +洛 1935 +玲 1936 +裤 1937 +柔 1938 +疆 1939 +篮 1940 +伟 1941 +扔 1942 +掉 1943 +媒 1944 +涨 1945 +透 1946 +纯 1947 +怀 1948 +坊 1949 +麦 1950 +菠 1951 +养 1952 +晕 1953 +群 1954 +展 1955 +厌 1956 +拒 1957 +单 1958 +静 1959 +刷 1960 +插 1961 +肛 1962 +互 1963 +蘑 1964 +菇 1965 +姑 1966 +桐 1967 +辛 1968 +察 1969 +毫 1970 +质 1971 +差 1972 +翰 1973 +爽 1974 +欣 1975 +议 1976 +铜 1977 +籍 1978 +争 1979 +喆 1980 +孔 1981 +堤 1982 +薇 1983 +茵 1984 +席 1985 +琼 1986 +杠 1987 +衔 1988 +概 1989 +往 1990 +邢 1991 +惠 1992 +烫 1993 +绑 1994 +崇 1995 +帝 1996 +据 1997 +貌 1998 +似 1999 +胸 2000 +罩 2001 +构 2002 +尊 2003 +秘 2004 +它 2005 +详 2006 +悠 2007 +闲 2008 +违 2009 +陆 2010 +割 2011 +绩 2012 +企 2013 +绥 2014 +辐 2015 +舌 2016 +寂 2017 +寞 2018 +宇 2019 +携 2020 +拳 2021 +观 2022 +魏 2023 +郭 2024 +磊 2025 +副 2026 +梁 2027 +斌 2028 +须 2029 +僧 2030 +徒 2031 +季 2032 +灯 2033 +梯 2034 +墙 2035 +付 2036 +坦 2037 +殊 2038 +曹 2039 +操 2040 +捡 2041 +赤 2042 +盗 2043 +废 2044 +蒋 2045 +浙 2046 +食 2047 +咯 2048 +童 2049 +坡 2050 +剪 2051 +唯 2052 +疗 2053 +状 2054 +暴 2055 +缓 2056 +誉 2057 +衰 2058 +宋 2059 +娜 2060 +雄 2061 +谛 2062 +糖 2063 +羽 2064 +棋 2065 +滩 2066 +佳 2067 +臭 2068 +帆 2069 +岳 2070 +疲 2071 +惫 2072 +滴 2073 +倾 2074 +盆 2075 +谷 2076 +施 2077 +晶 2078 +赚 2079 +澡 2080 +遇 2081 +鲁 2082 +祭 2083 +灶 2084 +独 2085 +谈 2086 +承 2087 +蜀 2088 +丰 2089 +归 2090 +辜 2091 +扇 2092 +渴 2093 +羡 2094 +慕 2095 +裸 2096 +宗 2097 +纬 2098 +亦 2099 +儒 2100 +霸 2101 +翡 2102 +翠 2103 +芭 2104 +抱 2105 +歉 2106 +邱 2107 +夏 2108 +隆 2109 +灵 2110 +珍 2111 +浩 2112 +乾 2113 +坤 2114 +培 2115 +训 2116 +压 2117 +偏 2118 +骤 2119 +熙 2120 +葬 2121 +姆 2122 +兽 2123 +筑 2124 +丝 2125 +若 2126 +诡 2127 +异 2128 +侯 2129 +摆 2130 +俗 2131 +缚 2132 +束 2133 +愁 2134 +盟 2135 +却 2136 +显 2137 +肤 2138 +茹 2139 +荣 2140 +增 2141 +宫 2142 +局 2143 +适 2144 +楚 2145 +驻 2146 +纽 2147 +秒 2148 +辣 2149 +虾 2150 +甘 2151 +肃 2152 +粕 2153 +喻 2154 +敬 2155 +谨 2156 +慎 2157 +竭 2158 +止 2159 +际 2160 +寓 2161 +勤 2162 +挫 2163 +泣 2164 +奈 2165 +圭 2166 +焰 2167 +猩 2168 +守 2169 +允 2170 +兔 2171 +篇 2172 +敌 2173 +辕 2174 +猫 2175 +柠 2176 +檬 2177 +橘 2178 +卜 2179 +妓 2180 +既 2181 +闯 2182 +胆 2183 +刁 2184 +竟 2185 +竞 2186 +冯 2187 +陇 2188 +赣 2189 +呆 2190 +滞 2191 +停 2192 +邯 2193 +郸 2194 +域 2195 +徕 2196 +患 2197 +甲 2198 +亡 2199 +鼻 2200 +背 2201 +戴 2202 +幼 2203 +伙 2204 +括 2205 +邓 2206 +谐 2207 +担 2208 +浑 2209 +抖 2210 +耍 2211 +综 2212 +失 2213 +蕾 2214 +鸭 2215 +莘 2216 +选 2217 +糸 2218 +桶 2219 +弃 2220 +暗 2221 +卓 2222 +榜 2223 +拼 2224 +壅 2225 +丈 2226 +锈 2227 +恢 2228 +刺 2229 +嘻 2230 +顾 2231 +投 2232 +晓 2233 +巨 2234 +抽 2235 +档 2236 +乳 2237 +迪 2238 +蠢 2239 +裹 2240 +唤 2241 +焦 2242 +择 2243 +俺 2244 +技 2245 +暧 2246 +昧 2247 +怪 2248 +坛 2249 +眉 2250 +嘉 2251 +逸 2252 +课 2253 +栏 2254 +撬 2255 +框 2256 +液 2257 +凝 2258 +暑 2259 +型 2260 +烘 2261 +簿 2262 +扬 2263 +汁 2264 +诸 2265 +迹 2266 +禁 2267 +株 2268 +泸 2269 +屠 2270 +宰 2271 +忽 2272 +炼 2273 +必 2274 +妆 2275 +飘 2276 +鹿 2277 +敲 2278 +拾 2279 +躺 2280 +歇 2281 +狠 2282 +沾 2283 +畅 2284 +镕 2285 +贪 2286 +污 2287 +斧 2288 +巡 2289 +弹 2290 +盐 2291 +枝 2292 +渝 2293 +壑 2294 +郴 2295 +落 2296 +牡 2297 +钛 2298 +剩 2299 +俄 2300 +抵 2301 +押 2302 +郊 2303 +弱 2304 +授 2305 +蟹 2306 +糕 2307 +败 2308 +各 2309 +伴 2310 +享 2311 +居 2312 +障 2313 +棕 2314 +旁 2315 +屌 2316 +绒 2317 +酸 2318 +隔 2319 +瞄 2320 +俩 2321 +柳 2322 +册 2323 +弊 2324 +逮 2325 +绵 2326 +挣 2327 +闵 2328 +勒 2329 +陶 2330 +寒 2331 +吻 2332 +桃 2333 +悍 2334 +绮 2335 +贞 2336 +疾 2337 +诫 2338 +菌 
2339 +则 2340 +谭 2341 +咏 2342 +麟 2343 +棠 2344 +抬 2345 +棺 2346 +均 2347 +纸 2348 +碱 2349 +沧 2350 +董 2351 +挤 2352 +虚 2353 +钠 2354 +胃 2355 +躁 2356 +智 2357 +畔 2358 +墨 2359 +堂 2360 +喔 2361 +宣 2362 +丑 2363 +嚣 2364 +辈 2365 +孤 2366 +鞭 2367 +验 2368 +夕 2369 +印 2370 +欧 2371 +阵 2372 +咋 2373 +驹 2374 +挂 2375 +轿 2376 +拍 2377 +洁 2378 +凑 2379 +蕉 2380 +诱 2381 +惑 2382 +颐 2383 +箭 2384 +樱 2385 +辖 2386 +捕 2387 +炸 2388 +斋 2389 +恒 2390 +沉 2391 +侧 2392 +跌 2393 +暇 2394 +掌 2395 +筷 2396 +彬 2397 +稚 2398 +傲 2399 +腔 2400 +藏 2401 +浦 2402 +瓣 2403 +捆 2404 +卧 2405 +欠 2406 +犀 2407 +甩 2408 +敷 2409 +衍 2410 +谅 2411 +积 2412 +怡 2413 +阁 2414 +趣 2415 +掰 2416 +耽 2417 +蒜 2418 +菱 2419 +葛 2420 +聚 2421 +露 2422 +帐 2423 +紧 2424 +郓 2425 +聘 2426 +桑 2427 +众 2428 +圈 2429 +渡 2430 +鲜 2431 +杜 2432 +甫 2433 +遵 2434 +骚 2435 +吹 2436 +蚊 2437 +塔 2438 +赏 2439 +荒 2440 +欺 2441 +揍 2442 +锁 2443 +恼 2444 +忐 2445 +忑 2446 +输 2447 +描 2448 +触 2449 +糊 2450 +涂 2451 +熊 2452 +妮 2453 +抄 2454 +裙 2455 +塑 2456 +橡 2457 +阜 2458 +獒 2459 +励 2460 +黔 2461 +臣 2462 +憔 2463 +悴 2464 +昂 2465 +党 2466 +沐 2467 +浓 2468 +灾 2469 +捐 2470 +柿 2471 +瑟 2472 +翁 2473 +侨 2474 +督 2475 +振 2476 +鹏 2477 +乒 2478 +乓 2479 +巷 2480 +贡 2481 +祖 2482 +即 2483 +绿 2484 +搭 2485 +配 2486 +骑 2487 +届 2488 +举 2489 +伸 2490 +整 2491 +突 2492 +陌 2493 +糟 2494 +惩 2495 +硕 2496 +茫 2497 +趋 2498 +仁 2499 +钙 2500 +雕 2501 +井 2502 +撒 2503 +岩 2504 +悄 2505 +搁 2506 +浅 2507 +救 2508 +饮 2509 +佩 2510 +赌 2511 +涕 2512 +薯 2513 +令 2514 +泌 2515 +蔬 2516 +批 2517 +攀 2518 +怜 2519 +淮 2520 +寝 2521 +填 2522 +卿 2523 +萱 2524 +寄 2525 +窝 2526 +纳 2527 +洱 2528 +惹 2529 +锦 2530 +浒 2531 +欲 2532 +棉 2533 +箱 2534 +仅 2535 +述 2536 +摸 2537 +纲 2538 +澳 2539 +染 2540 +兼 2541 +岭 2542 +淋 2543 +肿 2544 +旗 2545 +嘞 2546 +乖 2547 +酮 2548 +颠 2549 +覆 2550 +誓 2551 +递 2552 +蛟 2553 +占 2554 +乎 2555 +融 2556 +甸 2557 +幂 2558 +钥 2559 +匙 2560 +酬 2561 +皆 2562 +胎 2563 +腐 2564 +痿 2565 +绣 2566 +枫 2567 +蝴 2568 +蝶 2569 +抛 2570 +撞 2571 +植 2572 +僵 2573 +尸 2574 +巾 2575 +煌 2576 +逊 2577 +引 2578 +兑 2579 +荫 2580 +朔 2581 +丢 2582 +扩 2583 +摄 2584 +龟 2585 +鑫 2586 +谦 2587 +豪 2588 +噬 2589 +眷 2590 +挑 2591 +仲 2592 +穷 2593 +玻 2594 +璃 2595 +岗 2596 +姥 2597 +横 2598 +蚌 2599 +埠 2600 +邀 2601 +蔚 2602 +虹 2603 +降 2604 +疣 2605 +鱿 2606 +喵 2607 +囧 2608 +茶 2609 +猜 2610 +玮 2611 +莎 2612 +冼 2613 +榕 2614 +媛 2615 +瓜 2616 +煮 2617 +耕 2618 +镶 2619 +虽 2620 +驳 2621 +霍 2622 +仗 2623 +窍 2624 +魅 2625 +访 2626 +邻 2627 +抗 2628 +莆 2629 +涵 2630 +筒 2631 +疯 2632 +赖 2633 +豌 2634 +碍 2635 +症 2636 +卤 2637 +翅 2638 +膀 2639 +蓬 2640 +咸 2641 +尚 2642 +瘦 2643 +缸 2644 +爬 2645 +鄂 2646 +塞 2647 +稻 2648 +召 2649 +荡 2650 +桨 2651 +税 2652 +呃 2653 +渠 2654 +骥 2655 +伏 2656 +枥 2657 +邑 2658 +净 2659 +弦 2660 +蔽 2661 +诀 2662 +咳 2663 +嗽 2664 +芯 2665 +储 2666 +缘 2667 +冻 2668 +厨 2669 +鉴 2670 +擦 2671 +棒 2672 +损 2673 +暂 2674 +殖 2675 +焊 2676 +募 2677 +邵 2678 +饶 2679 +梭 2680 +鄙 2681 +骄 2682 +蔡 2683 +辄 2684 +努 2685 +洽 2686 +宙 2687 +鲈 2688 +葫 2689 +芦 2690 +梧 2691 +燎 2692 +缴 2693 +薄 2694 +执 2695 +垫 2696 +靠 2697 +拢 2698 +萧 2699 +醋 2700 +脊 2701 +慰 2702 +攻 2703 +狱 2704 +吝 2705 +啬 2706 +煤 2707 +楞 2708 +脏 2709 +迷 2710 +椒 2711 +侄 2712 +璇 2713 +耐 2714 +庵 2715 +帽 2716 +崎 2717 +峻 2718 +援 2719 +娶 2720 +丫 2721 +犹 2722 +豫 2723 +罪 2724 +恶 2725 +陛 2726 +樟 2727 +截 2728 +巧 2729 +驰 2730 +轨 2731 +继 2732 +葱 2733 +蘸 2734 +汕 2735 +蜘 2736 +蛛 2737 +聋 2738 +俱 2739 +捉 2740 +卢 2741 +骆 2742 +氓 2743 +耶 2744 +仔 2745 +激 2746 +渊 2747 +钾 2748 +暖 2749 +钰 2750 +裁 2751 +判 2752 +略 2753 +墓 2754 +洪 2755 +凌 2756 +符 2757 +壮 2758 +陵 2759 +挥 2760 +夷 2761 +尘 2762 +沪 2763 +榆 2764 +涧 2765 +析 2766 +孝 2767 +弘 2768 +椅 2769 +贴 2770 +蛇 2771 +浣 2772 +镯 2773 +枣 2774 +佐 2775 +柑 2776 +谓 2777 +洞 2778 +漳 2779 +撕 2780 +叉 2781 +诛 2782 +糯 
2783 +粽 2784 +碎 2785 +幅 2786 +赘 2787 +浆 2788 +循 2789 +偕 2790 +诙 2791 +阚 2792 +摘 2793 +串 2794 +悉 2795 +蜕 2796 +残 2797 +诅 2798 +祁 2799 +仪 2800 +璐 2801 +瑶 2802 +楠 2803 +崂 2804 +供 2805 +掖 2806 +椎 2807 +铆 2808 +钉 2809 +铐 2810 +镣 2811 +栋 2812 +潇 2813 +抓 2814 +屋 2815 +鸦 2816 +玄 2817 +芜 2818 +钨 2819 +毯 2820 +矿 2821 +缩 2822 +酶 2823 +焕 2824 +埃 2825 +霞 2826 +噢 2827 +韵 2828 +艾 2829 +虐 2830 +俘 2831 +颗 2832 +巩 2833 +牵 2834 +汝 2835 +搅 2836 +廉 2837 +啤 2838 +苑 2839 +辍 2840 +缝 2841 +纫 2842 +膜 2843 +娄 2844 +倩 2845 +魂 2846 +姜 2847 +彻 2848 +扉 2849 +镁 2850 +氢 2851 +铝 2852 +淀 2853 +雹 2854 +妍 2855 +鼎 2856 +碗 2857 +亭 2858 +闽 2859 +献 2860 +耻 2861 +畜 2862 +蚯 2863 +蚓 2864 +杆 2865 +靓 2866 +颖 2867 +瘾 2868 +腿 2869 +咧 2870 +嗦 2871 +忆 2872 +卑 2873 +鸽 2874 +藤 2875 +滑 2876 +蝇 2877 +蚂 2878 +蚁 2879 +迢 2880 +蝎 2881 +斑 2882 +赞 2883 +氧 2884 +姝 2885 +擎 2886 +憋 2887 +屈 2888 +讶 2889 +袜 2890 +吐 2891 +秽 2892 +哑 2893 +蓦 2894 +阑 2895 +珊 2896 +披 2897 +巫 2898 +妖 2899 +坪 2900 +疏 2901 +抒 2902 +炉 2903 +舰 2904 +贱 2905 +搬 2906 +遥 2907 +燃 2908 +咽 2909 +喉 2910 +熔 2911 +婵 2912 +奔 2913 +汗 2914 +蓄 2915 +辰 2916 +肩 2917 +洮 2918 +琅 2919 +径 2920 +廊 2921 +姬 2922 +衬 2923 +雯 2924 +滁 2925 +泗 2926 +筹 2927 +诵 2928 +奴 2929 +跨 2930 +娴 2931 +绯 2932 +惯 2933 +谎 2934 +蹈 2935 +潜 2936 +搂 2937 +逆 2938 +钞 2939 +辅 2940 +凶 2941 +橱 2942 +柜 2943 +婕 2944 +矮 2945 +邹 2946 +嫂 2947 +饼 2948 +撸 2949 +壶 2950 +握 2951 +鸳 2952 +鸯 2953 +寸 2954 +堕 2955 +哀 2956 +械 2957 +蜈 2958 +蚣 2959 +袁 2960 +鸿 2961 +穆 2962 +泊 2963 +衮 2964 +弗 2965 +雷 2966 +谜 2967 +俞 2968 +灏 2969 +毅 2970 +迈 2971 +蜂 2972 +辨 2973 +沂 2974 +灭 2975 +腊 2976 +脍 2977 +炙 2978 +卦 2979 +霄 2980 +扯 2981 +泾 2982 +脂 2983 +肪 2984 +淹 2985 +灌 2986 +辱 2987 +丸 2988 +账 2989 +秤 2990 +褐 2991 +芬 2992 +窖 2993 +慈 2994 +益 2995 +亿 2996 +颈 2997 +糜 2998 +隋 2999 +霉 3000 +署 3001 +狄 3002 +酪 3003 +旋 3004 +蔷 3005 +皱 3006 +纹 3007 +枯 3008 +粤 3009 +拔 3010 +菩 3011 +驱 3012 +咦 3013 +掀 3014 +菊 3015 +涩 3016 +耀 3017 +娥 3018 +奘 3019 +眯 3020 +芊 3021 +绪 3022 +沛 3023 +锐 3024 +姓 3025 +氏 3026 +垂 3027 +迫 3028 +絮 3029 +藕 3030 +捎 3031 +蓓 3032 +沫 3033 +奎 3034 +贩 3035 +泵 3036 +疑 3037 +岐 3038 +拓 3039 +詹 3040 +韦 3041 +粥 3042 +瞎 3043 +层 3044 +寿 3045 +淑 3046 +琦 3047 +履 3048 +痣 3049 +蔓 3050 +延 3051 +措 3052 +氰 3053 +胺 3054 +炭 3055 +鸥 3056 +谊 3057 +宛 3058 +悦 3059 +谣 3060 +茎 3061 +堆 3062 +鲤 3063 +坚 3064 +澎 3065 +溜 3066 +贫 3067 +擅 3068 +锌 3069 +竹 3070 +苟 3071 +磷 3072 +庐 3073 +嵌 3074 +潍 3075 +悚 3076 +岑 3077 +稀 3078 +奋 3079 +呦 3080 +梳 3081 +伐 3082 +芒 3083 +吶 3084 +凡 3085 +臂 3086 +驴 3087 +殿 3088 +雁 3089 +粹 3090 +凋 3091 +葵 3092 +烛 3093 +肾 3094 +尝 3095 +磨 3096 +晏 3097 +甄 3098 +嬛 3099 +盼 3100 +肇 3101 +咬 3102 +洼 3103 +匪 3104 +啰 3105 +硅 3106 +铅 3107 +矛 3108 +盾 3109 +贼 3110 +霜 3111 +螺 3112 +漏 3113 +帕 3114 +杉 3115 +矶 3116 +耗 3117 +责 3118 +靖 3119 +呸 3120 +驶 3121 +吞 3122 +睁 3123 +笼 3124 +茉 3125 +赈 3126 +纱 3127 +艘 3128 +炖 3129 +仿 3130 +瞬 3131 +嵊 3132 +澄 3133 +丞 3134 +摔 3135 +宠 3136 +爪 3137 +笋 3138 +庭 3139 +蜡 3140 +戈 3141 +锻 3142 +粗 3143 +糙 3144 +混 3145 +荚 3146 +曙 3147 +凄 3148 +抚 3149 +瀑 3150 +挖 3151 +掘 3152 +垮 3153 +奢 3154 +侈 3155 +揉 3156 +穹 3157 +钦 3158 +蛙 3159 +荧 3160 +悔 3161 +彦 3162 +忏 3163 +祸 3164 +攒 3165 +慌 3166 +簸 3167 +箕 3168 +繁 3169 +尖 3170 +芋 3171 +铠 3172 +沿 3173 +扮 3174 +隐 3175 +促 3176 +庾 3177 +葡 3178 +萄 3179 +硝 3180 +溶 3181 +淡 3182 +炅 3183 +昕 3184 +尧 3185 +妊 3186 +娠 3187 +曰 3188 +祛 3189 +枚 3190 +卒 3191 +陕 3192 +昭 3193 +龚 3194 +债 3195 +嗓 3196 +陷 3197 +阱 3198 +庞 3199 +盲 3200 +侵 3201 +匣 3202 +愤 3203 +怒 3204 +瞅 3205 +遭 3206 +脉 3207 +馒 3208 +愉 3209 +栗 3210 +鲍 3211 +挎 3212 +匆 3213 +缕 3214 +昵 3215 +鳄 3216 +阙 3217 +坟 3218 +捏 3219 +喽 3220 +雀 3221 +贯 3222 +苇 3223 +鹄 3224 +愈 3225 +裂 3226 +伪 
3227 +劣 3228 +歹 3229 +溅 3230 +雌 3231 +猛 3232 +逞 3233 +饥 3234 +愚 3235 +牧 3236 +碰 3237 +帜 3238 +佝 3239 +偻 3240 +讪 3241 +馍 3242 +役 3243 +栈 3244 +唾 3245 +缆 3246 +袄 3247 +闸 3248 +织 3249 +筐 3250 +婉 3251 +昏 3252 +拖 3253 +毙 3254 +咙 3255 +褪 3256 +驼 3257 +壳 3258 +孽 3259 +审 3260 +脖 3261 +恳 3262 +孢 3263 +矫 3264 +臻 3265 +兖 3266 +俏 3267 +棍 3268 +唻 3269 +肘 3270 +俭 3271 +冕 3272 +葩 3273 +佑 3274 +鬓 3275 +柚 3276 +赴 3277 +崖 3278 +塌 3279 +厢 3280 +窈 3281 +窕 3282 +逑 3283 +卷 3284 +拂 3285 +蟑 3286 +螂 3287 +契 3288 +羞 3289 +函 3290 +逐 3291 +拌 3292 +肺 3293 +阻 3294 +纵 3295 +痰 3296 +狙 3297 +惋 3298 +枰 3299 +崽 3300 +胚 3301 +骡 3302 +萎 3303 +泄 3304 +呜 3305 +籁 3306 +濮 3307 +阆 3308 +琵 3309 +琶 3310 +跃 3311 +筝 3312 +勃 3313 +楂 3314 +奉 3315 +础 3316 +吆 3317 +壹 3318 +飙 3319 +虑 3320 +脆 3321 +黛 3322 +栓 3323 +逻 3324 +螃 3325 +轰 3326 +仑 3327 +券 3328 +逢 3329 +疮 3330 +私 3331 +窃 3332 +儋 3333 +泼 3334 +熬 3335 +焚 3336 +梨 3337 +吟 3338 +棱 3339 +稳 3340 +翘 3341 +祠 3342 +遮 3343 +瘤 3344 +稣 3345 +唇 3346 +阖 3347 +堡 3348 +禾 3349 +钗 3350 +爵 3351 +赐 3352 +绕 3353 +粘 3354 +癌 3355 +矜 3356 +虱 3357 +婧 3358 +坝 3359 +菏 3360 +隶 3361 +尺 3362 +滕 3363 +竿 3364 +恰 3365 +喱 3366 +冤 3367 +枉 3368 +叠 3369 +穴 3370 +搏 3371 +窦 3372 +栀 3373 +踪 3374 +昼 3375 +氯 3376 +陋 3377 +铭 3378 +禅 3379 +屑 3380 +巢 3381 +咻 3382 +喇 3383 +叭 3384 +棵 3385 +吊 3386 +诈 3387 +娇 3388 +绘 3389 +圩 3390 +仰 3391 +疙 3392 +瘩 3393 +桦 3394 +妾 3395 +丧 3396 +昊 3397 +湄 3398 +靴 3399 +迭 3400 +劝 3401 +溧 3402 +靡 3403 +梗 3404 +倪 3405 +刍 3406 +芽 3407 +篱 3408 +笆 3409 +漯 3410 +镖 3411 +协 3412 +叙 3413 +汾 3414 +豚 3415 +锷 3416 +瑙 3417 +瑜 3418 +伽 3419 +彰 3420 +扒 3421 +麝 3422 +赔 3423 +焉 3424 +亏 3425 +煅 3426 +翱 3427 +哽 3428 +煦 3429 +喷 3430 +舱 3431 +惨 3432 +哗 3433 +躲 3434 +佘 3435 +憾 3436 +旷 3437 +芹 3438 +簧 3439 +疹 3440 +簇 3441 +羹 3442 +刊 3443 +鹦 3444 +鹉 3445 +狡 3446 +猾 3447 +锯 3448 +呛 3449 +泛 3450 +汶 3451 +毗 3452 +衫 3453 +猕 3454 +祺 3455 +悸 3456 +昙 3457 +莽 3458 +杏 3459 +钮 3460 +叛 3461 +锄 3462 +砸 3463 +囤 3464 +犁 3465 +溃 3466 +疡 3467 +迦 3468 +轲 3469 +噜 3470 +犒 3471 +薰 3472 +薛 3473 +哺 3474 +竖 3475 +氟 3476 +渐 3477 +柒 3478 +贰 3479 +捌 3480 +傍 3481 +抹 3482 +褶 3483 +仇 3484 +偎 3485 +馅 3486 +旱 3487 +渭 3488 +昔 3489 +癣 3490 +挚 3491 +姻 3492 +炽 3493 +嘎 3494 +矢 3495 +汀 3496 +馋 3497 +淌 3498 +莓 3499 +貂 3500 +啧 3501 +茱 3502 +萸 3503 +涌 3504 +臀 3505 +恃 3506 +蒲 3507 +朴 3508 +嘟 3509 +扁 3510 +剂 3511 +歧 3512 +啪 3513 +啵 3514 +匠 3515 +帖 3516 +痒 3517 +睿 3518 +踢 3519 +衿 3520 +叽 3521 +崩 3522 +顽 3523 +嫌 3524 +扛 3525 +浔 3526 +拯 3527 +戊 3528 +戚 3529 +蛔 3530 +醇 3531 +笛 3532 +氛 3533 +沦 3534 +婊 3535 +仍 3536 +镍 3537 +渤 3538 +舶 3539 +哆 3540 +睹 3541 +萤 3542 +弧 3543 +辙 3544 +旯 3545 +纷 3546 +熄 3547 +挽 3548 +帘 3549 +蒸 3550 +橄 3551 +榄 3552 +滥 3553 +掩 3554 +兮 3555 +庸 3556 +玟 3557 +垦 3558 +惟 3559 +朕 3560 +脯 3561 +歪 3562 +吾 3563 +碘 3564 +锰 3565 +矾 3566 +拙 3567 +践 3568 +纠 3569 +赡 3570 +暨 3571 +凳 3572 +雾 3573 +缔 3574 +啫 3575 +毁 3576 +宥 3577 +邛 3578 +崃 3579 +禧 3580 +醛 3581 +滤 3582 +嘀 3583 +缪 3584 +萌 3585 +芥 3586 +胀 3587 +鲨 3588 +腩 3589 +勾 3590 +裳 3591 +雍 3592 +蹭 3593 +匹 3594 +髓 3595 +砍 3596 +孰 3597 +辩 3598 +唰 3599 +慷 3600 +慨 3601 +畏 3602 +坠 3603 +钝 3604 +箫 3605 +愧 3606 +劈 3607 +嘶 3608 +粮 3609 +轼 3610 +蟒 3611 +翊 3612 +澧 3613 +揽 3614 +烹 3615 +饪 3616 +踏 3617 +弛 3618 +婢 3619 +奸 3620 +掏 3621 +泓 3622 +袖 3623 +笈 3624 +刑 3625 +俑 3626 +浇 3627 +骊 3628 +蛀 3629 +蚤 3630 +杵 3631 +兹 3632 +晰 3633 +癫 3634 +痫 3635 +逝 3636 +炬 3637 +讼 3638 +陂 3639 +蚕 3640 +绸 3641 +槽 3642 +纨 3643 +牢 3644 +晃 3645 +窄 3646 +蒂 3647 +湃 3648 +硫 3649 +眨 3650 +耸 3651 +浠 3652 +梵 3653 +纺 3654 +贾 3655 +膨 3656 +阀 3657 +堀 3658 +扭 3659 +捂 3660 +扑 3661 +椭 3662 +鳟 3663 +丙 3664 +烯 3665 +冈 3666 +衷 3667 +牟 3668 +郫 3669 +畴 3670 +腥 
3671 +亩 3672 +淤 3673 +禄 3674 +倘 3675 +烷 3676 +仆 3677 +刨 3678 +炜 3679 +挨 3680 +鳅 3681 +奚 3682 +峪 3683 +呻 3684 +佣 3685 +渔 3686 +肢 3687 +霏 3688 +旨 3689 +爰 3690 +吨 3691 +珑 3692 +隽 3693 +橙 3694 +箍 3695 +岚 3696 +啸 3697 +倌 3698 +剃 3699 +御 3700 +沸 3701 +棘 3702 +瘫 3703 +痪 3704 +仕 3705 +闺 3706 +炳 3707 +乏 3708 +拱 3709 +墅 3710 +铢 3711 +痤 3712 +琥 3713 +珈 3714 +荟 3715 +翩 3716 +搓 3717 +阮 3718 +芸 3719 +抠 3720 +弓 3721 +锣 3722 +赫 3723 +挡 3724 +侃 3725 +诶 3726 +沽 3727 +绫 3728 +濑 3729 +龈 3730 +乞 3731 +丐 3732 +宴 3733 +馁 3734 +牲 3735 +闰 3736 +亢 3737 +辫 3738 +铲 3739 +嫦 3740 +卵 3741 +佚 3742 +谬 3743 +倡 3744 +抑 3745 +赋 3746 +跆 3747 +削 3748 +氮 3749 +嫩 3750 +噻 3751 +蜗 3752 +鹂 3753 +靶 3754 +妥 3755 +衢 3756 +腻 3757 +砖 3758 +翎 3759 +拈 3760 +卉 3761 +皂 3762 +曦 3763 +荔 3764 +晤 3765 +曜 3766 +趵 3767 +纣 3768 +捞 3769 +蕲 3770 +猿 3771 +榈 3772 +憎 3773 +媚 3774 +绞 3775 +峙 3776 +饲 3777 +瑾 3778 +寡 3779 +釜 3780 +凸 3781 +凹 3782 +嫉 3783 +妒 3784 +婪 3785 +驸 3786 +荤 3787 +弥 3788 +蹦 3789 +驮 3790 +汞 3791 +唠 3792 +叨 3793 +袈 3794 +裟 3795 +毽 3796 +蔗 3797 +蹄 3798 +犍 3799 +珞 3800 +谚 3801 +煎 3802 +腋 3803 +瞳 3804 +丛 3805 +挪 3806 +榴 3807 +钩 3808 +梓 3809 +骁 3810 +烙 3811 +舜 3812 +暮 3813 +擀 3814 +兜 3815 +癜 3816 +姗 3817 +藜 3818 +擒 3819 +歼 3820 +冉 3821 +倚 3822 +漱 3823 +嫣 3824 +椰 3825 +隘 3826 +掐 3827 +栾 3828 +巍 3829 +咔 3830 +稽 3831 +惆 3832 +怅 3833 +镑 3834 +娲 3835 +芷 3836 +藻 3837 +伺 3838 +忌 3839 +桔 3840 +绅 3841 +坂 3842 +澜 3843 +嚓 3844 +苔 3845 +诣 3846 +倔 3847 +酿 3848 +槟 3849 +榔 3850 +粪 3851 +渺 3852 +馗 3853 +峨 3854 +碚 3855 +阎 3856 +巅 3857 +颊 3858 +戬 3859 +吒 3860 +鸵 3861 +岂 3862 +廖 3863 +娅 3864 +旬 3865 +猥 3866 +琐 3867 +扈 3868 +滔 3869 +枷 3870 +崴 3871 +捣 3872 +泻 3873 +甙 3874 +俯 3875 +撑 3876 +芮 3877 +舆 3878 +邂 3879 +逅 3880 +宪 3881 +晖 3882 +岔 3883 +哒 3884 +酵 3885 +痔 3886 +苓 3887 +捶 3888 +睫 3889 +裕 3890 +彤 3891 +潢 3892 +酉 3893 +聂 3894 +氨 3895 +嗷 3896 +皎 3897 +焖 3898 +袭 3899 +惦 3900 +惘 3901 +隙 3902 +彝 3903 +鞘 3904 +厄 3905 +殷 3906 +罕 3907 +嚏 3908 +拇 3909 +尹 3910 +蔻 3911 +颂 3912 +皖 3913 +霖 3914 +屉 3915 +崛 3916 +砣 3917 +穗 3918 +枸 3919 +杞 3920 +竣 3921 +勋 3922 +坍 3923 +溢 3924 +廓 3925 +煽 3926 +囚 3927 +涪 3928 +墩 3929 +琢 3930 +胳 3931 +膊 3932 +彼 3933 +陀 3934 +汹 3935 +柱 3936 +颁 3937 +闫 3938 +熠 3939 +叹 3940 +婿 3941 +娩 3942 +藓 3943 +岷 3944 +婺 3945 +桓 3946 +赁 3947 +罢 3948 +姊 3949 +瓢 3950 +桩 3951 +淫 3952 +堪 3953 +艰 3954 +枢 3955 +枞 3956 +晗 3957 +泷 3958 +逍 3959 +筱 3960 +烽 3961 +渍 3962 +蒿 3963 +殴 3964 +玖 3965 +罐 3966 +剿 3967 +喀 3968 +磕 3969 +铵 3970 +蕊 3971 +篓 3972 +痞 3973 +磅 3974 +礴 3975 +磐 3976 +拘 3977 +瘙 3978 +惕 3979 +孜 3980 +杖 3981 +撇 3982 +敖 3983 +踩 3984 +刹 3985 +蹿 3986 +坎 3987 +氦 3988 +汨 3989 +垣 3990 +垢 3991 +胁 3992 +趴 3993 +苷 3994 +镒 3995 +幢 3996 +鞠 3997 +逾 3998 +鬃 3999 +尉 4000 +韧 4001 +锤 4002 +嘘 4003 +呷 4004 +噎 4005 +煲 4006 +恍 4007 +粱 4008 +亳 4009 +鳞 4010 +懦 4011 +酚 4012 +酞 4013 +哨 4014 +祀 4015 +刃 4016 +蕴 4017 +晟 4018 +菀 4019 +甬 4020 +鼾 4021 +鳖 4022 +螳 4023 +稼 4024 +栽 4025 +蝗 4026 +颌 4027 +咀 4028 +掠 4029 +嘱 4030 +甚 4031 +菅 4032 +奂 4033 +讽 4034 +秸 4035 +釉 4036 +坞 4037 +雇 4038 +绢 4039 +捧 4040 +狈 4041 +桀 4042 +骜 4043 +摊 4044 +臆 4045 +竺 4046 +栅 4047 +贬 4048 +飒 4049 +浸 4050 +噩 4051 +晾 4052 +绐 4053 +殡 4054 +挠 4055 +於 4056 +茁 4057 +瞪 4058 +窠 4059 +汰 4060 +魁 4061 +忒 4062 +璋 4063 +怠 4064 +莺 4065 +冶 4066 +绰 4067 +邈 4068 +圻 4069 +湮 4070 +亨 4071 +躬 4072 +砂 4073 +鹭 4074 +浊 4075 +楹 4076 +珉 4077 +撵 4078 +筏 4079 +荨 4080 +鳝 4081 +沥 4082 +邳 4083 +殉 4084 +憨 4085 +啼 4086 +熏 4087 +蜃 4088 +毋 4089 +彗 4090 +噪 4091 +绛 4092 +祟 4093 +蝙 4094 +蝠 4095 +漆 4096 +酰 4097 +锑 4098 +栖 4099 +肆 4100 +邕 4101 +弋 4102 +绽 4103 +嚼 4104 +霹 4105 +雳 4106 +谍 4107 +恹 4108 +怏 4109 +倦 4110 +轶 4111 +曛 4112 +疚 4113 +棚 4114 +漕 
4115 +浃 4116 +勘 4117 +暄 4118 +趁 4119 +斥 4120 +苞 4121 +膳 4122 +赎 4123 +崭 4124 +笙 4125 +摁 4126 +嗅 4127 +瞒 4128 +舵 4129 +铸 4130 +咫 4131 +涅 4132 +瘪 4133 +潼 4134 +粑 4135 +漾 4136 +噶 4137 +鸠 4138 +铉 4139 +豹 4140 +遛 4141 +襟 4142 +壤 4143 +甭 4144 +吮 4145 +耒 4146 +钊 4147 +泞 4148 +拦 4149 +昱 4150 +腑 4151 +惧 4152 +韬 4153 +焗 4154 +窘 4155 +喳 4156 +溏 4157 +鲛 4158 +慵 4159 +菁 4160 +攥 4161 +埔 4162 +呕 4163 +蓑 4164 +笠 4165 +孑 4166 +咕 4167 +觐 4168 +漓 4169 +碾 4170 +浜 4171 +嬉 4172 +迂 4173 +笃 4174 +勉 4175 +锥 4176 +篷 4177 +亥 4178 +龌 4179 +龊 4180 +煞 4181 +蓟 4182 +皓 4183 +惰 4184 +勺 4185 +缨 4186 +峥 4187 +苯 4188 +豁 4189 +颓 4190 +拽 4191 +啄 4192 +麒 4193 +雎 4194 +鲢 4195 +睬 4196 +渣 4197 +唔 4198 +桧 4199 +癞 4200 +蛤 4201 +蟆 4202 +撩 4203 +酯 4204 +戳 4205 +舔 4206 +孺 4207 +怂 4208 +恿 4209 +臃 4210 +戟 4211 +惭 4212 +耿 4213 +徵 4214 +柬 4215 +朽 4216 +磺 4217 +媲 4218 +懿 4219 +悼 4220 +绎 4221 +缅 4222 +茜 4223 +瞻 4224 +炀 4225 +脓 4226 +罄 4227 +秃 4228 +拎 4229 +譬 4230 +榉 4231 +拭 4232 +玥 4233 +崆 4234 +峒 4235 +胛 4236 +糗 4237 +佗 4238 +佬 4239 +袍 4240 +炊 4241 +仞 4242 +霎 4243 +掺 4244 +匀 4245 +姹 4246 +妯 4247 +娌 4248 +帷 4249 +岢 4250 +柄 4251 +阪 4252 +玺 4253 +窑 4254 +肽 4255 +涡 4256 +窟 4257 +阉 4258 +硼 4259 +蛳 4260 +呤 4261 +砚 4262 +偌 4263 +贿 4264 +芗 4265 +蹊 4266 +跷 4267 +雏 4268 +膝 4269 +嗜 4270 +扦 4271 +涟 4272 +殆 4273 +郡 4274 +洵 4275 +酋 4276 +匡 4277 +胤 4278 +撤 4279 +辗 4280 +冀 4281 +捺 4282 +吏 4283 +衩 4284 +腕 4285 +灸 4286 +绔 4287 +瓯 4288 +蜻 4289 +蜓 4290 +窜 4291 +躯 4292 +髦 4293 +诏 4294 +缄 4295 +筠 4296 +沌 4297 +酐 4298 +皋 4299 +隧 4300 +鹊 4301 +傀 4302 +儡 4303 +诲 4304 +嘏 4305 +寅 4306 +骇 4307 +喧 4308 +癀 4309 +瑚 4310 +碉 4311 +羔 4312 +掂 4313 +痹 4314 +孚 4315 +绡 4316 +馊 4317 +虏 4318 +悖 4319 +漪 4320 +琉 4321 +缉 4322 +冥 4323 +饯 4324 +蔺 4325 +瘆 4326 +榨 4327 +盯 4328 +鄞 4329 +妨 4330 +哐 4331 +寇 4332 +鹃 4333 +卞 4334 +喘 4335 +藩 4336 +踹 4337 +粟 4338 +陨 4339 +遣 4340 +鳌 4341 +烨 4342 +抉 4343 +臧 4344 +墉 4345 +疽 4346 +拷 4347 +赃 4348 +哮 4349 +馥 4350 +砰 4351 +拗 4352 +汐 4353 +矣 4354 +沅 4355 +裴 4356 +阐 4357 +蟋 4358 +蟀 4359 +蚀 4360 +恁 4361 +恙 4362 +蝉 4363 +荀 4364 +彧 4365 +銮 4366 +侮 4367 +驿 4368 +婶 4369 +檀 4370 +哩 4371 +镐 4372 +轴 4373 +扳 4374 +飓 4375 +麓 4376 +牺 4377 +垛 4378 +稞 4379 +桴 4380 +痧 4381 +揣 4382 +殇 4383 +邬 4384 +撅 4385 +邸 4386 +鼬 4387 +剥 4388 +胥 4389 +撼 4390 +溟 4391 +鄱 4392 +鲫 4393 +觅 4394 +犊 4395 +恕 4396 +铂 4397 +褔 4398 +淼 4399 +骝 4400 +藉 4401 +裔 4402 +痨 4403 +颤 4404 +尴 4405 +尬 4406 +癖 4407 +拄 4408 +蠕 4409 +虻 4410 +迄 4411 +攸 4412 +浚 4413 +盔 4414 +肮 4415 +侬 4416 +锏 4417 +憧 4418 +憬 4419 +镰 4420 +懈 4421 +挟 4422 +缤 4423 +涎 4424 +睾 4425 +惶 4426 +褚 4427 +藐 4428 +眺 4429 +艇 4430 +昀 4431 +妄 4432 +祗 4433 +壬 4434 +浯 4435 +衲 4436 +來 4437 +黯 4438 +芩 4439 +敞 4440 +绊 4441 +娣 4442 +掷 4443 +茯 4444 +琍 4445 +蛹 4446 +钧 4447 +瘘 4448 +蜥 4449 +蜴 4450 +唬 4451 +驭 4452 +阂 4453 +诃 4454 +疟 4455 +潦 4456 +谀 4457 +肱 4458 +黏 4459 +甥 4460 +眶 4461 +秩 4462 +庇 4463 +钏 4464 +咝 4465 +肴 4466 +宸 4467 +湟 4468 +沣 4469 +煊 4470 +盂 4471 +弈 4472 +瞩 4473 +聆 4474 +疥 4475 +腼 4476 +腆 4477 +胭 4478 +匕 4479 +讳 4480 +戮 4481 +茧 4482 +趾 4483 +亵 4484 +吖 4485 +漩 4486 +逵 4487 +寰 4488 +滇 4489 +渎 4490 +寮 4491 +嘁 4492 +珂 4493 +珀 4494 +稠 4495 +羌 4496 +徘 4497 +徊 4498 +苛 4499 +蕨 4500 +薏 4501 +苡 4502 +戌 4503 +卯 4504 +馈 4505 +溥 4506 +熹 4507 +屡 4508 +巳 4509 +璜 4510 +铮 4511 +踊 4512 +锚 4513 +濠 4514 +噫 4515 +怦 4516 +蓥 4517 +碌 4518 +霓 4519 +牦 4520 +妤 4521 +屹 4522 +缈 4523 +蹉 4524 +驷 4525 +菡 4526 +谔 4527 +琛 4528 +吡 4529 +喹 4530 +呲 4531 +溺 4532 +鳗 4533 +慑 4534 +秆 4535 +骋 4536 +脐 4537 +涤 4538 +荞 4539 +淅 4540 +罘 4541 +焱 4542 +孵 4543 +斟 4544 +酌 4545 +痊 4546 +秉 4547 +砌 4548 +瘁 4549 +胱 4550 +笫 4551 +燮 4552 +衅 4553 +腱 4554 +垒 4555 +锟 4556 +缀 4557 +疵 4558 +墟 
4559 +盏 4560 +舂 4561 +侗 4562 +琨 4563 +唧 4564 +怆 4565 +沮 4566 +敛 4567 +瑕 4568 +奠 4569 +汴 4570 +衙 4571 +歆 4572 +嘹 4573 +饽 4574 +拧 4575 +濒 4576 +锭 4577 +嬴 4578 +吱 4579 +靳 4580 +眸 4581 +渲 4582 +睦 4583 +蝼 4584 +瞿 4585 +剁 4586 +紊 4587 +翟 4588 +攘 4589 +蹂 4590 +躏 4591 +淞 4592 +跎 4593 +侍 4594 +铛 4595 +绷 4596 +仟 4597 +瀚 4598 +赉 4599 +俪 4600 +魄 4601 +吼 4602 +酗 4603 +嚒 4604 +彷 4605 +徨 4606 +煜 4607 +曝 4608 +嗑 4609 +俅 4610 +嵘 4611 +隍 4612 +唆 4613 +郜 4614 +栩 4615 +尻 4616 +咗 4617 +茗 4618 +疱 4619 +斐 4620 +菘 4621 +芎 4622 +帼 4623 +枭 4624 +矩 4625 +仨 4626 +幄 4627 +鲸 4628 +猬 4629 +梢 4630 +槐 4631 +璧 4632 +坷 4633 +逯 4634 +踝 4635 +濡 4636 +樵 4637 +肓 4638 +劵 4639 +羚 4640 +髫 4641 +笄 4642 +俾 4643 +匿 4644 +帛 4645 +孀 4646 +焙 4647 +瘟 4648 +籽 4649 +萦 4650 +灼 4651 +箴 4652 +筵 4653 +窒 4654 +裆 4655 +旎 4656 +砝 4657 +妲 4658 +恺 4659 +覃 4660 +寐 4661 +酝 4662 +啃 4663 +塬 4664 +醴 4665 +蜿 4666 +蜒 4667 +愣 4668 +恤 4669 +撂 4670 +瘸 4671 +檐 4672 +琰 4673 +狒 4674 +摧 4675 +诠 4676 +孪 4677 +嘚 4678 +鼹 4679 +囡 4680 +茴 4681 +噤 4682 +僻 4683 +钕 4684 +锴 4685 +渗 4686 +嗫 4687 +撮 4688 +缭 4689 +粼 4690 +咄 4691 +挝 4692 +蛾 4693 +恪 4694 +皙 4695 +莒 4696 +叼 4697 +诽 4698 +妩 4699 +叱 4700 +咤 4701 +挞 4702 +萼 4703 +饵 4704 +澹 4705 +惺 4706 +呶 4707 +铤 4708 +佟 4709 +丕 4710 +靛 4711 +伶 4712 +涣 4713 +桢 4714 +狭 4715 +卅 4716 +蟠 4717 +蟾 4718 +朦 4719 +胧 4720 +咆 4721 +滦 4722 +岖 4723 +篙 4724 +痍 4725 +胰 4726 +谏 4727 +坳 4728 +樯 4729 +橹 4730 +孬 4731 +潴 4732 +厥 4733 +椐 4734 +谩 4735 +恬 4736 +琬 4737 +遁 4738 +褥 4739 +咎 4740 +羁 4741 +苣 4742 +殁 4743 +懵 4744 +褒 4745 +蜚 4746 +蛊 4747 +筛 4748 +耙 4749 +耨 4750 +嬷 4751 +驯 4752 +赅 4753 +畲 4754 +滢 4755 +伎 4756 +庹 4757 +踉 4758 +戎 4759 +膛 4760 +嗡 4761 +吔 4762 +唏 4763 +喏 4764 +哧 4765 +缇 4766 +蚝 4767 +璀 4768 +璨 4769 +捅 4770 +妁 4771 +曳 4772 +吩 4773 +咐 4774 +罂 4775 +垌 4776 +揪 4777 +壕 4778 +跺 4779 +辘 4780 +轳 4781 +噔 4782 +斓 4783 +厮 4784 +叁 4785 +仄 4786 +沼 4787 +鸢 4788 +醪 4789 +郢 4790 +圃 4791 +碜 4792 +鲅 4793 +嚯 4794 +淳 4795 +迩 4796 +诋 4797 +鬟 4798 +汲 4799 +艮 4800 +跤 4801 +麋 4802 +橇 4803 +悱 4804 +恻 4805 +啷 4806 +惮 4807 +樨 4808 +毓 4809 +裱 4810 +堇 4811 +埸 4812 +叵 4813 +腚 4814 +畀 4815 +钼 4816 +赦 4817 +悯 4818 +谴 4819 +稷 4820 +嘢 4821 +盎 4822 +跶 4823 +窥 4824 +瑄 4825 +谤 4826 +柘 4827 +垄 4828 +蠡 4829 +邝 4830 +娆 4831 +俐 4832 +铷 4833 +肋 4834 +涿 4835 +俎 4836 +捜 4837 +罡 4838 +嗝 4839 +唛 4840 +酣 4841 +鹬 4842 +瑀 4843 +帚 4844 +镭 4845 +搽 4846 +钣 4847 +蜇 4848 +嗞 4849 +颉 4850 +耘 4851 +忡 4852 +噼 4853 +睐 4854 +簋 4855 +镚 4856 +朐 4857 +戛 4858 +扪 4859 +鹩 4860 +稹 4861 +嗣 4862 +睇 4863 +弩 4864 +侥 4865 +绚 4866 +虔 4867 +溴 4868 +毂 4869 +漉 4870 +郧 4871 +杈 4872 +埭 4873 +哝 4874 +纾 4875 +箔 4876 +蚍 4877 +呋 4878 +喃 4879 +旌 4880 +袅 4881 +嫡 4882 +2 4883 +睢 4884 +榭 4885 +濉 4886 +雉 4887 +糍 4888 +谙 4889 +坻 4890 +遨 4891 +囔 4892 +鹜 4893 +垩 4894 +嵋 4895 +葑 4896 +叻 4897 +剌 4898 +铀 4899 +鲟 4900 +珏 4901 +唑 4902 +拴 4903 +乍 4904 +镊 4905 +歩 4906 +姘 4907 +戍 4908 +娈 4909 +槿 4910 +魇 4911 +叩 4912 +啾 4913 +腈 4914 +骞 4915 +殃 4916 +髋 4917 +嶙 4918 +璟 4919 +嚷 4920 +鹳 4921 +嗬 4922 +梆 4923 +晁 4924 +龛 4925 +嚎 4926 +熨 4927 +倭 4928 +峦 4929 +蜍 4930 +桉 4931 +齁 4932 +搀 4933 +铬 4934 +刽 4935 +謝 4936 +沒 4937 +簪 4938 +邺 4939 +嵬 4940 +馄 4941 +饨 4942 +蜢 4943 +嗒 4944 +芨 4945 +弶 4946 +晞 4947 +搔 4948 +昴 4949 +夙 4950 +徙 4951 +霾 4952 +嗖 4953 +碴 4954 +秧 4955 +芍 4956 +匝 4957 +泫 4958 +琯 4959 +扼 4960 +砒 4961 +栎 4962 +卟 4963 +琊 4964 +怯 4965 +侩 4966 +峯 4967 +忿 4968 +藁 4969 +蹼 4970 +毡 4971 +埤 4972 +膘 4973 +噗 4974 +阕 4975 +嘭 4976 +椿 4977 +涸 4978 +祯 4979 +芵 4980 +螨 4981 +寥 4982 +梶 4983 +嘈 4984 +泠 4985 +侏 4986 +棂 4987 +缶 4988 +捋 4989 +钜 4990 +璞 4991 +媞 4992 +唢 4993 +邰 4994 +蚱 4995 +薜 4996 +牒 4997 +缥 4998 +咿 4999 +遐 5000 +蕙 5001 +惬 5002 +惚 
5003 +硚 5004 +麽 5005 +踌 5006 +褂 5007 +蜉 5008 +蝣 5009 +腌 5010 +熘 5011 +缮 5012 +锢 5013 +犽 5014 +蹬 5015 +皈 5016 +剔 5017 +芪 5018 +妪 5019 +钇 5020 +仃 5021 +荏 5022 +苒 5023 +塾 5024 +阡 5025 +瑨 5026 +冢 5027 +匈 5028 +庶 5029 +荃 5030 +茬 5031 +妗 5032 +暹 5033 +犷 5034 +嵴 5035 +鳃 5036 +羲 5037 +岱 5038 +烩 5039 +勐 5040 +霁 5041 +厝 5042 +飚 5043 +瀛 5044 +炕 5045 +桅 5046 +垓 5047 +晌 5048 +黒 5049 +蚩 5050 +夔 5051 +垚 5052 +烊 5053 +眀 5054 +荼 5055 +蘼 5056 +尅 5057 +舫 5058 +拣 5059 +蹋 5060 +劭 5061 +耆 5062 +陡 5063 +樽 5064 +谒 5065 +觞 5066 +箩 5067 +槛 5068 +傈 5069 +僳 5070 +爻 5071 +皑 5072 +滘 5073 +嬅 5074 +丶 5075 +邋 5076 +遢 5077 +讴 5078 +隅 5079 +邃 5080 +谑 5081 +哔 5082 +矬 5083 +姣 5084 +凛 5085 +冽 5086 +殒 5087 +眈 5088 +鹧 5089 +鸪 5090 +飕 5091 +亘 5092 +篝 5093 +嘅 5094 +乜 5095 +黜 5096 +颇 5097 +鄄 5098 +蔫 5099 +贻 5100 +猝 5101 +绌 5102 +芈 5103 +隼 5104 +戆 5105 +鹫 5106 +霑 5107 +宕 5108 +凇 5109 +铨 5110 +町 5111 +礁 5112 +蕃 5113 +淖 5114 +搐 5115 +饴 5116 +榛 5117 +晔 5118 +祢 5119 +酥 5120 +丨 5121 +赂 5122 +噘 5123 +黍 5124 +幌 5125 +骅 5126 +黝 5127 +帧 5128 +胯 5129 +埙 5130 +敕 5131 +涓 5132 +掣 5133 +圪 5134 +榻 5135 +濛 5136 +擞 5137 +篡 5138 +榷 5139 +亟 5140 +渌 5141 +锹 5142 +啐 5143 +捍 5144 +嘣 5145 +跻 5146 +桠 5147 +贮 5148 +蛰 5149 +猖 5150 +骸 5151 +溉 5152 +铎 5153 +吁 5154 +溯 5155 +踞 5156 +俨 5157 +茌 5158 +蒯 5159 +篆 5160 +膺 5161 +垭 5162 +匮 5163 +撰 5164 +擂 5165 +倜 5166 +傥 5167 +蔑 5168 +弼 5169 +珮 5170 +颢 5171 +钿 5172 +迸 5173 +凿 5174 +湫 5175 +焯 5176 +硒 5177 +畈 5178 +觑 5179 +揶 5180 +禀 5181 +宦 5182 +杷 5183 +讷 5184 +踮 5185 +掳 5186 +窿 5187 +捻 5188 +褴 5189 +褛 5190 +瑛 5191 +胫 5192 +喋 5193 +沓 5194 +汛 5195 +掴 5196 +魉 5197 +馀 5198 +隗 5199 +咘 5200 +呱 5201 +獭 5202 +畊 5203 +莜 5204 +祐 5205 +轧 5206 +魍 5207 +昶 5208 +诓 5209 +囗 5210 +莠 5211 +岌 5212 +潸 5213 +涞 5214 +綦 5215 +畸 5216 +阄 5217 +遏 5218 +啶 5219 +冇 5220 +懋 5221 +煨 5222 +羱 5223 +诟 5224 +枳 5225 +鲶 5226 +燊 5227 +猷 5228 +铄 5229 +缰 5230 +搪 5231 +赊 5232 +诩 5233 +佼 5234 +钵 5235 +谌 5236 +嬗 5237 +砥 5238 +砺 5239 +觊 5240 +觎 5241 +颅 5242 +怵 5243 +疸 5244 +锆 5245 +缢 5246 +棣 5247 +蛎 5248 +鄯 5249 +茸 5250 +谶 5251 +蹶 5252 +侑 5253 +滂 5254 +襁 5255 +褓 5256 +杳 5257 +臊 5258 +摒 5259 +袂 5260 +掸 5261 +鹞 5262 +忱 5263 +湉 5264 +汩 5265 +剽 5266 +槌 5267 +塍 5268 +喟 5269 +讹 5270 +抡 5271 +烃 5272 +咁 5273 +珺 5274 +槎 5275 +砼 5276 +泯 5277 +泮 5278 +遴 5279 +匾 5280 +沏 5281 +悌 5282 +麾 5283 +垡 5284 +鏖 5285 +垅 5286 +斛 5287 +镂 5288 +骷 5289 +髅 5290 +豺 5291 +诿 5292 +狰 5293 +狞 5294 +泱 5295 +榫 5296 +嗤 5297 +瞥 5298 +揄 5299 +哌 5300 +婀 5301 +恸 5302 +蛐 5303 +镀 5304 +霈 5305 +钒 5306 +踱 5307 +淆 5308 +薹 5309 +纭 5310 +瘠 5311 +戾 5312 +夭 5313 +铰 5314 +渚 5315 +犇 5316 +舀 5317 +傣 5318 +獗 5319 +瞭 5320 +兢 5321 +犟 5322 +袒 5323 +铖 5324 +颚 5325 +徜 5326 +徉 5327 +囍 5328 +酆 5329 +铡 5330 +睽 5331 +裨 5332 +饕 5333 +躇 5334 +噱 5335 +赓 5336 +懊 5337 +蟊 5338 +趸 5339 +鄢 5340 +埝 5341 +椟 5342 +粳 5343 +跛 5344 +莴 5345 +娉 5346 +嗄 5347 +邙 5348 +渑 5349 +佶 5350 +颍 5351 +溆 5352 +诧 5353 +抨 5354 +憷 5355 +涠 5356 +痼 5357 +砀 5358 +剐 5359 +缙 5360 +鞑 5361 +坭 5362 +烬 5363 +唁 5364 +臼 5365 +瓮 5366 +袱 5367 +珩 5368 +蝌 5369 +蚪 5370 +诬 5371 +迥 5372 +楸 5373 +皿 5374 +蜷 5375 +遑 5376 +啖 5377 +篪 5378 +崮 5379 +讧 5380 +盹 5381 +瞑 5382 +鲳 5383 +谟 5384 +拮 5385 +琏 5386 +瞰 5387 +憩 5388 +馏 5389 +炷 5390 +眩 5391 +羿 5392 +洙 5393 +珲 5394 +愫 5395 +佯 5396 +舸 5397 +祎 5398 +旮 5399 +翌 5400 +畿 5401 +桎 5402 +梏 5403 +钳 5404 +鳍 5405 +犸 5406 +祉 5407 +缜 5408 +硌 5409 +殓 5410 +砾 5411 +酩 5412 +酊 5413 +兀 5414 +矸 5415 +髙 5416 +疝 5417 +膑 5418 +哂 5419 +僚 5420 +耷 5421 +窨 5422 +孳 5423 +鲠 5424 +淝 5425 +搡 5426 +伢 5427 +鲷 5428 +谕 5429 +頫 5430 +泺 5431 +谧 5432 +煳 5433 +萁 5434 +馕 5435 +鹌 5436 +鹑 5437 +钴 5438 +埇 5439 +摈 5440 +踵 5441 +冗 5442 +铣 5443 +萃 5444 +忤 5445 +揩 5446 +铧 
5447 +矗 5448 +闾 5449 +柞 5450 +貉 5451 +撺 5452 +掇 5453 +灞 5454 +醍 5455 +痱 5456 +粲 5457 +糠 5458 +讣 5459 +蹴 5460 +茆 5461 +螈 5462 +旻 5463 +蔼 5464 +咣 5465 +麸 5466 +涝 5467 +渥 5468 +垤 5469 +咭 5470 +玳 5471 +瑁 5472 +郏 5473 +纂 5474 +扞 5475 +峭 5476 +铩 5477 +锨 5478 +坩 5479 +埚 5480 +瑭 5481 +札 5482 +舛 5483 +臬 5484 +郯 5485 +晦 5486 +耄 5487 +耋 5488 +俚 5489 +鲭 5490 +柩 5491 +黟 5492 +骼 5493 +蛆 5494 +跋 5495 +俸 5496 +幡 5497 +愕 5498 +噙 5499 +峋 5500 +厩 5501 +夯 5502 +擢 5503 +枋 5504 +葳 5505 +偃 5506 +赝 5507 +昝 5508 +镉 5509 +嫔 5510 +潋 5511 +娓 5512 +郅 5513 +瘀 5514 +奄 5515 +荇 5516 +咂 5517 +痉 5518 +挛 5519 +祚 5520 +庖 5521 +纰 5522 +簌 5523 +淬 5524 +掮 5525 +俟 5526 +臾 5527 +雒 5528 +吋 5529 +颧 5530 +嗔 5531 +诘 5532 +焘 5533 +獾 5534 +氤 5535 +氲 5536 +鲲 5537 +麂 5538 +罹 5539 +澍 5540 +镳 5541 +囱 5542 +玷 5543 +嗳 5544 +擘 5545 +濂 5546 +逡 5547 +骛 5548 +镔 5549 +湍 5550 +讥 5551 +蹁 5552 +跹 5553 +淦 5554 +骰 5555 +疃 5556 +腓 5557 +嵇 5558 +怄 5559 +谯 5560 +啕 5561 +坯 5562 +钎 5563 +锒 5564 +伉 5565 +佻 5566 +腴 5567 +怼 5568 +浐 5569 +摹 5570 +僮 5571 +芾 5572 +矍 5573 +泔 5574 +蚬 5575 +屐 5576 +翕 5577 +唿 5578 +苋 5579 +氪 5580 +楔 5581 +莪 5582 +掬 5583 +舷 5584 +骐 5585 +嗲 5586 +荻 5587 +缱 5588 +绻 5589 +嫚 5590 +铟 5591 +饷 5592 +醐 5593 +伫 5594 +澶 5595 +郇 5596 +蹚 5597 +藿 5598 +鳕 5599 +蝈 5600 +钯 5601 +铍 5602 +骠 5603 +盅 5604 +蜊 5605 +腭 5606 +谘 5607 +孛 5608 +豇 5609 +囫 5610 +囵 5611 +抿 5612 +楣 5613 +廾 5614 +貔 5615 +貅 5616 +蛉 5617 +猹 5618 +蚴 5619 +轱 5620 +葚 5621 +胗 5622 +鸮 5623 +篦 5624 +谆 5625 +篑 5626 +莅 5627 +砷 5628 +蝾 5629 +疴 5630 +葺 5631 +瘴 5632 +滹 5633 +砭 5634 +噌 5635 +鸾 5636 +珙 5637 +碣 5638 +餮 5639 +荸 5640 +荠 5641 +犄 5642 +歙 5643 +樾 5644 +淙 5645 +痢 5646 +濯 5647 +轫 5648 +琮 5649 +啜 5650 +闳 5651 +椁 5652 +蓼 5653 +垴 5654 +唷 5655 +炔 5656 +峁 5657 +囹 5658 +尕 5659 +嗪 5660 +缎 5661 +拚 5662 +稔 5663 +牍 5664 +赳 5665 +忪 5666 +菖 5667 +佃 5668 +埂 5669 +宓 5670 +瞠 5671 +洹 5672 +锲 5673 +睑 5674 +攫 5675 +竽 5676 +蹩 5677 +慜 5678 +锉 5679 +羧 5680 +崧 5681 +醺 5682 +舐 5683 +讫 5684 +熵 5685 +▁GONNA 5686 +瘢 5687 +秭 5688 +跄 5689 +绀 5690 +懑 5691 +弭 5692 +萋 5693 +篁 5694 +缛 5695 +茭 5696 +吠 5697 +鲑 5698 +幔 5699 +潺 5700 +鹈 5701 +鹕 5702 +椴 5703 +哕 5704 +剜 5705 +湎 5706 +玑 5707 +槃 5708 +暌 5709 +蹒 5710 +跚 5711 +恣 5712 +磬 5713 +悭 5714 +劾 5715 +唳 5716 +绉 5717 +枇 5718 +蜱 5719 +瞟 5720 +膈 5721 +磴 5722 +嶂 5723 +苫 5724 +邡 5725 +骈 5726 +惴 5727 +硖 5728 +鳜 5729 +羸 5730 +秣 5731 +殚 5732 +桷 5733 +罔 5734 +颦 5735 +桁 5736 +鸩 5737 +孱 5738 +伥 5739 +愎 5740 +圄 5741 +贲 5742 +旖 5743 +荥 5744 +徇 5745 +镌 5746 +偈 5747 +敝 5748 +刎 5749 +跬 5750 +欸 5751 +髌 5752 +椤 5753 +觥 5754 +踟 5755 +斡 5756 +陉 5757 +谡 5758 +龅 5759 +鸨 5760 +豢 5761 +豉 5762 +悻 5763 +曈 5764 +茼 5765 +谗 5766 +忖 5767 +牯 5768 +痂 5769 +虢 5770 +馓 5771 +跖 5772 +聿 5773 +箅 5774 +塅 5775 +丼 5776 +獐 5777 +肏 5778 +逄 5779 +钡 5780 +叒 5781 +霭 5782 +鲮 5783 +凫 5784 +鹥 5785 +鳙 5786 +玦 5787 +蒡 5788 +嘬 5789 +鹗 5790 +鬄 5791 +鎏 5792 +嘤 5793 +绦 5794 +涔 5795 +齑 5796 +蒌 5797 +墘 5798 +俠 5799 +蛭 5800 +薅 5801 +叕 5802 +砧 5803 +嘧 5804 +媺 5805 +蚵 5806 +楽 5807 +浄 5808 +厍 5809 +鳊 5810 +泂 5811 +龋 5812 +瓒 5813 +瑧 5814 +邨 5815 +峣 5816 +蚺 5817 +鲉 5818 +滟 5819 +堑 5820 +豳 5821 +骧 5822 +艹 5823 +柾 5824 +鬣 5825 +眦 5826 +畦 5827 +虬 5828 +睨 5829 +飨 5830 +蘖 5831 +羟 5832 +瓤 5833 +岫 5834 +惇 5835 +鲵 5836 +痦 5837 +笤 5838 +憙 5839 +痩 5840 +煋 5841 +媤 5842 +佤 5843 +羮 5844 +鏊 5845 +昇 5846 +蛱 5847 +珅 5848 +庋 5849 +搵 5850 +旸 5851 +岿 5852 +亓 5853 +揸 5854 +谂 5855 +淠 5856 +糅 5857 +儆 5858 +苕 5859 +刿 5860 +呒 5861 +岙 5862 +荜 5863 +玧 5864 +鄠 5865 +讬 5866 +祕 5867 +箦 5868 +醚 5869 +膻 5870 +笕 5871 +蛏 5872 +哞 5873 +饸 5874 +饹 5875 +愻 5876 +汫 5877 +鹇 5878 +栉 5879 +沇 5880 +擤 5881 +徳 5882 +黢 5883 +狍 5884 +錫 5885 +暝 5886 +機 5887 +鉅 5888 +菓 5889 +廋 5890 
+橛 5891 +羣 5892 +笊 5893 +魃 5894 +掼 5895 +魑 5896 +靥 5897 +酔 5898 +铱 5899 +峄 5900 +哋 5901 +畹 5902 +鍪 5903 +髀 5904 +嚄 5905 +秾 5906 +苾 5907 +孓 5908 +汆 5909 +嗟 5910 +锺 5911 +睥 5912 +炝 5913 +怔 5914 +咛 5915 +巉 5916 +墒 5917 +岘 5918 +禛 5919 +陟 5920 +皲 5921 +萘 5922 +妣 5923 +芃 5924 +煸 5925 +郦 5926 +蒗 5927 +仝 5928 +抻 5929 +苜 5930 +蓿 5931 +鎵 5932 +減 5933 +燧 5934 +娭 5935 +毑 5936 +诂 5937 +烔 5938 +猗 5939 +哏 5940 +氙 5941 +匯 5942 +颛 5943 +鞣 5944 +笺 5945 +枖 5946 +忾 5947 +黉 5948 +埯 5949 +敩 5950 +玏 5951 +钺 5952 +纥 5953 +佈 5954 +沖 5955 +蚜 5956 +莨 5957 +菟 5958 +麇 5959 +沤 5960 +耦 5961 +赭 5962 +祊 5963 +璄 5964 +旼 5965 +浞 5966 +痄 5967 +蔸 5968 +璎 5969 +屛 5970 +錤 5971 +弢 5972 +绨 5973 +員 5974 +誕 5975 +祜 5976 +勍 5977 +浉 5978 +娑 5979 +呓 5980 +啉 5981 +嗐 5982 +弁 5983 +绺 5984 +撷 5985 +崑 5986 +诌 5987 +標 5988 +甯 5989 +俣 5990 +趔 5991 +趄 5992 +垠 5993 +赟 5994 +馐 5995 +畑 5996 +給 5997 +幣 5998 +產 5999 +恵 6000 +併 6001 +蒹 6002 +葭 6003 +後 6004 +瀍 6005 +愠 6006 +莛 6007 +蝰 6008 +鹮 6009 +逶 6010 +侪 6011 +蒽 6012 +巽 6013 +瓴 6014 +鲱 6015 +薙 6016 +過 6017 +億 6018 +車 6019 +鲇 6020 +淨 6021 +嗎 6022 +诨 6023 +靚 6024 +內 6025 +糁 6026 +錾 6027 +刈 6028 +滯 6029 +炆 6030 +徂 6031 +傩 6032 +鲺 6033 +叟 6034 +埗 6035 +篠 6036 +焐 6037 +暻 6038 +盃 6039 +髻 6040 +樘 6041 +墈 6042 +菉 6043 +巯 6044 +嘌 6045 +遒 6046 +鼋 6047 +匍 6048 +匐 6049 +臜 6050 +馔 6051 +鲎 6052 +獠 6053 +蟇 6054 +栄 6055 +騎 6056 +賽 6057 +場 6058 +幾 6059 +鐘 6060 +镛 6061 +鸶 6062 +镆 6063 +窸 6064 +庠 6065 +蒺 6066 +溱 6067 +倮 6068 +楪 6069 +帀 6070 +躶 6071 +洰 6072 +圉 6073 +圊 6074 +捨 6075 +谝 6076 +呔 6077 +勖 6078 +揖 6079 +喈 6080 +霰 6081 +觋 6082 +嫪 6083 +毐 6084 +繇 6085 +珐 6086 +馃 6087 +孃 6088 +逖 6089 +骶 6090 +喬 6091 +奧 6092 +風 6093 +裵 6094 +胍 6095 +確 6096 +揠 6097 +榀 6098 +聒 6099 +谪 6100 +歘 6101 +粿 6102 +舾 6103 +聩 6104 +嫘 6105 +砟 6106 +侉 6107 +捯 6108 +饬 6109 +囏 6110 +喙 6111 +笥 6112 +燿 6113 +鮀 6114 +芡 6115 +蛄 6116 +铳 6117 +挲 6118 +笞 6119 +廿 6120 +蠹 6121 +湋 6122 +暎 6123 +霙 6124 +颔 6125 +苁 6126 +啮 6127 +囖 6128 +寤 6129 +炟 6130 +乩 6131 +熥 6132 +桡 6133 +阈 6134 +孖 6135 +鐢 6136 +衾 6137 +怍 6138 +沆 6139 +囿 6140 +胬 6141 +陲 6142 +缦 6143 +誇 6144 +醮 6145 +箬 6146 +盥 6147 +鹘 6148 +诳 6149 +氡 6150 +狎 6151 +枧 6152 +谄 6153 +芣 6154 +苢 6155 +俤 6156 +誊 6157 +殄 6158 +辋 6159 +係 6160 +迤 6161 +謦 6162 +簰 6163 +滓 6164 +嬢 6165 +倏 6166 +睺 6167 +滏 6168 +脘 6169 +嗙 6170 +谥 6171 +歃 6172 +锃 6173 +欻 6174 +挼 6175 +襙 6176 +檄 6177 +龇 6178 +楫 6179 +咵 6180 +徭 6181 +闱 6182 +嚅 6183 +鳑 6184 +鲏 6185 +佞 6186 +箜 6187 +篌 6188 +蹑 6189 +喑 6190 +胄 6191 +鞥 6192 +蟥 6193 +骢 6194 +蹙 6195 +柰 6196 +蕤 6197 +癸 6198 +哙 6199 +睚 6200 +绾 6201 +篾 6202 +鳏 6203 +谲 6204 +袤 6205 +翳 6206 +蹰 6207 +槊 6208 +黠 6209 +姒 6210 +锱 6211 +猢 6212 +狲 6213 +粝 6214 +戕 6215 +茕 6216 +瀣 6217 +踽 6218 +绶 6219 +媾 6220 +舢 6221 +螯 6222 +茏 6223 +廪 6224 +诰 6225 +辇 6226 +琚 6227 +汜 6228 +洇 6229 +還 6230 +遽 6231 +槁 6232 +靼 6233 +髡 6234 +鸬 6235 +鹚 6236 +捭 6237 +黩 6238 +俶 6239 +個 6240 +圜 6241 +颞 6242 +苻 6243 +恽 6244 +腧 6245 +甾 6246 +辎 6247 +顼 6248 +阗 6249 +鬻 6250 +鬶 6251 +沔 6252 +狃 6253 +#0 6254 +#1 6255 +#2 6256 \ No newline at end of file diff --git a/examples/zipformer/python/convert.py b/examples/zipformer/python/convert.py new file mode 100644 index 00000000..06c83e41 --- /dev/null +++ b/examples/zipformer/python/convert.py @@ -0,0 +1,70 @@ +import sys +from rknn.api import RKNN + +DEFAULT_QUANT = False + +def parse_arg(): + if len(sys.argv) < 3: + print("Usage: python3 {} onnx_model_path [platform] [dtype(optional)] [output_rknn_path(optional)]".format(sys.argv[0])) + print(" platform choose from [rk3562,rk3566,rk3568,rk3576,rk3588]") + print(" dtype choose from [fp] for [rk3562,rk3566,rk3568,rk3576,rk3588]") + exit(1) 
+ + model_path = sys.argv[1] + platform = sys.argv[2] + + do_quant = DEFAULT_QUANT + if len(sys.argv) > 3: + model_type = sys.argv[3] + if model_type not in ['i8', 'u8', 'fp']: + print("ERROR: Invalid model type: {}".format(model_type)) + exit(1) + elif model_type in ['i8', 'u8']: + do_quant = True + else: + do_quant = False + + if len(sys.argv) > 4: + output_path = sys.argv[4] + else: + output_path = model_path.replace('.onnx', '.rknn') + + return model_path, platform, do_quant, output_path + +if __name__ == '__main__': + model_path, platform, do_quant, output_path = parse_arg() + + # Create RKNN object + rknn = RKNN(verbose=False) + + # Pre-process config + print('--> Config model') + rknn.config(target_platform=platform) + print('done') + + # Load model + print('--> Loading model') + ret = rknn.load_onnx(model=model_path) + if ret != 0: + print('Load model failed!') + exit(ret) + print('done') + + # Build model + print('--> Building model') + ret = rknn.build(do_quantization=do_quant) + if ret != 0: + print('Build model failed!') + exit(ret) + print('done') + + # Export rknn model + print('--> Export rknn model') + ret = rknn.export_rknn(output_path) + if ret != 0: + print('Export rknn model failed!') + exit(ret) + print('done') + + # Release + rknn.release() diff --git a/examples/zipformer/python/zipformer.py b/examples/zipformer/python/zipformer.py new file mode 100644 index 00000000..64e86843 --- /dev/null +++ b/examples/zipformer/python/zipformer.py @@ -0,0 +1,314 @@ +import onnxruntime +from rknn.api import RKNN +import torch +import kaldifeat +import soundfile as sf +import numpy as np +import argparse +import scipy + +class BaseModel(): + def __init__(self): + self.model_config = {'x': [1, 103, 80], 'cached_len_0': [2, 1], 'cached_len_1': [2, 1], 'cached_len_2': [2, 1], 'cached_len_3': [2, 1], + 'cached_len_4': [2, 1], 'cached_avg_0': [2, 1, 256], 'cached_avg_1': [2, 1, 256], 'cached_avg_2': [2, 1, 256], + 'cached_avg_3': [2, 1, 256], 'cached_avg_4': [2, 1, 256], 'cached_key_0': [2, 192, 1, 192], 'cached_key_1': [2, 96, 1, 192], + 'cached_key_2': [2, 48, 1, 192], 'cached_key_3': [2, 24, 1, 192], 'cached_key_4': [2, 96, 1, 192], 'cached_val_0': [2, 192, 1, 96], + 'cached_val_1': [2, 96, 1, 96], 'cached_val_2': [2, 48, 1, 96], 'cached_val_3': [2, 24, 1, 96], 'cached_val_4': [2, 96, 1, 96], + 'cached_val2_0': [2, 192, 1, 96], 'cached_val2_1': [2, 96, 1, 96], 'cached_val2_2': [2, 48, 1, 96], 'cached_val2_3': [2, 24, 1, 96], + 'cached_val2_4': [2, 96, 1, 96], 'cached_conv1_0': [2, 1, 256, 30], 'cached_conv1_1': [2, 1, 256, 30], 'cached_conv1_2': [2, 1, 256, 30], + 'cached_conv1_3': [2, 1, 256, 30], 'cached_conv1_4': [2, 1, 256, 30], 'cached_conv2_0': [2, 1, 256, 30], 'cached_conv2_1': [2, 1, 256, 30], + 'cached_conv2_2': [2, 1, 256, 30], 'cached_conv2_3': [2, 1, 256, 30], 'cached_conv2_4': [2, 1, 256, 30]} + + def init_encoder_input(self): + self.encoder_input = [] + self.encoder_input_dict = {} + for input_name in self.model_config: + if 'cached_len' in input_name: + data = np.zeros((self.model_config[input_name]), dtype=np.int64) + self.encoder_input.append(data) + self.encoder_input_dict[input_name] = data + else: + data = np.zeros((self.model_config[input_name]), dtype=np.float32) + self.encoder_input.append(data) + self.encoder_input_dict[input_name] = data + + def update_encoder_input(self, out, model_type): + for idx, input_name in enumerate(self.encoder_input_dict): + if idx == 0: + continue + if idx > 10 and model_type == 'rknn': + data = self.convert_nchw_to_nhwc(out[idx]) + 
else: + data = out[idx] + self.encoder_input[idx] = data + self.encoder_input_dict[input_name] = data + + def convert_nchw_to_nhwc(self, src): + dst = np.transpose(src, (0, 2, 3, 1)) + return dst + + def init_model(self, model_path, target, device_id): + pass + + def release_model(self): + pass + + def run_encoder(self, x): + pass + + def run_decoder(self, decoder_input): + pass + + def run_joiner(self, encoder_out, decoder_out): + pass + + def run_greedy_search(self, frames, context_size, decoder_out, hyp, num_processed_frames, timestamp, frame_offset): + encoder_out = self.run_encoder(frames) + encoder_out = encoder_out.squeeze(0) + + blank_id = 0 + unk_id = 2 + if decoder_out is None and hyp is None: + hyp = [blank_id] * context_size + decoder_input = np.array([hyp], dtype=np.int64) + decoder_out = self.run_decoder(decoder_input) + + T = encoder_out.shape[0] + for t in range(T): + cur_encoder_out = encoder_out[t: t + 1] + joiner_out = self.run_joiner(cur_encoder_out, decoder_out).squeeze(0) + y = np.argmax(joiner_out, axis=0) + if y != blank_id and y != unk_id: + timestamp.append(frame_offset + t) + hyp.append(y) + decoder_input = hyp[-context_size:] + decoder_input = np.array([decoder_input], dtype=np.int64) + decoder_out = self.run_decoder(decoder_input) + frame_offset += T + + return hyp, decoder_out, timestamp, frame_offset + +class OnnxModel(BaseModel): + def __init__( + self, + encoder_model_path, + decoder_model_path, + joiner_model_path, + target, + device_id + ): + super().__init__() + + self.encoder = self.init_model(encoder_model_path, target, device_id) + self.decoder = self.init_model(decoder_model_path, target, device_id) + self.joiner = self.init_model(joiner_model_path, target, device_id) + + def init_model(self, model_path, target, device_id): + model = onnxruntime.InferenceSession(model_path, providers=['CPUExecutionProvider']) + return model + + def release_model(self): + del self.encoder + del self.decoder + del self.joiner + self.encoder = None + self.decoder = None + self.joiner = None + + def run_encoder(self, x): + self.encoder_input[0] = x.numpy() + self.encoder_input_dict['x'] = x.numpy() + out = self.encoder.run(None, self.encoder_input_dict) + self.update_encoder_input(out, 'onnx') + return out[0] + + def run_decoder(self, decoder_input): + out = self.decoder.run(None, {self.decoder.get_inputs()[0].name: decoder_input})[0] + return out + + def run_joiner(self, encoder_out, decoder_out): + out = self.joiner.run(None, {self.joiner.get_inputs()[0].name: encoder_out, + self.joiner.get_inputs()[1].name: decoder_out})[0] + return out + +class RKNNModel(BaseModel): + def __init__( + self, + encoder_model_path, + decoder_model_path, + joiner_model_path, + target, + device_id + ): + super().__init__() + + self.encoder = self.init_model(encoder_model_path, target, device_id) + self.decoder = self.init_model(decoder_model_path, target, device_id) + self.joiner = self.init_model(joiner_model_path, target, device_id) + + def init_model(self, model_path, target, device_id): + # Create RKNN object + rknn = RKNN(verbose=False) + + # Load RKNN model + print('--> Loading model') + ret = rknn.load_rknn(model_path) + if ret != 0: + print('Load RKNN model \"{}\" failed!'.format(model_path)) + exit(ret) + print('done') + + # init runtime environment + print('--> Init runtime environment') + ret = rknn.init_runtime( + target=target, device_id=device_id) + if ret != 0: + print('Init runtime environment failed') + exit(ret) + + return rknn + + def release_model(self): + 
self.encoder.release() + self.decoder.release() + self.joiner.release() + self.encoder = None + self.decoder = None + self.joiner = None + + def run_encoder(self, x): + self.encoder_input[0] = x.numpy() + self.encoder_input_dict['x'] = x.numpy() + out = self.encoder.inference(inputs=self.encoder_input) + self.update_encoder_input(out, 'rknn') + return out[0] + + def run_decoder(self, decoder_input): + out = self.decoder.inference(inputs=decoder_input)[0] + return out + + def run_joiner(self, encoder_out, decoder_out): + out = self.joiner.inference(inputs=[encoder_out, decoder_out])[0] + return out + +def read_vocab(tokens_file): + with open(tokens_file, 'r') as f: + vocab = {} + for line in f: + if len(line.strip().split(' ')) < 2: + key = line.strip().split(' ')[0] + value = "" + else: + value, key = line.strip().split(' ') + vocab[key] = value + return vocab + +def set_model(args): + if args.encoder_model_path.endswith(".rknn") \ + and args.decoder_model_path.endswith(".rknn") and args.decoder_model_path.endswith(".rknn"): + model = RKNNModel(args.encoder_model_path, args.decoder_model_path, args.joiner_model_path, + target=args.target, device_id=args.device_id) + + elif args.encoder_model_path.endswith(".onnx") \ + and args.decoder_model_path.endswith(".onnx") and args.decoder_model_path.endswith(".onnx"): + model = OnnxModel(args.encoder_model_path, args.decoder_model_path, args.joiner_model_path, + target=args.target, device_id=args.device_id) + return model + +def run_model(model, audio_data): + # Set kaldifeat config + opts = kaldifeat.FbankOptions() + opts.frame_opts.samp_freq = 16000 # sample_rate=16000 + opts.mel_opts.num_bins = 80 + opts.mel_opts.high_freq = -400 + opts.frame_opts.dither = 0 + opts.frame_opts.snip_edges = False + fbank = kaldifeat.OnlineFbank(opts) + + # Inference + num_processed_frames = 0 + segment = 103 + offset = 96 + context_size = 2 + hyp = None + decoder_out = None + + fbank.accept_waveform(sampling_rate=sample_rate, waveform=audio_data) + num_frames = fbank.num_frames_ready + timestamp = [] + frame_offset = 0 + + while num_frames - num_processed_frames > 0: + if (num_frames - num_processed_frames) < segment: + tail_padding_len = (segment - (num_frames - num_processed_frames)) / 100.0 + tail_padding = torch.zeros(int(tail_padding_len * sample_rate), dtype=torch.float32) + fbank.accept_waveform(sampling_rate=sample_rate, waveform=tail_padding) + frames = [] + for i in range(segment): + frames.append(fbank.get_frame(num_processed_frames + i)) + + frames = torch.cat(frames, dim=0) + frames = frames.unsqueeze(0) + hyp, decoder_out, timestamp, frame_offset = model.run_greedy_search(frames, context_size, decoder_out, hyp, num_processed_frames, timestamp, frame_offset) + num_processed_frames += offset + + return hyp[context_size:], timestamp + +def post_process(hyp, vocab, timestamp): + text = "" + for i in hyp: + text += vocab[str(i)] + text = text.replace("▁", " ").strip() + + frame_shift_ms = 10 + subsampling_factor = 4 + frame_shift_s = frame_shift_ms / 1000.0 * subsampling_factor + real_timestamp = [round(frame_shift_s*t, 2) for t in timestamp] + return text, real_timestamp + +def ensure_sample_rate(waveform, original_sample_rate, desired_sample_rate=16000): + if original_sample_rate != desired_sample_rate: + print("resample_audio: {} HZ -> {} HZ".format(original_sample_rate, desired_sample_rate)) + desired_length = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate)) + waveform = scipy.signal.resample(waveform, desired_length) + 
return waveform, desired_sample_rate + +def ensure_channels(waveform, original_channels, desired_channels=1): + if original_channels != desired_channels: + print("convert_channels: {} -> {}".format(original_channels, desired_channels)) + waveform = np.mean(waveform, axis=1) + return waveform, desired_channels + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Zipformer Python Demo") + # basic params + parser.add_argument("--encoder_model_path", type=str, required=True, help="encoder model path, could be .onnx or .rknn file") + parser.add_argument("--decoder_model_path", type=str, required=True, help="decoder model path, could be .onnx or .rknn file") + parser.add_argument("--joiner_model_path", type=str, required=True, help="joiner model path, could be .onnx or .rknn file") + parser.add_argument("--target", type=str, default="rk3588", help="target RKNPU platform") + parser.add_argument("--device_id", type=str, default=None, help="device id") + args = parser.parse_args() + + # Set inputs + vocab = read_vocab("../model/vocab.txt") + audio_data, sample_rate = sf.read("../model/test.wav") + channels = audio_data.ndim + audio_data, channels = ensure_channels(audio_data, channels) + audio_data, sample_rate = ensure_sample_rate(audio_data, sample_rate) + audio_data = torch.tensor(audio_data, dtype=torch.float32) + + # Set model + model = set_model(args) + model.init_encoder_input() + + # Run model + hyp, timestamp = run_model(model, audio_data) + + # Post process + text, timestamp = post_process(hyp, vocab, timestamp) + print("\nTimestamp (s):", timestamp) + print("\nZipformer output:", text) + + # Release model + model.release_model() \ No newline at end of file diff --git a/py_utils/onnx_executor.py b/py_utils/onnx_executor.py index 772dcace..e0dcedaf 100644 --- a/py_utils/onnx_executor.py +++ b/py_utils/onnx_executor.py @@ -36,8 +36,8 @@ def __init__(self, model_path) -> None: self.sess = rt.InferenceSession(model_path, sess_options=sp_options, providers=['CPUExecutionProvider']) self.model_path = model_path - def __del__(self): - self.release() + # def __del__(self): + # self.release() def run(self, input_datas): if self.sess is None: diff --git a/py_utils/pytorch_executor.py b/py_utils/pytorch_executor.py index f509119a..321f73e4 100644 --- a/py_utils/pytorch_executor.py +++ b/py_utils/pytorch_executor.py @@ -21,8 +21,8 @@ def __init__(self, model_path, qnnpack=False) -> None: self.pt_model = torch.jit.load(model_path) self.pt_model.eval() - def __del__(self): - self.release() + # def __del__(self): + # self.release() def run(self, input_datas): if self.pt_model is None: diff --git a/py_utils/rknn_executor.py b/py_utils/rknn_executor.py index d65461b6..21b4132c 100644 --- a/py_utils/rknn_executor.py +++ b/py_utils/rknn_executor.py @@ -20,8 +20,8 @@ def __init__(self, model_path, target=None, device_id=None) -> None: self.rknn = rknn - def __del__(self): - self.release() + # def __del__(self): + # self.release() def run(self, inputs): if self.rknn is None: diff --git a/utils/audio_utils.c b/utils/audio_utils.c index 8023a154..7521ced8 100644 --- a/utils/audio_utils.c +++ b/utils/audio_utils.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "audio_utils.h" int read_audio(const char *path, audio_buffer_t *audio) @@ -68,4 +69,58 @@ int save_audio(const char *path, float *data, int num_frames, int sample_rate, i sf_close(outfile); return 0; -} \ No newline at end of file +} + +int resample_audio(audio_buffer_t *audio, int original_sample_rate, int 
desired_sample_rate) +{ + int original_length = audio->num_frames; + int out_length = round(original_length * (double)desired_sample_rate / (double)original_sample_rate); + printf("resample_audio: %d HZ -> %d HZ \n", original_sample_rate, desired_sample_rate); + + float *resampled_data = (float *)malloc(out_length * sizeof(float)); + if (!resampled_data) + { + return -1; + } + + for (int i = 0; i < out_length; ++i) + { + double src_index = i * (double)original_sample_rate / (double)desired_sample_rate; + int left_index = (int)floor(src_index); + int right_index = (left_index + 1 < original_length) ? left_index + 1 : left_index; + double fraction = src_index - left_index; + resampled_data[i] = (1.0f - fraction) * audio->data[left_index] + fraction * audio->data[right_index]; + } + + audio->num_frames = out_length; + free(audio->data); + audio->data = resampled_data; + + return 0; +} + +int convert_channels(audio_buffer_t *audio) +{ + + int original_num_channels = audio->num_channels; + printf("convert_channels: %d -> %d \n", original_num_channels, 1); + + float *converted_data = (float *)malloc(audio->num_frames * sizeof(float)); + if (!converted_data) + { + return -1; + } + + for (int i = 0; i < audio->num_frames; ++i) + { + float left = audio->data[i * 2]; + float right = audio->data[i * 2 + 1]; + converted_data[i] = (left + right) / 2.0f; + } + + audio->num_channels = 1; + free(audio->data); + audio->data = converted_data; + + return 0; +} diff --git a/utils/audio_utils.h b/utils/audio_utils.h index e692d0d6..9f587d57 100644 --- a/utils/audio_utils.h +++ b/utils/audio_utils.h @@ -35,6 +35,34 @@ int read_audio(const char *path, audio_buffer_t *audio); */ int save_audio(const char *path, float *data, int num_frames, int sample_rate, int num_channels); +/** + * @brief Resamples audio data to a desired sample rate. + * + * This function adjusts the sample rate of the provided audio data from + * the original sample rate to the desired sample rate. The audio data + * is assumed to be in a format compatible with the processing logic. + * + * @param audio [in/out] Pointer to the audio buffer structure containing + * the audio data to be resampled. + * @param original_sample_rate [in] The original sample rate of the audio data. + * @param desired_sample_rate [in] The target sample rate to resample the audio data to. + * @return int 0 on success, -1 on error. + */ +int resample_audio(audio_buffer_t *audio, int original_sample_rate, int desired_sample_rate); + +/** + * @brief Converts audio data to a single channel (mono). + * + * This function takes two-channel audio data and converts it to single + * channel (mono) by averaging the channels or using another merging strategy. + * The audio data will be modified in place. + * + * @param audio [in/out] Pointer to the audio buffer structure containing + * the audio data to be converted. + * @return int 0 on success, -1 on error. 
+ */ +int convert_channels(audio_buffer_t *audio); + #ifdef __cplusplus } // extern "C" #endif diff --git a/utils/image_utils.c b/utils/image_utils.c index 8b25e387..545c4a16 100644 --- a/utils/image_utils.c +++ b/utils/image_utils.c @@ -671,9 +671,9 @@ int convert_image(image_buffer_t* src_img, image_buffer_t* dst_img, image_rect_t #else #if defined(RV1106_1103) - if(src_img->width % 4 == 0) { + if(src_img->width % 4 == 0 && dst_img->width % 4 == 0) { #else - if(src_img->width % 16 == 0) { + if(src_img->width % 16 == 0 && dst_img->width % 16 == 0) { #endif ret = convert_image_rga(src_img, dst_img, src_box, dst_box, color); if (ret != 0) { @@ -681,7 +681,7 @@ int convert_image(image_buffer_t* src_img, image_buffer_t* dst_img, image_rect_t ret = convert_image_cpu(src_img, dst_img, src_box, dst_box, color); } } else { - printf("src width is not 16-aligned, convert image use cpu\n"); + printf("src width is not 4/16-aligned, convert image use cpu\n"); ret = convert_image_cpu(src_img, dst_img, src_box, dst_box, color); } #endif
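Note on the streaming loop in run_model() (examples/zipformer/python/zipformer.py): the encoder consumes fixed windows of segment = 103 fbank frames and the loop advances by offset = 96 frames per call; when fewer than 103 frames remain, the waveform is zero-padded so one last full segment can be built. Because fbank frames are emitted every 10 ms (100 frames per second, matching the frame_shift_ms = 10 used in post_process()), the padding length in samples works out as in the sketch below. This helper is not part of the diff; the constants are taken from run_model() and the 16 kHz rate from ensure_sample_rate().

def tail_padding_samples(num_frames, num_processed_frames,
                         segment=103, sample_rate=16000):
    # How many zero samples run_model() appends when the final window is short.
    remaining = num_frames - num_processed_frames
    missing_frames = segment - remaining      # fbank frames short of a full segment
    seconds = missing_frames / 100.0          # 10 ms frame shift -> 100 frames per second
    return int(seconds * sample_rate)         # zero samples appended to the waveform

# Example: 250 frames ready, 192 already consumed -> 58 remain, 45 frames short,
# i.e. 0.45 s of padding = 7200 zero samples at 16 kHz.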
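Similarly, post_process() converts greedy-search frame indices to seconds using frame_shift_ms = 10 and subsampling_factor = 4, i.e. each encoder output frame spans 40 ms of audio. A small worked example with hypothetical frame indices:

frame_shift_s = 10 / 1000.0 * 4             # 0.04 s per encoder output frame
timestamps = [3, 25, 70]                    # hypothetical indices from run_greedy_search()
print([round(frame_shift_s * t, 2) for t in timestamps])   # -> [0.12, 1.0, 2.8]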
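On the C side, the new resample_audio() in utils/audio_utils.c resamples by linear interpolation between the two nearest source samples, whereas the Python demo's ensure_sample_rate() uses scipy.signal.resample (FFT-based), so the two paths will not produce bit-identical waveforms. A minimal NumPy sketch of the same linear-interpolation scheme, written only to illustrate the C logic (the function name is illustrative and mono float32 input is assumed):

import numpy as np

def resample_linear(data, original_sr, desired_sr):
    # Mirror of the resample_audio() C helper: blend the two nearest source samples.
    out_length = int(round(len(data) * desired_sr / original_sr))
    src_index = np.arange(out_length) * (original_sr / desired_sr)  # fractional source positions
    left = np.floor(src_index).astype(int)
    right = np.minimum(left + 1, len(data) - 1)                     # clamp at the last sample
    frac = src_index - left
    return (1.0 - frac) * data[left] + frac * data[right]

convert_channels() in the same file downmixes stereo by averaging the left/right samples, which is equivalent to the np.mean(waveform, axis=1) used by ensure_channels() in the Python demo.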