xiph · Developer-Ecosystem-Engineering · Jan 2, 2024
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
@@ -160,6 +160,7 @@ FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_33bit
 #   ifdef FLAC__CPU_ARM64
 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
+FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_neon(const FLAC__int32 *  data, uint32_t data_len, const FLAC__int32  qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 #   endif
 
 #  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN

diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c
@@ -1268,6 +1268,62 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
     return;
 }
 
+/* Computes residual[i] = data[i] - (prediction >> lp_quantization) with 64-bit sums, for i in [0, data_len).
+ * Reads data[-order .. data_len-1]. Returns false at the first residual <= INT32_MIN or > INT32_MAX, else true. */
+FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_neon(const FLAC__int32 *  data, uint32_t data_len, const FLAC__int32  qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[])
+{
+	uint32_t i, j, k, data_len_unroll;
+	const FLAC__int32 *pData;
+	FLAC__int32 ordered_qlp_coeff[FLAC__MAX_LPC_ORDER]; /* qlp_coeff copied in reversed order */
+	FLAC__int64 sum[4], residual_to_check[4]; /* one accumulator and one candidate residual per sample of a group of four */
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	/* Reverse the order of the qlp coefficients so that their indices increase together with the
+	 * corresponding data indices; this allows the compiler to autovectorize the multiplications. */
+	for(j = 0; j < order; j++) {
+		ordered_qlp_coeff[j] = qlp_coeff[order - 1 - j];
+	}
+
+	data_len_unroll = data_len & (-4); /* data_len rounded down to a multiple of 4 */
+	for(i = 0; i < data_len_unroll; i += 4) {
+		pData = data + i + 0 - (int)order; /* pData[0] is data[i - order] */
+		sum[0] = sum[1] = sum[2] = sum[3] = 0;
+
+		for(j = 0; j < order; j++) {
+			for(k = 0; k < 4; k++) {
+				sum[k] += ordered_qlp_coeff[j] * (FLAC__int64)pData[j + k]; /* the cast makes the product 64-bit */
+			}
+		}
+		/* residual must fit in 32 bits and must not be INT32_MIN because abs(INT32_MIN) is undefined */
+		for(k = 0; k < 4; k++) {
+			residual_to_check[k] = data[i + k] - (sum[k] >> lp_quantization);
+			if(residual_to_check[k] <= INT32_MIN || residual_to_check[k] > INT32_MAX)
+				return false; /* residual[] entries stored before this point stay written */
+			residual[i + k] = residual_to_check[k];
+		}
+	}
+
+	/* Remaining 0 to 3 samples of data_len that did not fill a group of four, handled one at a time */
+	for( ; i < data_len; i++) {
+		pData = data + i + 0 - (int)order; /* pData[0] is data[i - order] */
+		sum[0] = 0;
+
+		for(j = 0; j < order; j++) {
+			sum[0] += ordered_qlp_coeff[j] * (FLAC__int64)pData[j];
+		}
+		/* residual must fit in 32 bits and must not be INT32_MIN because abs(INT32_MIN) is undefined */
+		residual_to_check[0] = data[i] - (sum[0] >> lp_quantization);
+		if(residual_to_check[0] <= INT32_MIN || residual_to_check[0] > INT32_MAX)
+			return false;
+		residual[i] = residual_to_check[0];
+	}
+
+	return true;
+}
+
+
 #endif /* FLAC__CPU_ARM64 && FLAC__HAS_ARCH64INTRIN */
 #endif /* FLAC__NO_ASM */
 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
@@ -448,6 +448,7 @@ typedef struct FLAC__StreamEncoderPrivate {
 	void (*local_lpc_compute_residual_from_qlp_coefficients)(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 	void (*local_lpc_compute_residual_from_qlp_coefficients_64bit)(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
 	void (*local_lpc_compute_residual_from_qlp_coefficients_16bit)(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
+	FLAC__bool (*local_lpc_compute_residual_from_qlp_coefficients_limit_residual)(const FLAC__int32 * flac_restrict data, uint32_t data_len, const FLAC__int32 * flac_restrict qlp_coeff, uint32_t order, int lp_quantization, FLAC__int32 * flac_restrict residual);
 #endif
 	FLAC__bool disable_mmx;
 	FLAC__bool disable_sse2;
@@ -981,6 +982,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients;
 	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide;
 	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients;
+	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_limit_residual = FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual;
 #endif
 	/* now override with asm where appropriate */
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
@@ -996,9 +998,10 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 	else
 		encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
 #endif
-    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
-    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
-    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
+    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit 			= FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
+    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       			= FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
+    encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit 			= FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
+	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_limit_residual	= FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_neon;
 #endif /* defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN */
 
 	if(encoder->private_->cpuinfo.use_asm) {
@@ -4511,7 +4514,7 @@ uint32_t evaluate_lpc_subframe_(
 
 	if(FLAC__lpc_max_residual_bps(subframe_bps, qlp_coeff, order, quantization) > 32) {
 		if(subframe_bps <= 32){
-			if(!FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual(((FLAC__int32 *)signal)+order, residual_samples, qlp_coeff, order, quantization, residual))
+			if(!encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_limit_residual(((FLAC__int32 *)signal)+order, residual_samples, qlp_coeff, order, quantization, residual))
 				return 0;
 		}
 		else