@@ -14,11 +14,13 @@ LOG_MODULE_DECLARE(asrc, CONFIG_SOF_LOG_LEVEL);
1414void asrc_fir_filter16 (struct asrc_farrow * src_obj , int16_t * * output_buffers ,
1515 int index_output_frame )
1616{
17- ae_f32x2 prod ;
18- ae_f32x2 filter01 = AE_ZERO32 (); /* Note: Init is not needed */
19- ae_f32x2 filter23 = AE_ZERO32 (); /* Note: Init is not needed */
20- ae_f16x4 buffer0123 = AE_ZERO16 (); /* Note: Init is not needed */
21- ae_f32x2 * filter_p ;
17+ ae_valignx2 align_filter ;
18+ ae_valign align_buffer ;
19+ ae_int64 prod ;
20+ ae_f32x2 filter01 ;
21+ ae_f32x2 filter23 ;
22+ ae_f16x4 buffer0123 ;
23+ ae_int32x4 * filter_p ;
2224 ae_f16x4 * buffer_p ;
2325 int n_limit ;
2426 int ch ;
@@ -39,60 +41,39 @@ void asrc_fir_filter16(struct asrc_farrow *src_obj, int16_t **output_buffers,
3941 /* Iterate over each channel */
4042 for (ch = 0 ; ch < src_obj -> num_channels ; ch ++ ) {
4143 /* Pointer to the beginning of the impulse response */
42- filter_p = (ae_f32x2 * )& src_obj -> impulse_response [0 ];
44+ filter_p = (ae_int32x4 * )& src_obj -> impulse_response [0 ];
4345
4446 /* Pointer to the buffered input data */
4547 buffer_p =
4648 (ae_f16x4 * )& src_obj -> ring_buffers16 [ch ]
4749 [src_obj -> buffer_write_position ];
4850
4951 /* Allows unaligned load of 64 bit per cycle */
50- ae_valign align_filter = AE_LA64_PP (filter_p );
51- ae_valign align_buffer = AE_LA64_PP (buffer_p );
52+ align_filter = AE_LA128_PP (filter_p );
53+ align_buffer = AE_LA64_PP (buffer_p );
5254
5355 /* Initialise the accumulator */
54- prod = AE_ZERO32 ();
56+ prod = AE_ZERO64 ();
5557
5658 /* Iterate over the filter bins */
5759 for (n = 0 ; n < n_limit ; n ++ ) {
5860 /* Read four buffered samples at once */
5961 AE_LA16X4_IP (buffer0123 , align_buffer , buffer_p );
6062
6163 /* Load four bins of the impulse response */
62- AE_LA32X2_IP (filter01 , align_filter , filter_p );
63- AE_LA32X2_IP (filter23 , align_filter , filter_p );
64-
65- /* Multiply and accumulate
66- * the lower half bits in 'buffer0123' are used
67- */
68- AE_MULAFP32X16X2RS_L (prod , filter23 , buffer0123 );
69- /* the upper half bits in 'buffer0123' are used */
70- AE_MULAFP32X16X2RS_H (prod , filter01 , buffer0123 );
71- }
72-
73- /* Shift left after accumulation, because interim
74- * results might saturate during filtering prod = prod
75- * << 1; will shift after last addition
76- */
77-
78- /* swap LL and HH reusing filter01 to perform
79- * saturated addition of both halves
80- */
81- filter01 = AE_SEL32_LH (prod , prod );
64+ AE_LA32X2X2_IP (filter01 , filter23 , align_filter , filter_p );
8265
83- /* Add up the lower and upper 32 bit data of the
84- * 'prod' prod = AE_ADD32_HL_LH(prod, prod); fix using
85- * saturated addition
86- */
87- prod = AE_ADD32S (prod , filter01 );
66+ /* Multiply and accumulate */
67+ AE_MULAAAAFQ32X16 (prod , filter01 , filter23 , buffer0123 );
68+ }
8869
89- /* Shift with saturation */
90- prod = AE_SLAI32S (prod , 1 );
70+ /* Shift with saturation, use filter01 as scratch */
71+ filter01 = AE_SLAI32S (AE_ROUND32F48SASYM ( prod ) , 1 );
9172
9273 /* Round 'prod' to 16 bit and store it in
9374 * (de-)interleaved format in the output buffers
9475 */
95- AE_S16_0_X (AE_ROUND16X4F32SSYM (prod , prod ),
76+ AE_S16_0_X (AE_ROUND16X4F32SSYM (filter01 , filter01 ),
9677 (ae_f16 * )& output_buffers [ch ][i ], 0 );
9778 }
9879}
0 commit comments