Skip to content

Commit 407710b

Browse files
committed
Audio: ASRC: Optimize for HiFi5 function asrc_fir_filter16()
This change improves the efficiency of the FIR filter computation. The filter coefficient load is widened to 128 bits, fetching four 32-bit coefficients at once. The dual-MAC is replaced by a quad-MAC with a single accumulator. The saving is 1.3 MCPS, from 25.4 to 24.1 MCPS, with 16-bit 44.1 to 48.8 kHz stereo push mode ASRC. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent d2942f3 commit 407710b

1 file changed

Lines changed: 18 additions & 37 deletions

File tree

src/audio/asrc/asrc_farrow_hifi5.c

Lines changed: 18 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@ LOG_MODULE_DECLARE(asrc, CONFIG_SOF_LOG_LEVEL);
1414
void asrc_fir_filter16(struct asrc_farrow *src_obj, int16_t **output_buffers,
1515
int index_output_frame)
1616
{
17-
ae_f32x2 prod;
18-
ae_f32x2 filter01 = AE_ZERO32(); /* Note: Init is not needed */
19-
ae_f32x2 filter23 = AE_ZERO32(); /* Note: Init is not needed */
20-
ae_f16x4 buffer0123 = AE_ZERO16(); /* Note: Init is not needed */
21-
ae_f32x2 *filter_p;
17+
ae_valignx2 align_filter;
18+
ae_valign align_buffer;
19+
ae_int64 prod;
20+
ae_f32x2 filter01;
21+
ae_f32x2 filter23;
22+
ae_f16x4 buffer0123;
23+
ae_int32x4 *filter_p;
2224
ae_f16x4 *buffer_p;
2325
int n_limit;
2426
int ch;
@@ -39,60 +41,39 @@ void asrc_fir_filter16(struct asrc_farrow *src_obj, int16_t **output_buffers,
3941
/* Iterate over each channel */
4042
for (ch = 0; ch < src_obj->num_channels; ch++) {
4143
/* Pointer to the beginning of the impulse response */
42-
filter_p = (ae_f32x2 *)&src_obj->impulse_response[0];
44+
filter_p = (ae_int32x4 *)&src_obj->impulse_response[0];
4345

4446
/* Pointer to the buffered input data */
4547
buffer_p =
4648
(ae_f16x4 *)&src_obj->ring_buffers16[ch]
4749
[src_obj->buffer_write_position];
4850

4951
/* Allows unaligned loads: 128 bits of filter coefficients and 64 bits of buffer data */
50-
ae_valign align_filter = AE_LA64_PP(filter_p);
51-
ae_valign align_buffer = AE_LA64_PP(buffer_p);
52+
align_filter = AE_LA128_PP(filter_p);
53+
align_buffer = AE_LA64_PP(buffer_p);
5254

5355
/* Initialise the accumulator */
54-
prod = AE_ZERO32();
56+
prod = AE_ZERO64();
5557

5658
/* Iterate over the filter bins */
5759
for (n = 0; n < n_limit; n++) {
5860
/* Read four buffered samples at once */
5961
AE_LA16X4_IP(buffer0123, align_buffer, buffer_p);
6062

6163
/* Load four bins of the impulse response */
62-
AE_LA32X2_IP(filter01, align_filter, filter_p);
63-
AE_LA32X2_IP(filter23, align_filter, filter_p);
64-
65-
/* Multiply and accumulate
66-
* the lower half bits in 'buffer0123' are used
67-
*/
68-
AE_MULAFP32X16X2RS_L(prod, filter23, buffer0123);
69-
/* the upper half bits in 'buffer0123' are used */
70-
AE_MULAFP32X16X2RS_H(prod, filter01, buffer0123);
71-
}
72-
73-
/* Shift left after accumulation, because interim
74-
* results might saturate during filtering prod = prod
75-
* << 1; will shift after last addition
76-
*/
77-
78-
/* swap LL and HH reusing filter01 to perform
79-
* saturated addition of both halves
80-
*/
81-
filter01 = AE_SEL32_LH(prod, prod);
64+
AE_LA32X2X2_IP(filter01, filter23, align_filter, filter_p);
8265

83-
/* Add up the lower and upper 32 bit data of the
84-
* 'prod' prod = AE_ADD32_HL_LH(prod, prod); fix using
85-
* saturated addition
86-
*/
87-
prod = AE_ADD32S(prod, filter01);
66+
/* Multiply and accumulate */
67+
AE_MULAAAAFQ32X16(prod, filter01, filter23, buffer0123);
68+
}
8869

89-
/* Shift with saturation */
90-
prod = AE_SLAI32S(prod, 1);
70+
/* Round to 32 bit and shift with saturation, use filter01 as scratch */
71+
filter01 = AE_SLAI32S(AE_ROUND32F48SASYM(prod), 1);
9172

9273
/* Round the result to 16 bit and store it in
9374
* (de-)interleaved format in the output buffers
9475
*/
95-
AE_S16_0_X(AE_ROUND16X4F32SSYM(prod, prod),
76+
AE_S16_0_X(AE_ROUND16X4F32SSYM(filter01, filter01),
9677
(ae_f16 *)&output_buffers[ch][i], 0);
9778
}
9879
}

0 commit comments

Comments
 (0)