diff --git a/compile_flags.txt b/compile_flags.txt new file mode 100644 index 00000000000..fff9fc83e1d --- /dev/null +++ b/compile_flags.txt @@ -0,0 +1,11 @@ +-D +SIMD +-I +src/include +-I +src/include/private +-I +libs/teletone/src +-mavx +-mavx2 +-march=native diff --git a/src/include/switch_simd.h b/src/include/switch_simd.h new file mode 100644 index 00000000000..7908472686a --- /dev/null +++ b/src/include/switch_simd.h @@ -0,0 +1,337 @@ +/* + * (c) 2025 Stéphane Alnet + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * Contributor(s): + * Stéphane Alnet <stephane@shimaore.net> + * + * switch_simd.h -- SIMD definitions + * + */ + +#ifndef SWITCH_SIMD_H +#define SWITCH_SIMD_H + +#ifdef SIMD + +/* The initial goal of this module is to provide noticeable speed improvements for audio muxing. It probably could be extended to video processing, but I haven't tried yet. + * For higher speed improvemrnts you generally want your data to be aligned on the SIMD datasize. + * (See e.g. https://www.agner.org/optimize/instruction_tables.pdf for speed measurements.) + * Here we focus on 256 bits (8 octets) since + * - as of 2025 it is essentially available on most x86_64 hardware via AVX and AVX2, + * - and is an appropriate size for e.g. PCMU or PCMA at 8kHz (512 bits would be too much for 160 bytes). + * For easy alignment, use the SWITCH_ALIGN macro. It can be used in struct/union, and for stack-allocated variables. + * Pointers might or might not be aligned. For example, glibc malloc will return 8-octets aligned memory blocks, but an arbitrary pointer inside that structure will not necessarily be aligned! + * Alignment results in faster loads and stores - instead of sequencing the load and store, the microcode can use a 128-bit or 256-bit lane to move the data between cache and register in a smaller number of steps. + */ + +#include <stdalign.h> +#include <string.h> +#include <simde/x86/sse2.h> +#include <simde/x86/avx.h> +#include <simde/x86/avx2.h> +/* SIMDE will provide substitutes for AVX512 functions on lower platforms. */ +#include <simde/x86/avx512.h> + +enum { + int16_per_m256i = sizeof(simde__m256i)/sizeof(int16_t), + mask_int16_per_m256i = int16_per_m256i-1, + int16_per_m128i = sizeof(simde__m128i)/sizeof(int16_t), + mask_int16_per_m128i = int16_per_m128i-1, + int32_per_m256i = sizeof(simde__m256i)/sizeof(int32_t), +}; + +/* Apply the `SWITCH_ALIGN` prefix to: + * - function variables + * - struct/union fields + * e.g. + * + * SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE/sizeof(int16_t)]; + * + * Then `data` can be used safely as destination or source for SIMD_mux_aligned_unbound_sln, for example. + */ +#define SWITCH_ALIGN alignas(sizeof(simde__m256i)) + +/* SIMD-optimized int16_t saturated addition + * - aligned: both int16_t pointers must be aligned on 256 bits boundary + * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary. + * will modify data outside of the range if sample%4 != 0; might SIGSEV if the underlying buffer is too short. + * It is safe to use with buffers defined as + * + * SWITCH_ALIGN data[SWITCH_RECOMMENDED_BUFFER_SIZE]; + * + * for example. + */ +inline static void SIMD_mux_sln_m256i_m256i_unbound(simde__m256i *dst, const simde__m256i *add, int samples) +{ + int x; + const int blocks = samples / int16_per_m256i; + for ( x = 0; x < blocks; x++) { + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_store_si256( + dst+x, + simde_mm256_adds_epi16( + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_load_si256(dst+x), + simde_mm256_load_si256(add+x) + )); + } +} + +/* SIMD-optimized int16_t satured addition + * - only the first parameter must be aligned + * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary. + */ +inline static void SIMD_mux_sln_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m256i; + for ( x = 0; x < blocks; x++) { + simde_mm256_store_si256( + dst+x, + simde_mm256_adds_epi16( + simde_mm256_load_si256(dst+x), + simde_mm256_loadu_si256(add+x*int16_per_m256i) + )); + } +} + +/* SIMD-optimized int16_t saturated addition + * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary. + */ +inline static void SIMD_mux_sln_int16_int16_unbound(int16_t *dst, const int16_t *add, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m256i; + for ( x = 0; x < blocks; x++) { + simde_mm256_storeu_si256( + dst+x*int16_per_m256i, + simde_mm256_adds_epi16( + simde_mm256_loadu_si256(dst+x*int16_per_m256i), + simde_mm256_loadu_si256(add+x*int16_per_m256i) + )); + } +} + +inline static int SIMD_is_aligned256(const void *p) { + return (uintptr_t)p % sizeof(simde__m256i) == 0; +} + +inline static int SIMD_is_aligned128(const void *p) { + return (uintptr_t)p % sizeof(simde__m128i) == 0; +} + +inline static void SIMD_mux_sln(int16_t *dst, const int16_t *add, int samples) +{ + /* Round down to the nearest 256 bits block */ + uint bound_len = samples & ~mask_int16_per_m256i; + uint extra = samples & mask_int16_per_m256i; + + const int dst_aligned = SIMD_is_aligned256(dst); + const int src_aligned = SIMD_is_aligned256(add); + + /* Process as much as we can from the original buffer */ + if (dst_aligned && src_aligned) { + SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)dst, (const simde__m256i *)add, bound_len); + } else if (dst_aligned) { + SIMD_mux_sln_m256i_int16_unbound((simde__m256i *)dst, add, bound_len); + } else { + SIMD_mux_sln_int16_int16_unbound(dst, add, bound_len); + } + + if (extra > 0) { + /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data + * in local buffers large enough to hold it, then do the maths in SIMD. + */ + SWITCH_ALIGN int16_t _dst[int16_per_m256i]; + SWITCH_ALIGN int16_t _add[int16_per_m256i]; + memcpy(_dst, dst+bound_len, sizeof(int16_t) * extra); + memcpy(_add, add+bound_len, sizeof(int16_t) * extra); + SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)_dst, (const simde__m256i *)_add, extra); + memcpy(dst+bound_len, _dst, sizeof(int16_t) * extra); + } +} + +/* In mod_conference we do 16-to-32 bit conversions to avoid overflow. */ + +/* Convert to unaligned int16_t to unaligned int32_t. + * - unbound: might overflow the input and output buffers boundaries if samples is not a multiple of 16. + */ +inline static void SIMD_convert32_int16_unbound(int32_t *dst, const int16_t *src, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m128i; + for ( x = 0; x < blocks; x++) { + /* Store 8 int32 at once. + * Apparently SIMDE doesn't define an _aligned_ store operation, but this is fine. + */ + simde_mm256_storeu_epi32(dst+x, + /* Sign-extend from 16-bits to 32-bits */ + simde_mm256_cvtepi16_epi32( + /* Load 8 int16 at one */ + simde_mm_loadu_epi16(src+x))); + } +} + +/* Convert to aligned int32_t (in bunches of 8) to int16_t (in bunches of 8). + * - unbound: might overflow the input and output buffer boundaries. + */ +inline static void SIMD_convert16_m256i_unbound(simde__m128i *dst, const simde__m256i *src, int samples) +{ + uint x; + const uint blocks = samples / int32_per_m256i; + for ( x = 0; x < blocks; x++) { + simde_mm_store_si128( + dst+x, + simde_mm256_cvtsepi32_epi16( + simde_mm256_load_si256(src+x) + )); + } + +} + +/* Add int16_t samples to packed int32_t values. + * - unbound: might overflow the input and output buffer boundaries. + */ +inline static void SIMD_mux32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *add, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m128i; + for ( x = 0; x < blocks; x++) { + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_store_si256( + dst+x, + simde_mm256_add_epi32( + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_load_si256(dst+x), + simde_mm256_cvtepi16_epi32( + simde_mm_load_si128(add+x) + ))); + } +} + +/* Add int16_t samples to packed int32_t values. + * - unbound: might overflow the input and output buffer boundaries. + */ +inline static void SIMD_mux32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m128i; + for ( x = 0; x < blocks; x++) { + simde_mm256_store_si256( + dst+x, + simde_mm256_add_epi32( + simde_mm256_load_si256(dst+x), + simde_mm256_cvtepi16_epi32( + simde_mm_loadu_epi16(add+x*int16_per_m128i) + ))); + } +} + +/* Add int16_t samples to packed int32_t values. */ +inline static void SIMD_mux32_sln(simde__m256i *dst, const int16_t *add, int samples) +{ + /* Round down to the nearest 128 bits block */ + uint bound_len = samples & ~mask_int16_per_m128i; + uint extra = samples & mask_int16_per_m128i; + + const int src_aligned = SIMD_is_aligned128(add); + + /* Process as much as we can from the original buffer */ + if (src_aligned) { + SIMD_mux32_m256i_m128i_unbound((simde__m256i *)dst, (const simde__m128i *)add, bound_len); + } else { + SIMD_mux32_m256i_int16_unbound(dst, add, bound_len); + } + + if (extra > 0) { + /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data + * in local buffers large enough to hold it, then do the maths in SIMD. + */ + SWITCH_ALIGN int16_t _add[int16_per_m128i]; + memcpy(_add, add+bound_len, sizeof(int16_t) * extra); + SIMD_mux32_m256i_m128i_unbound(dst, (const simde__m128i *)_add, extra); + } +} + +/* Subtract packed, aligned int16_t values from packed, aligned int32_t values. + * - unbound: might overflow the input and output buffer boundaries. + */ +inline static void SIMD_sub32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *sub, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m128i; + for ( x = 0; x < blocks; x++) { + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_store_si256( + dst+x, + simde_mm256_sub_epi32( + /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */ + simde_mm256_load_si256(dst+x), + simde_mm256_cvtepi16_epi32( + simde_mm_load_si128(sub+x) + ))); + } +} + +/* Subtract int16_t values from packed, aligned int32_t values. + * - unbound: might overflow the input and output buffer boundaries. + */ +inline static void SIMD_sub32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples) +{ + uint x; + const uint blocks = samples / int16_per_m128i; + for ( x = 0; x < blocks; x++) { + simde_mm256_store_si256( + dst+x, + simde_mm256_sub_epi32( + simde_mm256_load_si256(dst+x), + simde_mm256_cvtepi16_epi32( + simde_mm_loadu_epi16(add+x*int16_per_m128i) + ))); + } +} + +/* Subtract int16_t values from packed, aligned int32_t values. + */ +inline static void SIMD_sub32_sln(simde__m256i *dst, const int16_t *add, int samples) +{ + /* Round down to the nearest 256 bits block */ + uint bound_len = samples & ~mask_int16_per_m128i; + uint extra = samples & mask_int16_per_m128i; + + const int src_aligned = SIMD_is_aligned128(add); + + /* Process as much as we can from the original buffer */ + if (src_aligned) { + SIMD_sub32_m256i_m128i_unbound((simde__m256i *)dst, (const simde__m128i *)add, bound_len); + } else { + SIMD_sub32_m256i_int16_unbound(dst, add, bound_len); + } + + if (extra > 0) { + /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data + * in local buffers large enough to hold it, then do the maths in SIMD. + */ + SWITCH_ALIGN int16_t _add[int16_per_m128i]; + memcpy(_add, add+bound_len, sizeof(int16_t) * extra); + SIMD_sub32_m256i_m128i_unbound(dst, (const simde__m128i *)_add, extra); + } +} + +#else /* SIMD */ + +#define SWITCH_ALIGN + +#endif /* SIMD */ + +#endif /* SWITCH_SIMD_H */ diff --git a/src/mod/applications/mod_conference/conference_member.c b/src/mod/applications/mod_conference/conference_member.c index c258e597836..f36ed1bbf80 100644 --- a/src/mod/applications/mod_conference/conference_member.c +++ b/src/mod/applications/mod_conference/conference_member.c @@ -35,11 +35,13 @@ * Seven Du <dujinfang@gmail.com> * Emmanuel Schmidbauer <e.schmidbauer@gmail.com> * William King <william.king@quentustech.com> + * Stephane Alnet <stephane@shimaore.net> * * mod_conference.c -- Software Conference Bridge * */ #include <mod_conference.h> +#include <switch_simd.h> int conference_member_noise_gate_check(conference_member_t *member) { @@ -550,7 +552,7 @@ void conference_member_check_channels(switch_frame_t *frame, conference_member_t void conference_member_add_file_data(conference_member_t *member, int16_t *data, switch_size_t file_data_len) { switch_size_t file_sample_len; - int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; + SWITCH_ALIGN int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; switch_mutex_lock(member->fnode_mutex); @@ -618,14 +620,18 @@ void conference_member_add_file_data(conference_member_t *member, int16_t *data, conference_al_process(member->fnode->al, file_frame, file_sample_len * 2, member->conference->rate); } - for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) { - if (member->fnode->mux) { + if (member->fnode->mux) { +#ifdef SIMD + SIMD_mux_sln(data, file_frame, (int)file_sample_len * member->conference->channels); +#else + for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) { sample = data[i] + file_frame[i]; switch_normalize_to_16bit(sample); data[i] = (int16_t)sample; - } else { - data[i] = file_frame[i]; } +#endif + } else { + memcpy(data, file_frame, (int)file_sample_len * member->conference->channels * sizeof(int16_t)); } } diff --git a/src/mod/applications/mod_conference/mod_conference.c b/src/mod/applications/mod_conference/mod_conference.c index aa606170d5e..a0a73998bbe 100644 --- a/src/mod/applications/mod_conference/mod_conference.c +++ b/src/mod/applications/mod_conference/mod_conference.c @@ -35,11 +35,13 @@ * Seven Du <dujinfang@gmail.com> * Emmanuel Schmidbauer <e.schmidbauer@gmail.com> * William King <william.king@quentustech.com> + * Stephane Alnet <stephane@shimaore.net> * * mod_conference.c -- Software Conference Bridge * */ #include <mod_conference.h> +#include <switch_simd.h> SWITCH_MODULE_LOAD_FUNCTION(mod_conference_load); SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_conference_shutdown); @@ -218,7 +220,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob uint8_t *async_file_frame; int16_t *bptr; uint32_t x = 0; +#ifndef SIMD int32_t z = 0; +#else + simde__m256i z; +#endif conference_cdr_node_t *np; switch_time_t last_heartbeat_time = switch_epoch_time_now(NULL); @@ -552,6 +558,20 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob } } else { if (has_file_data) { +#ifdef SIMD + int16_t *muxed; + muxed = (int16_t *) file_frame; + bptr = (int16_t *) async_file_frame; + /* Note: we use the fact that the buffers (file_frame & async_file_frame) are allocated + * full-size at SWITCH_RECOMMENDED_BUFFER_SIZE to assert that we do not go over the end + * of the buffer. (We know this because 8192 (SWITCH_RECOMMENDED_BUFFER_SIZE) is a + * multiple of 32 (sizeof(simde__mm256i), so even in the worst case we will end up with an + * integer number of loops.) + * Note: apparently APR doesn't support aligned_alloc / memalign, too bad. + * At least on libc we should be getting 8-bytes aligned malloc(), if this is what APR uses. + */ + SIMD_mux_sln_int16_int16_unbound(muxed,bptr,file_sample_len * conference->channels); +#else switch_size_t x; for (x = 0; x < file_sample_len * conference->channels; x++) { int32_t z; @@ -563,6 +583,7 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob switch_normalize_to_16bit(z); muxed[x] = (int16_t) z; } +#endif } else { memcpy(file_frame, async_file_frame, file_sample_len * 2 * conference->channels); has_file_data = 1; @@ -574,21 +595,33 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob if (ready || has_file_data) { /* Use more bits in the main_frame to preserve the exact sum of the audio samples. */ - int main_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; - int16_t write_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; + SWITCH_ALIGN int32_t main_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; + SWITCH_ALIGN int16_t write_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 }; /* Init the main frame with file data if there is any. */ bptr = (int16_t *) file_frame; if (has_file_data && file_sample_len) { - +#ifdef SIMD + /* Both `file_frame` and `main_frame` have lengths multiple of 8 bytes, so + * we should be safe loading and storing beyond `len`. + */ + size_t samples = MIN(bytes / 2, file_sample_len * conference->channels); + size_t index = file_sample_len * conference->channels * sizeof(main_frame[0]); + SIMD_convert32_int16_unbound(main_frame, bptr, samples); + memset(main_frame+index, 255, sizeof(main_frame)-index); +#else for (x = 0; x < bytes / 2; x++) { + /* It's unclear here why we say `<=` rather than `<`, looks like an offset-by-one error. + * For now the SIMD code assumes that `<` was meant here. + */ if (x <= file_sample_len * conference->channels) { main_frame[x] = (int32_t) bptr[x]; } else { memset(&main_frame[x], 255, sizeof(main_frame[x])); } } +#endif } conference->mux_loop_count = 0; @@ -604,9 +637,13 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob } bptr = (int16_t *) omember->frame; +#ifdef SIMD + SIMD_mux32_sln((simde__m256i *)main_frame, bptr, omember->read / 2); +#else /* SIMD */ for (x = 0; x < omember->read / 2; x++) { main_frame[x] += (int32_t) bptr[x]; } +#endif /* SIMD */ } /* Create write frame once per member who is not deaf for each sample in the main frame @@ -636,12 +673,24 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob bptr = (int16_t *) omember->frame; +#ifndef SIMD for (x = 0; x < bytes / 2 ; x++) { z = main_frame[x]; +#else + for (x = 0; x < bytes / 2 ; x += int32_per_m256i) { + z = simde_mm256_loadu_epi32((simde__m256i *)(main_frame+x)); +#endif /* bptr[x] represents my own contribution to this audio sample */ - if (conference_utils_member_test_flag(omember, MFLAG_HAS_AUDIO) && x <= omember->read / 2) { + /* It's unclear here why we say `<=` rather than `<`, looks like an offset-by-one error. + * For now the SIMD code assumes that `<` was meant here. + */ + if (conference_utils_member_test_flag(omember, MFLAG_HAS_AUDIO) && x < omember->read / 2) { +#ifndef SIMD z -= (int32_t) bptr[x]; +#else + z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(bptr+x))); +#endif } /* when there are relationships, we have to do more work by scouring all the members to see if there are any @@ -655,7 +704,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob int16_t *rptr = (int16_t *) imember->frame; for (rel = imember->relationships; rel; rel = rel->next) { if ((rel->id == omember->id || rel->id == 0) && !switch_test_flag(rel, RFLAG_CAN_SPEAK)) { +#ifndef SIMD z -= (int32_t) rptr[x]; +#else + z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(rptr+x))); +#endif found = 1; break; } @@ -663,7 +716,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob if (!found) { for (rel = omember->relationships; rel; rel = rel->next) { if ((rel->id == imember->id || rel->id == 0) && !switch_test_flag(rel, RFLAG_CAN_HEAR)) { +#ifndef SIMD z -= (int32_t) rptr[x]; +#else + z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(rptr+x))); +#endif break; } } @@ -674,8 +731,12 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob } /* Now we can convert to 16 bit. */ +#ifndef SIMD switch_normalize_to_16bit(z); write_frame[x] = (int16_t) z; +#else + simde_mm_store_si128((simde__m128i *)(write_frame+x), simde_mm256_cvtsepi32_epi16(z)); +#endif } if (!omember->channel || switch_channel_test_flag(omember->channel, CF_AUDIO)) { diff --git a/src/mod/applications/mod_fsv/mod_fsv.c b/src/mod/applications/mod_fsv/mod_fsv.c index 37f413d6b2e..e7fe3df41cf 100644 --- a/src/mod/applications/mod_fsv/mod_fsv.c +++ b/src/mod/applications/mod_fsv/mod_fsv.c @@ -1177,6 +1177,7 @@ static switch_status_t fsv_file_write(switch_file_handle_t *handle, void *data, int i; uint32_t j; int32_t mixed = 0; + // SIMD for (i = 0; (size_t)i < *len; i++) { for (j = 0; j < handle->channels; j++) { mixed += xdata[i * handle->channels + j]; diff --git a/src/switch_core_media_bug.c b/src/switch_core_media_bug.c index 7aabceb7b78..98afc0045a3 100644 --- a/src/switch_core_media_bug.c +++ b/src/switch_core_media_bug.c @@ -387,6 +387,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b right = dp; /* write stream */ right_len = wlen; } + // SIMD? interleave for (x = 0; x < blen; x++) { if (x < left_len) { *(tp++) = *(left + x); @@ -401,6 +402,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b } memcpy(frame->data, bug->tmp, bytes * 2); } else { + // SIMD -- at least for normalize for (x = 0; x < blen; x++) { int32_t w = 0, r = 0, z = 0; @@ -414,11 +416,13 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b z = w + r; + // This makes no sense whatsoever if (z > SWITCH_SMAX || z < SWITCH_SMIN) { if (r) z += (r/2); if (w) z += (w/2); } + // SIMD --- saturate 32 to 16 switch_normalize_to_16bit(z); *(fp + x) = (int16_t) z; diff --git a/src/switch_ivr_async.c b/src/switch_ivr_async.c index 4075f0adce4..342fc1ee575 100644 --- a/src/switch_ivr_async.c +++ b/src/switch_ivr_async.c @@ -28,6 +28,7 @@ * Bret McDanel <bret AT 0xdecafbad dot com> * Luke Dashjr <luke@openmethods.com> (OpenMethods, LLC) * Christopher M. Rienzo <chris@rienzo.com> + * Stephane Alnet <stephane@shimaore.net> * * switch_ivr_async.c -- IVR Library (async operations) * @@ -37,6 +38,7 @@ #include "private/switch_core_pvt.h" #include <speex/speex_preprocess.h> #include <speex/speex_echo.h> +#include <switch_simd.h> struct switch_ivr_dmachine_binding { char *digits; @@ -831,17 +833,21 @@ static switch_bool_t write_displace_callback(switch_media_bug_t *bug, void *user len = rframe->samples; if (dh->mux) { - int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE]; + SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE]; int16_t *fp = rframe->data; uint32_t x; st = switch_core_file_read(&dh->fh, buf, &len); +#ifdef SIMD + SIMD_mux_sln(fp, buf, (uint32_t) len * dh->fh.channels); +#else for (x = 0; x < (uint32_t) len * dh->fh.channels; x++) { int32_t mixed = fp[x] + buf[x]; switch_normalize_to_16bit(mixed); fp[x] = (int16_t) mixed; } +#endif } else { st = switch_core_file_read(&dh->fh, rframe->data, &len); if (len < rframe->samples) { @@ -932,17 +938,21 @@ static switch_bool_t read_displace_callback(switch_media_bug_t *bug, void *user_ len = rframe->samples; if (dh->mux) { - int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE]; + SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE]; int16_t *fp = rframe->data; uint32_t x; st = switch_core_file_read(&dh->fh, buf, &len); +#ifdef SIMD + SIMD_mux_sln(fp, buf, (uint32_t) len * dh->fh.channels); +#else for (x = 0; x < (uint32_t) len * dh->fh.channels; x++) { int32_t mixed = fp[x] + buf[x]; switch_normalize_to_16bit(mixed); fp[x] = (int16_t) mixed; } +#endif } else { st = switch_core_file_read(&dh->fh, rframe->data, &len); @@ -2297,7 +2307,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session switch_channel_t *tchannel = switch_core_session_get_channel(tsession); switch_frame_t *read_frame, write_frame = { 0 }; switch_codec_t codec = { 0 }; - int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE / 2]; + SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE / 2]; uint32_t tlen; const char *macro_name = "eavesdrop_announce"; const char *id_name = NULL; diff --git a/src/switch_ivr_originate.c b/src/switch_ivr_originate.c index 99c70991bde..09cf0409aa7 100644 --- a/src/switch_ivr_originate.c +++ b/src/switch_ivr_originate.c @@ -26,12 +26,15 @@ * Anthony Minessale II <anthm@freeswitch.org> * Michael Jerris <mike@jerris.com> * Travis Cross <tc@traviscross.com> + * Stephane Alnet <stephane@shimaore.net> * * switch_ivr_originate.c -- IVR Library (originate) * */ #include <switch.h> +#include <switch_simd.h> + #define QUOTED_ESC_COMMA 1 #define UNQUOTED_ESC_COMMA 2 @@ -1899,7 +1902,7 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * early_state_t *state = (early_state_t *) obj; originate_status_t originate_status[MAX_PEERS] = { {0} }; uint8_t array_pos = 0; - int16_t mux_data[SWITCH_RECOMMENDED_BUFFER_SIZE / 2] = { 0 }; + SWITCH_ALIGN int16_t mux_data[SWITCH_RECOMMENDED_BUFFER_SIZE / 2] = { 0 }; int32_t sample; switch_codec_t read_codecs[MAX_PEERS] = { {0} }; int i, x, ready = 0, answered = 0, ring_ready = 0; @@ -1980,11 +1983,15 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * if (datalen < read_frame->datalen) { datalen = read_frame->datalen; } +#ifdef SIMD + SIMD_mux_sln(mux_data, data, (int) read_frame->datalen / 2); +#else for (x = 0; x < (int) read_frame->datalen / 2; x++) { sample = data[x] + mux_data[x]; switch_normalize_to_16bit(sample); mux_data[x] = (int16_t) sample; } +#endif } } else { status = switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0); diff --git a/src/switch_resample.c b/src/switch_resample.c index ca14a221aba..d8beceff13f 100644 --- a/src/switch_resample.c +++ b/src/switch_resample.c @@ -24,6 +24,7 @@ * Contributor(s): * * Anthony Minessale II <anthm@freeswitch.org> + * Stephane Alnet <stephane@shimaore.net> * * * switch_resample.c -- Resampler @@ -36,6 +37,7 @@ #include <switch_private.h> #endif #include <speex/speex_resampler.h> +#include <switch_simd.h> #define NORMFACT (float)0x8000 #define MAXSAMPLE (float)0x7FFF @@ -122,6 +124,8 @@ SWITCH_DECLARE(switch_size_t) switch_float_to_short(float *f, short *s, switch_s { switch_size_t i; float ft; + + // SIMD:Unused, only in mod_managed. for (i = 0; i < len; i++) { ft = f[i] * NORMFACT; if (ft >= 0) { @@ -129,6 +133,9 @@ SWITCH_DECLARE(switch_size_t) switch_float_to_short(float *f, short *s, switch_s } else { s[i] = (short) (ft - 0.5); } + // SIMD: No instruction to convert `ps` to `epi16` (saturated or not). + // We could first convert to epi32 (`simde_mm256_cvtps_epi32`) then down to saturated epi16 + // (`simde_mm256_cvtsepi32_epi16`). if ((float) s[i] > MAXSAMPLE) s[i] = (short) MAXSAMPLE / 2; if (s[i] < (short) -MAXSAMPLE) @@ -145,6 +152,7 @@ SWITCH_DECLARE(int) switch_char_to_float(char *c, float *f, int len) return (-1); } + // SIMD:Unused, only in mod_managed. for (i = 1; i < len; i += 2) { f[(int) (i / 2)] = (float) (((c[i]) * 0x100) + c[i - 1]); f[(int) (i / 2)] /= NORMFACT; @@ -161,6 +169,7 @@ SWITCH_DECLARE(int) switch_float_to_char(float *f, char *c, int len) int i; float ft; long l; + // SIMD: Unused, only in mod_managed. for (i = 0; i < len; i++) { ft = f[i] * NORMFACT; if (ft >= 0) { @@ -177,7 +186,7 @@ SWITCH_DECLARE(int) switch_float_to_char(float *f, char *c, int len) SWITCH_DECLARE(int) switch_short_to_float(short *s, float *f, int len) { int i; - + // SIMD: Unused, only in mod_managed. for (i = 0; i < len; i++) { f[i] = (float) (s[i]) / NORMFACT; /* f[i] = (float) s[i]; */ @@ -185,10 +194,13 @@ SWITCH_DECLARE(int) switch_short_to_float(short *s, float *f, int len) return len; } - SWITCH_DECLARE(void) switch_swap_linear(int16_t *buf, int len) { int i; + + /* SIMD: there is no point in implementing this using SIMD. + * Rely on GCC optimization instead, when adding AVX/AVX2 flags. + */ for (i = 0; i < len; i++) { buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00); } @@ -217,6 +229,7 @@ SWITCH_DECLARE(void) switch_generate_sln_silence(int16_t *data, uint32_t samples sum_rnd += rnd2; } + // SIMD? s = (int16_t) ((int16_t) sum_rnd / (int) divisor); for (j = 0; j < channels; j++) { @@ -241,11 +254,15 @@ SWITCH_DECLARE(uint32_t) switch_merge_sln(int16_t *data, uint32_t samples, int16 x = samples; } +#ifdef SIMD + SIMD_mux_sln(data, other_data, x * channels); +#else for (i = 0; i < x * channels; i++) { z = data[i] + other_data[i]; switch_normalize_to_16bit(z); data[i] = (int16_t) z; } +#endif return x; } @@ -264,9 +281,16 @@ SWITCH_DECLARE(uint32_t) switch_unmerge_sln(int16_t *data, uint32_t samples, int x = samples; } +#ifdef SIMD + // Only used in switch_core_session_read_frame for read_demux_frame, I don't know how often this happens. for (i = 0; i < x * channels; i++) { data[i] -= other_data[i]; } +#else + for (i = 0; i < x * channels; i++) { + data[i] -= other_data[i]; + } +#endif return x; } @@ -278,6 +302,9 @@ SWITCH_DECLARE(void) switch_mux_channels(int16_t *data, switch_size_t samples, u switch_assert(channels < 11); + /* Due to how the data is stored (channels are interleaved, and the number of channels is not fixed), there + * is no gain in using SIMD, since moving the data around in memory would be time-prohibitive. + */ if (orig_channels > channels) { if (channels == 1) { for (i = 0; i < samples; i++) { @@ -344,26 +371,32 @@ SWITCH_DECLARE(void) switch_mux_channels(int16_t *data, switch_size_t samples, u SWITCH_DECLARE(void) switch_change_sln_volume_granular(int16_t *data, uint32_t samples, int32_t vol) { - double newrate = 0; + /* Note: existing code was being pedantic in using `double` because we're at most providing 28 bits + * of precision, while a regular `float` can accodomodate 24 bits. Not sure the expense + * in processing terms is worth the audible outcome over 16 bits. + * Converting to float so that SIMD can process larger bunches at once. + */ + float newrate = 0; // change in dB mapped to ratio for output sample // computed as (powf(10.0f, (float)(change_in_dB) / 20.0f)) - static const double pos[SWITCH_GRANULAR_VOLUME_MAX] = { + static const float pos[SWITCH_GRANULAR_VOLUME_MAX] = { 1.122018, 1.258925, 1.412538, 1.584893, 1.778279, 1.995262, 2.238721, 2.511887, 2.818383, 3.162278, 3.548134, 3.981072, 4.466835, 5.011872, 5.623413, 6.309574, 7.079458, 7.943282, 8.912509, 10.000000, 11.220183, 12.589254, 14.125375, 15.848933, 17.782795, 19.952621, 22.387213, 25.118862, 28.183832, 31.622776, 35.481335, 39.810719, 44.668358, 50.118729, 56.234131, 63.095726, 70.794586, 79.432816, 89.125107, 100.000000, 112.201836, 125.892517, 141.253784, 158.489334, 177.827942, 199.526215, 223.872070, 251.188705, 281.838318, 316.227753 }; - static const double neg[SWITCH_GRANULAR_VOLUME_MAX] = { + static const float neg[SWITCH_GRANULAR_VOLUME_MAX] = { 0.891251, 0.794328, 0.707946, 0.630957, 0.562341, 0.501187, 0.446684, 0.398107, 0.354813, 0.316228, 0.281838, 0.251189, 0.223872, 0.199526, 0.177828, 0.158489, 0.141254, 0.125893, 0.112202, 0.100000, 0.089125, 0.079433, 0.070795, 0.063096, 0.056234, 0.050119, 0.044668, 0.039811, 0.035481, 0.031623, 0.028184, 0.025119, 0.022387, 0.019953, 0.017783, 0.015849, 0.014125, 0.012589, 0.011220, 0.010000, 0.008913, 0.007943, 0.007079, 0.006310, 0.005623, 0.005012, 0.004467, 0.003981, 0.003548, 0.000000 // NOTE mapped -50 dB ratio to total silence instead of 0.003162 }; - const double *chart; + const float *chart; uint32_t i; + // FIXME shouldn't we memset here? if (vol == 0) return; switch_normalize_volume_granular(vol); @@ -385,11 +418,37 @@ SWITCH_DECLARE(void) switch_change_sln_volume_granular(int16_t *data, uint32_t s uint32_t x; int16_t *fp = data; +#ifdef UNFINISHED_SIMD + simde__m256 _newrate = simde_mm256_broadcast_ss(&newrate); + enum { + float_per_m128 = sizeof(simde__m128i) / sizeof(float), + mask_float_per_m128 = float_per_m128-1, + }; + uint32_t blocks = samples & ~mask_float_per_m128; + uint32_t extra = samples & mask_float_per_m128; + for (x = 0; x < blocks; x += float_per_m128) { + /* This does what is written, but it doesn't sound like this would be the most efficient + * way to do it. Might be better to have the coefficients as integer (fixed-point) and + * use e.g. _mm256_mulhi_epi16 (maybe). + */ + simde_mm_storeu_si128(fp+x, // not sure about this one + simde_mm256_cvtsepi32_epi16( + simde_mm256_cvtps_epi32( + simde_mm256_mul_ps( + _newrate, + simde_mm256_cvtepi32_ps( + simde_mm256_cvtepi16_epi32( + simde_mm_loadu_si128(fp+x))) // not sure about this one + )))); + } + // FIXME write the code for the `extra` data. +#else for (x = 0; x < samples; x++) { tmp = (int32_t) (fp[x] * newrate); switch_normalize_to_16bit(tmp); fp[x] = (int16_t) tmp; } +#endif } else { memset(data, 0, samples * 2); } @@ -424,6 +483,7 @@ SWITCH_DECLARE(void) switch_change_sln_volume(int16_t *data, uint32_t samples, i uint32_t x; int16_t *fp = data; + // SIMD for (x = 0; x < samples; x++) { tmp = (int32_t) (fp[x] * newrate); switch_normalize_to_16bit(tmp); @@ -539,6 +599,7 @@ SWITCH_DECLARE(switch_status_t) switch_agc_feed(switch_agc_t *agc, int16_t *data uint32_t energy = 0; int i; + // SIMD for (i = 0; i < samples * channels; i++) { energy += abs(data[i]); } diff --git a/src/switch_rtp.c b/src/switch_rtp.c index 46e13253b73..b1cd08e7b25 100644 --- a/src/switch_rtp.c +++ b/src/switch_rtp.c @@ -37,6 +37,8 @@ #endif #include <switch_stun.h> #include <fspr_network_io.h> +#include <switch_simd.h> + #undef PACKAGE_NAME #undef PACKAGE_STRING #undef PACKAGE_TARNAME @@ -8327,6 +8329,7 @@ static int rtp_common_write(switch_rtp_t *rtp_session, if (rtp_session->flags[SWITCH_RTP_FLAG_VAD] && rtp_session->last_rtp_hdr.pt == rtp_session->vad_data.read_codec->implementation->ianacode) { + // Candidate for SWITCH_ALIGN if we add SIMD to the energy computation int16_t decoded[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)] = { 0 }; uint32_t rate = 0; uint32_t codec_flags = 0;