diff --git a/compile_flags.txt b/compile_flags.txt
new file mode 100644
index 00000000000..fff9fc83e1d
--- /dev/null
+++ b/compile_flags.txt
@@ -0,0 +1,11 @@
+-D
+SIMD
+-I
+src/include
+-I
+src/include/private
+-I
+libs/teletone/src
+-mavx
+-mavx2
+-march=native
diff --git a/src/include/switch_simd.h b/src/include/switch_simd.h
new file mode 100644
index 00000000000..7908472686a
--- /dev/null
+++ b/src/include/switch_simd.h
@@ -0,0 +1,337 @@
+/*
+ * (c) 2025 Stéphane Alnet
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * Contributor(s):
+ * Stéphane Alnet <stephane@shimaore.net>
+ *
+ * switch_simd.h -- SIMD definitions
+ *
+ */
+
+#ifndef SWITCH_SIMD_H
+#define SWITCH_SIMD_H
+
+#ifdef SIMD
+
+/* The initial goal of this module is to provide noticeable speed improvements for audio muxing. It probably could be extended to video processing, but I haven't tried yet.
+ * For higher speed improvemrnts you generally want your data to be aligned on the SIMD datasize.
+ * (See e.g. https://www.agner.org/optimize/instruction_tables.pdf for speed measurements.)
+ * Here we focus on 256 bits (8 octets) since
+ * - as of 2025 it is essentially available on most x86_64 hardware via AVX and AVX2,
+ * - and is an appropriate size for e.g. PCMU or PCMA at 8kHz (512 bits would be too much for 160 bytes).
+ * For easy alignment, use the SWITCH_ALIGN macro. It can be used in struct/union, and for stack-allocated variables.
+ * Pointers might or might not be aligned. For example, glibc malloc will return 8-octets aligned memory blocks, but an arbitrary pointer inside that structure will not necessarily be aligned!
+ * Alignment results in faster loads and stores - instead of sequencing the load and store, the microcode can use a 128-bit or 256-bit lane to move the data between cache and register in a smaller number of steps.
+ */
+
+#include <stdalign.h>
+#include <string.h>
+#include <simde/x86/sse2.h>
+#include <simde/x86/avx.h>
+#include <simde/x86/avx2.h>
+/* SIMDE will provide substitutes for AVX512 functions on lower platforms. */
+#include <simde/x86/avx512.h>
+
+enum {
+  int16_per_m256i = sizeof(simde__m256i)/sizeof(int16_t),
+  mask_int16_per_m256i = int16_per_m256i-1,
+  int16_per_m128i = sizeof(simde__m128i)/sizeof(int16_t),
+  mask_int16_per_m128i = int16_per_m128i-1,
+  int32_per_m256i = sizeof(simde__m256i)/sizeof(int32_t),
+};
+
+/* Apply the `SWITCH_ALIGN` prefix to:
+ * - function variables
+ * - struct/union fields
+ * e.g.
+ *
+ *   SWITCH_ALIGN int16_t data[SWITCH_RECOMMENDED_BUFFER_SIZE/sizeof(int16_t)];
+ *
+ * Then `data` can be used safely as destination or source for SIMD_mux_aligned_unbound_sln, for example.
+ */
+#define SWITCH_ALIGN alignas(sizeof(simde__m256i))
+
+/* SIMD-optimized int16_t saturated addition
+ * - aligned: both int16_t pointers must be aligned on 256 bits boundary
+ * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary.
+ *            will modify data outside of the range if sample%4 != 0; might SIGSEV if the underlying buffer is too short.
+ * It is safe to use with buffers defined as
+ *
+ *    SWITCH_ALIGN data[SWITCH_RECOMMENDED_BUFFER_SIZE];
+ *
+ * for example.
+ */
+inline static void SIMD_mux_sln_m256i_m256i_unbound(simde__m256i *dst, const simde__m256i *add, int samples)
+{
+  int x;
+  const int blocks = samples / int16_per_m256i;
+  for ( x = 0; x < blocks; x++) {
+    /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_adds_epi16(
+        /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_load_si256(add+x)
+      ));
+  }
+}
+
+/* SIMD-optimized int16_t satured addition
+ * - only the first parameter must be aligned
+ * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary.
+ */
+inline static void SIMD_mux_sln_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m256i;
+  for ( x = 0; x < blocks; x++) {
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_adds_epi16(
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_loadu_si256(add+x*int16_per_m256i)
+      ));
+  }
+}
+
+/* SIMD-optimized int16_t saturated addition
+ * - unbound: underlying buffer must end on m256i (256 bits / 16 int16_t) boundary.
+ */
+inline static void SIMD_mux_sln_int16_int16_unbound(int16_t *dst, const int16_t *add, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m256i;
+  for ( x = 0; x < blocks; x++) {
+    simde_mm256_storeu_si256(
+      dst+x*int16_per_m256i,
+      simde_mm256_adds_epi16(
+        simde_mm256_loadu_si256(dst+x*int16_per_m256i),
+        simde_mm256_loadu_si256(add+x*int16_per_m256i)
+      ));
+  }
+}
+
+inline static int SIMD_is_aligned256(const void *p) {
+  return (uintptr_t)p % sizeof(simde__m256i) == 0;
+}
+
+inline static int SIMD_is_aligned128(const void *p) {
+  return (uintptr_t)p % sizeof(simde__m128i) == 0;
+}
+
+inline static void SIMD_mux_sln(int16_t *dst, const int16_t *add, int samples)
+{
+  /* Round down to the nearest 256 bits block */
+  uint bound_len = samples & ~mask_int16_per_m256i;
+  uint extra = samples & mask_int16_per_m256i;
+
+  const int dst_aligned = SIMD_is_aligned256(dst);
+  const int src_aligned = SIMD_is_aligned256(add);
+
+  /* Process as much as we can from the original buffer */
+  if (dst_aligned && src_aligned) {
+    SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)dst, (const simde__m256i *)add, bound_len);
+  } else if (dst_aligned) {
+    SIMD_mux_sln_m256i_int16_unbound((simde__m256i *)dst, add, bound_len);
+  } else {
+    SIMD_mux_sln_int16_int16_unbound(dst, add, bound_len);
+  }
+
+  if (extra > 0) {
+    /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data
+     * in local buffers large enough to hold it, then do the maths in SIMD.
+     */
+    SWITCH_ALIGN int16_t _dst[int16_per_m256i];
+    SWITCH_ALIGN int16_t _add[int16_per_m256i];
+    memcpy(_dst, dst+bound_len, sizeof(int16_t) * extra);
+    memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
+    SIMD_mux_sln_m256i_m256i_unbound((simde__m256i *)_dst, (const simde__m256i *)_add, extra);
+    memcpy(dst+bound_len, _dst, sizeof(int16_t) * extra);
+  }
+}
+
+/* In mod_conference we do 16-to-32 bit conversions to avoid overflow. */
+
+/* Convert to unaligned int16_t to unaligned int32_t.
+ * - unbound: might overflow the input and output buffers boundaries if samples is not a multiple of 16.
+ */
+inline static void SIMD_convert32_int16_unbound(int32_t *dst, const int16_t *src, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m128i;
+  for ( x = 0; x < blocks; x++) {
+    /* Store 8 int32 at once.
+     * Apparently SIMDE doesn't define an _aligned_ store operation, but this is fine.
+     */
+    simde_mm256_storeu_epi32(dst+x,
+      /* Sign-extend from 16-bits to 32-bits */
+      simde_mm256_cvtepi16_epi32(
+        /* Load 8 int16 at one */
+        simde_mm_loadu_epi16(src+x)));
+  }
+}
+
+/* Convert to aligned int32_t (in bunches of 8) to int16_t (in bunches of 8).
+ * - unbound: might overflow the input and output buffer boundaries.
+ */
+inline static void SIMD_convert16_m256i_unbound(simde__m128i *dst, const simde__m256i *src, int samples)
+{
+  uint x;
+  const uint blocks = samples / int32_per_m256i;
+  for ( x = 0; x < blocks; x++) {
+    simde_mm_store_si128(
+      dst+x,
+        simde_mm256_cvtsepi32_epi16(
+          simde_mm256_load_si256(src+x)
+      ));
+  }
+
+}
+
+/* Add int16_t samples to packed int32_t values.
+ * - unbound: might overflow the input and output buffer boundaries.
+ */
+inline static void SIMD_mux32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *add, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m128i;
+  for ( x = 0; x < blocks; x++) {
+    /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_add_epi32(
+        /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_cvtepi16_epi32(
+          simde_mm_load_si128(add+x)
+      )));
+  }
+}
+
+/* Add int16_t samples to packed int32_t values.
+ * - unbound: might overflow the input and output buffer boundaries.
+ */
+inline static void SIMD_mux32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m128i;
+  for ( x = 0; x < blocks; x++) {
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_add_epi32(
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_cvtepi16_epi32(
+          simde_mm_loadu_epi16(add+x*int16_per_m128i)
+      )));
+  }
+}
+
+/* Add int16_t samples to packed int32_t values. */
+inline static void SIMD_mux32_sln(simde__m256i *dst, const int16_t *add, int samples)
+{
+  /* Round down to the nearest 128 bits block */
+  uint bound_len = samples & ~mask_int16_per_m128i;
+  uint extra = samples & mask_int16_per_m128i;
+
+  const int src_aligned = SIMD_is_aligned128(add);
+
+  /* Process as much as we can from the original buffer */
+  if (src_aligned) {
+    SIMD_mux32_m256i_m128i_unbound((simde__m256i *)dst, (const simde__m128i *)add, bound_len);
+  } else {
+    SIMD_mux32_m256i_int16_unbound(dst, add, bound_len);
+  }
+
+  if (extra > 0) {
+    /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data
+     * in local buffers large enough to hold it, then do the maths in SIMD.
+     */
+    SWITCH_ALIGN int16_t _add[int16_per_m128i];
+    memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
+    SIMD_mux32_m256i_m128i_unbound(dst, (const simde__m128i *)_add, extra);
+  }
+}
+
+/* Subtract packed, aligned int16_t values from packed, aligned int32_t values.
+ * - unbound: might overflow the input and output buffer boundaries.
+ */
+inline static void SIMD_sub32_m256i_m128i_unbound(simde__m256i *dst, const simde__m128i *sub, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m128i;
+  for ( x = 0; x < blocks; x++) {
+    /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_sub_epi32(
+        /* AVX: Must be aligned on a 32-byte (128 bits) boundary) */
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_cvtepi16_epi32(
+          simde_mm_load_si128(sub+x)
+      )));
+  }
+}
+
+/* Subtract int16_t values from packed, aligned int32_t values.
+ * - unbound: might overflow the input and output buffer boundaries.
+ */
+inline static void SIMD_sub32_m256i_int16_unbound(simde__m256i *dst, const int16_t *add, int samples)
+{
+  uint x;
+  const uint blocks = samples / int16_per_m128i;
+  for ( x = 0; x < blocks; x++) {
+    simde_mm256_store_si256(
+      dst+x,
+      simde_mm256_sub_epi32(
+        simde_mm256_load_si256(dst+x),
+        simde_mm256_cvtepi16_epi32(
+          simde_mm_loadu_epi16(add+x*int16_per_m128i)
+      )));
+  }
+}
+
+/* Subtract int16_t values from packed, aligned int32_t values.
+ */
+inline static void SIMD_sub32_sln(simde__m256i *dst, const int16_t *add, int samples)
+{
+  /* Round down to the nearest 256 bits block */
+  uint bound_len = samples & ~mask_int16_per_m128i;
+  uint extra = samples & mask_int16_per_m128i;
+
+  const int src_aligned = SIMD_is_aligned128(add);
+
+  /* Process as much as we can from the original buffer */
+  if (src_aligned) {
+    SIMD_sub32_m256i_m128i_unbound((simde__m256i *)dst, (const simde__m128i *)add, bound_len);
+  } else {
+    SIMD_sub32_m256i_int16_unbound(dst, add, bound_len);
+  }
+
+  if (extra > 0) {
+    /* Since the original buffers might not go all the way up to the next 256 bits, we copy the data
+     * in local buffers large enough to hold it, then do the maths in SIMD.
+     */
+    SWITCH_ALIGN int16_t _add[int16_per_m128i];
+    memcpy(_add, add+bound_len, sizeof(int16_t) * extra);
+    SIMD_sub32_m256i_m128i_unbound(dst, (const simde__m128i *)_add, extra);
+  }
+}
+
+#else /* SIMD */
+
+#define SWITCH_ALIGN
+
+#endif /* SIMD */
+
+#endif /* SWITCH_SIMD_H */
diff --git a/src/mod/applications/mod_conference/conference_member.c b/src/mod/applications/mod_conference/conference_member.c
index c258e597836..f36ed1bbf80 100644
--- a/src/mod/applications/mod_conference/conference_member.c
+++ b/src/mod/applications/mod_conference/conference_member.c
@@ -35,11 +35,13 @@
  * Seven Du <dujinfang@gmail.com>
  * Emmanuel Schmidbauer <e.schmidbauer@gmail.com>
  * William King <william.king@quentustech.com>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  * mod_conference.c -- Software Conference Bridge
  *
  */
 #include <mod_conference.h>
+#include <switch_simd.h>
 
 int conference_member_noise_gate_check(conference_member_t *member)
 {
@@ -550,7 +552,7 @@ void conference_member_check_channels(switch_frame_t *frame, conference_member_t
 void conference_member_add_file_data(conference_member_t *member, int16_t *data, switch_size_t file_data_len)
 {
 	switch_size_t file_sample_len;
-	int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
+	SWITCH_ALIGN int16_t file_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
 
 
 	switch_mutex_lock(member->fnode_mutex);
@@ -618,14 +620,18 @@ void conference_member_add_file_data(conference_member_t *member, int16_t *data,
 					conference_al_process(member->fnode->al, file_frame, file_sample_len * 2, member->conference->rate);
 				}
 
-				for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) {
-					if (member->fnode->mux) {
+				if (member->fnode->mux) {
+#ifdef SIMD
+					SIMD_mux_sln(data, file_frame, (int)file_sample_len * member->conference->channels);
+#else
+					for (i = 0; i < (int)file_sample_len * member->conference->channels; i++) {
 						sample = data[i] + file_frame[i];
 						switch_normalize_to_16bit(sample);
 						data[i] = (int16_t)sample;
-					} else {
-						data[i] = file_frame[i];
 					}
+#endif
+				} else {
+					memcpy(data, file_frame, (int)file_sample_len * member->conference->channels * sizeof(int16_t));
 				}
 
 			}
diff --git a/src/mod/applications/mod_conference/mod_conference.c b/src/mod/applications/mod_conference/mod_conference.c
index aa606170d5e..a0a73998bbe 100644
--- a/src/mod/applications/mod_conference/mod_conference.c
+++ b/src/mod/applications/mod_conference/mod_conference.c
@@ -35,11 +35,13 @@
  * Seven Du <dujinfang@gmail.com>
  * Emmanuel Schmidbauer <e.schmidbauer@gmail.com>
  * William King <william.king@quentustech.com>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  * mod_conference.c -- Software Conference Bridge
  *
  */
 #include <mod_conference.h>
+#include <switch_simd.h>
 
 SWITCH_MODULE_LOAD_FUNCTION(mod_conference_load);
 SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_conference_shutdown);
@@ -218,7 +220,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 	uint8_t *async_file_frame;
 	int16_t *bptr;
 	uint32_t x = 0;
+#ifndef SIMD
 	int32_t z = 0;
+#else
+	simde__m256i z;
+#endif
 	conference_cdr_node_t *np;
 	switch_time_t last_heartbeat_time = switch_epoch_time_now(NULL);
 
@@ -552,6 +558,20 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 					}
 				} else {
 					if (has_file_data) {
+#ifdef SIMD
+						int16_t *muxed;
+						muxed = (int16_t *) file_frame;
+						bptr = (int16_t *) async_file_frame;
+						/* Note: we use the fact that the buffers (file_frame & async_file_frame) are allocated
+						 * full-size at SWITCH_RECOMMENDED_BUFFER_SIZE to assert that we do not go over the end
+						 * of the buffer. (We know this because 8192 (SWITCH_RECOMMENDED_BUFFER_SIZE) is a
+						 * multiple of 32 (sizeof(simde__mm256i), so even in the worst case we will end up with an
+						 * integer number of loops.)
+						 * Note: apparently APR doesn't support aligned_alloc / memalign, too bad.
+						 * At least on libc we should be getting 8-bytes aligned malloc(), if this is what APR uses.
+						 */
+						SIMD_mux_sln_int16_int16_unbound(muxed,bptr,file_sample_len * conference->channels);
+#else
 						switch_size_t x;
 						for (x = 0; x < file_sample_len * conference->channels; x++) {
 							int32_t z;
@@ -563,6 +583,7 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 							switch_normalize_to_16bit(z);
 							muxed[x] = (int16_t) z;
 						}
+#endif
 					} else {
 						memcpy(file_frame, async_file_frame, file_sample_len * 2 * conference->channels);
 						has_file_data = 1;
@@ -574,21 +595,33 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 
 		if (ready || has_file_data) {
 			/* Use more bits in the main_frame to preserve the exact sum of the audio samples. */
-			int main_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
-			int16_t write_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
+			SWITCH_ALIGN int32_t main_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
+			SWITCH_ALIGN int16_t write_frame[SWITCH_RECOMMENDED_BUFFER_SIZE] = { 0 };
 
 
 			/* Init the main frame with file data if there is any. */
 			bptr = (int16_t *) file_frame;
 			if (has_file_data && file_sample_len) {
-
+#ifdef SIMD
+				/* Both `file_frame` and `main_frame` have lengths multiple of 8 bytes, so
+				 * we should be safe loading and storing beyond `len`.
+				 */
+				size_t samples = MIN(bytes / 2, file_sample_len * conference->channels);
+				size_t index = file_sample_len * conference->channels * sizeof(main_frame[0]);
+				SIMD_convert32_int16_unbound(main_frame, bptr, samples);
+				memset(main_frame+index, 255, sizeof(main_frame)-index);
+#else
 				for (x = 0; x < bytes / 2; x++) {
+					/* It's unclear here why we say `<=` rather than `<`, looks like an offset-by-one error.
+					 * For now the SIMD code assumes that `<` was meant here.
+					 */
 					if (x <= file_sample_len * conference->channels) {
 						main_frame[x] = (int32_t) bptr[x];
 					} else {
 						memset(&main_frame[x], 255, sizeof(main_frame[x]));
 					}
 				}
+#endif
 			}
 
 			conference->mux_loop_count = 0;
@@ -604,9 +637,13 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 				}
 
 				bptr = (int16_t *) omember->frame;
+#ifdef SIMD
+				SIMD_mux32_sln((simde__m256i *)main_frame, bptr, omember->read / 2);
+#else /* SIMD */
 				for (x = 0; x < omember->read / 2; x++) {
 					main_frame[x] += (int32_t) bptr[x];
 				}
+#endif /* SIMD */
 			}
 
 			/* Create write frame once per member who is not deaf for each sample in the main frame
@@ -636,12 +673,24 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 
 				bptr = (int16_t *) omember->frame;
 
+#ifndef SIMD
 				for (x = 0; x < bytes / 2 ; x++) {
 					z = main_frame[x];
+#else
+				for (x = 0; x < bytes / 2 ; x += int32_per_m256i) {
+					z = simde_mm256_loadu_epi32((simde__m256i *)(main_frame+x));
+#endif
 
 					/* bptr[x] represents my own contribution to this audio sample */
-					if (conference_utils_member_test_flag(omember, MFLAG_HAS_AUDIO) && x <= omember->read / 2) {
+					/* It's unclear here why we say `<=` rather than `<`, looks like an offset-by-one error.
+					 * For now the SIMD code assumes that `<` was meant here.
+					 */
+					if (conference_utils_member_test_flag(omember, MFLAG_HAS_AUDIO) && x < omember->read / 2) {
+#ifndef SIMD
 						z -= (int32_t) bptr[x];
+#else
+						z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(bptr+x)));
+#endif
 					}
 
 					/* when there are relationships, we have to do more work by scouring all the members to see if there are any
@@ -655,7 +704,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 								int16_t *rptr = (int16_t *) imember->frame;
 								for (rel = imember->relationships; rel; rel = rel->next) {
 									if ((rel->id == omember->id || rel->id == 0) && !switch_test_flag(rel, RFLAG_CAN_SPEAK)) {
+#ifndef SIMD
 										z -= (int32_t) rptr[x];
+#else
+										z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(rptr+x)));
+#endif
 										found = 1;
 										break;
 									}
@@ -663,7 +716,11 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 								if (!found) {
 									for (rel = omember->relationships; rel; rel = rel->next) {
 										if ((rel->id == imember->id || rel->id == 0) && !switch_test_flag(rel, RFLAG_CAN_HEAR)) {
+#ifndef SIMD
 											z -= (int32_t) rptr[x];
+#else
+											z = simde_mm256_sub_epi32(z, simde_mm256_cvtepi16_epi32(simde_mm_loadu_epi16(rptr+x)));
+#endif
 											break;
 										}
 									}
@@ -674,8 +731,12 @@ void *SWITCH_THREAD_FUNC conference_thread_run(switch_thread_t *thread, void *ob
 					}
 
 					/* Now we can convert to 16 bit. */
+#ifndef SIMD
 					switch_normalize_to_16bit(z);
 					write_frame[x] = (int16_t) z;
+#else
+					simde_mm_store_si128((simde__m128i *)(write_frame+x), simde_mm256_cvtsepi32_epi16(z));
+#endif
 				}
 
 				if (!omember->channel || switch_channel_test_flag(omember->channel, CF_AUDIO)) {
diff --git a/src/mod/applications/mod_fsv/mod_fsv.c b/src/mod/applications/mod_fsv/mod_fsv.c
index 37f413d6b2e..e7fe3df41cf 100644
--- a/src/mod/applications/mod_fsv/mod_fsv.c
+++ b/src/mod/applications/mod_fsv/mod_fsv.c
@@ -1177,6 +1177,7 @@ static switch_status_t fsv_file_write(switch_file_handle_t *handle, void *data,
 		int i;
 		uint32_t j;
 		int32_t mixed = 0;
+		// SIMD
 		for (i = 0; (size_t)i < *len; i++) {
 			for (j = 0; j < handle->channels; j++) {
 				mixed += xdata[i * handle->channels + j];
diff --git a/src/switch_core_media_bug.c b/src/switch_core_media_bug.c
index 7aabceb7b78..98afc0045a3 100644
--- a/src/switch_core_media_bug.c
+++ b/src/switch_core_media_bug.c
@@ -387,6 +387,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b
 			right = dp; /* write stream */
 			right_len = wlen;
 		}
+		// SIMD? interleave
 		for (x = 0; x < blen; x++) {
 			if (x < left_len) {
 				*(tp++) = *(left + x);
@@ -401,6 +402,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b
 		}
 		memcpy(frame->data, bug->tmp, bytes * 2);
 	} else {
+		// SIMD -- at least for normalize
 		for (x = 0; x < blen; x++) {
 			int32_t w = 0, r = 0, z = 0;
 
@@ -414,11 +416,13 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_bug_read(switch_media_bug_t *b
 
 			z = w + r;
 
+			// This makes no sense whatsoever
 			if (z > SWITCH_SMAX || z < SWITCH_SMIN) {
 				if (r) z += (r/2);
 				if (w) z += (w/2);
 			}
 
+			// SIMD --- saturate 32 to 16
 			switch_normalize_to_16bit(z);
 
 			*(fp + x) = (int16_t) z;
diff --git a/src/switch_ivr_async.c b/src/switch_ivr_async.c
index 4075f0adce4..342fc1ee575 100644
--- a/src/switch_ivr_async.c
+++ b/src/switch_ivr_async.c
@@ -28,6 +28,7 @@
  * Bret McDanel <bret AT 0xdecafbad dot com>
  * Luke Dashjr <luke@openmethods.com> (OpenMethods, LLC)
  * Christopher M. Rienzo <chris@rienzo.com>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  * switch_ivr_async.c -- IVR Library (async operations)
  *
@@ -37,6 +38,7 @@
 #include "private/switch_core_pvt.h"
 #include <speex/speex_preprocess.h>
 #include <speex/speex_echo.h>
+#include <switch_simd.h>
 
 struct switch_ivr_dmachine_binding {
 	char *digits;
@@ -831,17 +833,21 @@ static switch_bool_t write_displace_callback(switch_media_bug_t *bug, void *user
 			len = rframe->samples;
 
 			if (dh->mux) {
-				int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE];
+				SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE];
 				int16_t *fp = rframe->data;
 				uint32_t x;
 
 				st = switch_core_file_read(&dh->fh, buf, &len);
 
+#ifdef SIMD
+				SIMD_mux_sln(fp, buf, (uint32_t) len * dh->fh.channels);
+#else
 				for (x = 0; x < (uint32_t) len * dh->fh.channels; x++) {
 					int32_t mixed = fp[x] + buf[x];
 					switch_normalize_to_16bit(mixed);
 					fp[x] = (int16_t) mixed;
 				}
+#endif
 			} else {
 				st = switch_core_file_read(&dh->fh, rframe->data, &len);
 				if (len < rframe->samples) {
@@ -932,17 +938,21 @@ static switch_bool_t read_displace_callback(switch_media_bug_t *bug, void *user_
 			len = rframe->samples;
 
 			if (dh->mux) {
-				int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE];
+				SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE];
 				int16_t *fp = rframe->data;
 				uint32_t x;
 
 				st = switch_core_file_read(&dh->fh, buf, &len);
 
+#ifdef SIMD
+				SIMD_mux_sln(fp, buf, (uint32_t) len * dh->fh.channels);
+#else
 				for (x = 0; x < (uint32_t) len * dh->fh.channels; x++) {
 					int32_t mixed = fp[x] + buf[x];
 					switch_normalize_to_16bit(mixed);
 					fp[x] = (int16_t) mixed;
 				}
+#endif
 
 			} else {
 				st = switch_core_file_read(&dh->fh, rframe->data, &len);
@@ -2297,7 +2307,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session
 		switch_channel_t *tchannel = switch_core_session_get_channel(tsession);
 		switch_frame_t *read_frame, write_frame = { 0 };
 		switch_codec_t codec = { 0 };
-		int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE / 2];
+		SWITCH_ALIGN int16_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE / 2];
 		uint32_t tlen;
 		const char *macro_name = "eavesdrop_announce";
 		const char *id_name = NULL;
diff --git a/src/switch_ivr_originate.c b/src/switch_ivr_originate.c
index 99c70991bde..09cf0409aa7 100644
--- a/src/switch_ivr_originate.c
+++ b/src/switch_ivr_originate.c
@@ -26,12 +26,15 @@
  * Anthony Minessale II <anthm@freeswitch.org>
  * Michael Jerris <mike@jerris.com>
  * Travis Cross <tc@traviscross.com>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  * switch_ivr_originate.c -- IVR Library (originate)
  *
  */
 
 #include <switch.h>
+#include <switch_simd.h>
+
 #define QUOTED_ESC_COMMA 1
 #define UNQUOTED_ESC_COMMA 2
 
@@ -1899,7 +1902,7 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void *
 	early_state_t *state = (early_state_t *) obj;
 	originate_status_t originate_status[MAX_PEERS] = { {0} };
 	uint8_t array_pos = 0;
-	int16_t mux_data[SWITCH_RECOMMENDED_BUFFER_SIZE / 2] = { 0 };
+	SWITCH_ALIGN int16_t mux_data[SWITCH_RECOMMENDED_BUFFER_SIZE / 2] = { 0 };
 	int32_t sample;
 	switch_codec_t read_codecs[MAX_PEERS] = { {0} };
 	int i, x, ready = 0, answered = 0, ring_ready = 0;
@@ -1980,11 +1983,15 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void *
 						if (datalen < read_frame->datalen) {
 							datalen = read_frame->datalen;
 						}
+#ifdef SIMD
+						SIMD_mux_sln(mux_data, data, (int) read_frame->datalen / 2);
+#else
 						for (x = 0; x < (int) read_frame->datalen / 2; x++) {
 							sample = data[x] + mux_data[x];
 							switch_normalize_to_16bit(sample);
 							mux_data[x] = (int16_t) sample;
 						}
+#endif
 					}
 				} else {
 					status = switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
diff --git a/src/switch_resample.c b/src/switch_resample.c
index ca14a221aba..d8beceff13f 100644
--- a/src/switch_resample.c
+++ b/src/switch_resample.c
@@ -24,6 +24,7 @@
  * Contributor(s):
  *
  * Anthony Minessale II <anthm@freeswitch.org>
+ * Stephane Alnet <stephane@shimaore.net>
  *
  *
  * switch_resample.c -- Resampler
@@ -36,6 +37,7 @@
 #include <switch_private.h>
 #endif
 #include <speex/speex_resampler.h>
+#include <switch_simd.h>
 
 #define NORMFACT (float)0x8000
 #define MAXSAMPLE (float)0x7FFF
@@ -122,6 +124,8 @@ SWITCH_DECLARE(switch_size_t) switch_float_to_short(float *f, short *s, switch_s
 {
 	switch_size_t i;
 	float ft;
+
+	// SIMD:Unused, only in mod_managed.
 	for (i = 0; i < len; i++) {
 		ft = f[i] * NORMFACT;
 		if (ft >= 0) {
@@ -129,6 +133,9 @@ SWITCH_DECLARE(switch_size_t) switch_float_to_short(float *f, short *s, switch_s
 		} else {
 			s[i] = (short) (ft - 0.5);
 		}
+		// SIMD: No instruction to convert `ps` to `epi16` (saturated or not).
+		// We could first convert to epi32 (`simde_mm256_cvtps_epi32`) then down to saturated epi16
+		// (`simde_mm256_cvtsepi32_epi16`).
 		if ((float) s[i] > MAXSAMPLE)
 			s[i] = (short) MAXSAMPLE / 2;
 		if (s[i] < (short) -MAXSAMPLE)
@@ -145,6 +152,7 @@ SWITCH_DECLARE(int) switch_char_to_float(char *c, float *f, int len)
 		return (-1);
 	}
 
+	// SIMD:Unused, only in mod_managed.
 	for (i = 1; i < len; i += 2) {
 		f[(int) (i / 2)] = (float) (((c[i]) * 0x100) + c[i - 1]);
 		f[(int) (i / 2)] /= NORMFACT;
@@ -161,6 +169,7 @@ SWITCH_DECLARE(int) switch_float_to_char(float *f, char *c, int len)
 	int i;
 	float ft;
 	long l;
+	// SIMD: Unused, only in mod_managed.
 	for (i = 0; i < len; i++) {
 		ft = f[i] * NORMFACT;
 		if (ft >= 0) {
@@ -177,7 +186,7 @@ SWITCH_DECLARE(int) switch_float_to_char(float *f, char *c, int len)
 SWITCH_DECLARE(int) switch_short_to_float(short *s, float *f, int len)
 {
 	int i;
-
+	// SIMD: Unused, only in mod_managed.
 	for (i = 0; i < len; i++) {
 		f[i] = (float) (s[i]) / NORMFACT;
 		/* f[i] = (float) s[i]; */
@@ -185,10 +194,13 @@ SWITCH_DECLARE(int) switch_short_to_float(short *s, float *f, int len)
 	return len;
 }
 
-
 SWITCH_DECLARE(void) switch_swap_linear(int16_t *buf, int len)
 {
 	int i;
+
+	/* SIMD: there is no point in implementing this using SIMD.
+	 * Rely on GCC optimization instead, when adding AVX/AVX2 flags.
+	 */
 	for (i = 0; i < len; i++) {
 		buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
 	}
@@ -217,6 +229,7 @@ SWITCH_DECLARE(void) switch_generate_sln_silence(int16_t *data, uint32_t samples
 			sum_rnd += rnd2;
 		}
 
+		// SIMD?
 		s = (int16_t) ((int16_t) sum_rnd / (int) divisor);
 
 		for (j = 0; j < channels; j++) {
@@ -241,11 +254,15 @@ SWITCH_DECLARE(uint32_t) switch_merge_sln(int16_t *data, uint32_t samples, int16
 		x = samples;
 	}
 
+#ifdef SIMD
+	SIMD_mux_sln(data, other_data, x * channels);
+#else
 	for (i = 0; i < x * channels; i++) {
 		z = data[i] + other_data[i];
 		switch_normalize_to_16bit(z);
 		data[i] = (int16_t) z;
 	}
+#endif
 
 	return x;
 }
@@ -264,9 +281,16 @@ SWITCH_DECLARE(uint32_t) switch_unmerge_sln(int16_t *data, uint32_t samples, int
 		x = samples;
 	}
 
+#ifdef SIMD
+	// Only used in switch_core_session_read_frame for read_demux_frame, I don't know how often this happens.
 	for (i = 0; i < x * channels; i++) {
 		data[i] -= other_data[i];
 	}
+#else
+	for (i = 0; i < x * channels; i++) {
+		data[i] -= other_data[i];
+	}
+#endif
 
 	return x;
 }
@@ -278,6 +302,9 @@ SWITCH_DECLARE(void) switch_mux_channels(int16_t *data, switch_size_t samples, u
 
 	switch_assert(channels < 11);
 
+	/* Due to how the data is stored (channels are interleaved, and the number of channels is not fixed), there
+	 * is no gain in using SIMD, since moving the data around in memory would be time-prohibitive.
+	 */
 	if (orig_channels > channels) {
 		if (channels == 1) {
 			for (i = 0; i < samples; i++) {
@@ -344,26 +371,32 @@ SWITCH_DECLARE(void) switch_mux_channels(int16_t *data, switch_size_t samples, u
 
 SWITCH_DECLARE(void) switch_change_sln_volume_granular(int16_t *data, uint32_t samples, int32_t vol)
 {
-	double newrate = 0;
+	/* Note: existing code was being pedantic in using `double` because we're at most providing 28 bits
+	 * of precision, while a regular `float` can accodomodate 24 bits. Not sure the expense
+	 * in processing terms is worth the audible outcome over 16 bits.
+	 * Converting to float so that SIMD can process larger bunches at once.
+	 */
+	float newrate = 0;
 	// change in dB mapped to ratio for output sample
 	// computed as (powf(10.0f, (float)(change_in_dB) / 20.0f))
-	static const double pos[SWITCH_GRANULAR_VOLUME_MAX] = {
+	static const float pos[SWITCH_GRANULAR_VOLUME_MAX] = {
 		  1.122018,   1.258925,   1.412538,   1.584893,   1.778279,   1.995262,   2.238721,   2.511887,   2.818383,   3.162278,
 		  3.548134,   3.981072,   4.466835,   5.011872,   5.623413,   6.309574,   7.079458,   7.943282,   8.912509,  10.000000,
 		 11.220183,  12.589254,  14.125375,  15.848933,  17.782795,  19.952621,  22.387213,  25.118862,  28.183832,  31.622776,
 		 35.481335,  39.810719,  44.668358,  50.118729,  56.234131,  63.095726,  70.794586,  79.432816,  89.125107, 100.000000,
 		112.201836, 125.892517, 141.253784, 158.489334, 177.827942, 199.526215, 223.872070, 251.188705, 281.838318, 316.227753
 	};
-	static const double neg[SWITCH_GRANULAR_VOLUME_MAX] = {
+	static const float neg[SWITCH_GRANULAR_VOLUME_MAX] = {
 		0.891251, 0.794328, 0.707946, 0.630957, 0.562341, 0.501187, 0.446684, 0.398107, 0.354813, 0.316228,
 		0.281838, 0.251189, 0.223872, 0.199526, 0.177828, 0.158489, 0.141254, 0.125893, 0.112202, 0.100000,
 		0.089125, 0.079433, 0.070795, 0.063096, 0.056234, 0.050119, 0.044668, 0.039811, 0.035481, 0.031623,
 		0.028184, 0.025119, 0.022387, 0.019953, 0.017783, 0.015849, 0.014125, 0.012589, 0.011220, 0.010000,
 		0.008913, 0.007943, 0.007079, 0.006310, 0.005623, 0.005012, 0.004467, 0.003981, 0.003548, 0.000000  // NOTE mapped -50 dB ratio to total silence instead of 0.003162
 	};
-	const double *chart;
+	const float *chart;
 	uint32_t i;
 
+	// FIXME shouldn't we memset here?
 	if (vol == 0) return;
 
 	switch_normalize_volume_granular(vol);
@@ -385,11 +418,37 @@ SWITCH_DECLARE(void) switch_change_sln_volume_granular(int16_t *data, uint32_t s
 		uint32_t x;
 		int16_t *fp = data;
 
+#ifdef UNFINISHED_SIMD
+		simde__m256 _newrate = simde_mm256_broadcast_ss(&newrate);
+		enum {
+			float_per_m128 = sizeof(simde__m128i) / sizeof(float),
+			mask_float_per_m128 = float_per_m128-1,
+		};
+		uint32_t blocks = samples & ~mask_float_per_m128;
+		uint32_t extra = samples & mask_float_per_m128;
+		for (x = 0; x < blocks; x += float_per_m128) {
+			/* This does what is written, but it doesn't sound like this would be the most efficient
+			 * way to do it. Might be better to have the coefficients as integer (fixed-point) and
+			 * use e.g. _mm256_mulhi_epi16 (maybe).
+			 */
+			simde_mm_storeu_si128(fp+x, // not sure about this one
+				simde_mm256_cvtsepi32_epi16(
+					simde_mm256_cvtps_epi32(
+						simde_mm256_mul_ps(
+							_newrate,
+							simde_mm256_cvtepi32_ps(
+							simde_mm256_cvtepi16_epi32(
+								simde_mm_loadu_si128(fp+x))) // not sure about this one
+							))));
+		}
+		// FIXME write the code for the `extra` data.
+#else
 		for (x = 0; x < samples; x++) {
 			tmp = (int32_t) (fp[x] * newrate);
 			switch_normalize_to_16bit(tmp);
 			fp[x] = (int16_t) tmp;
 		}
+#endif
 	} else {
 		memset(data, 0, samples * 2);
 	}
@@ -424,6 +483,7 @@ SWITCH_DECLARE(void) switch_change_sln_volume(int16_t *data, uint32_t samples, i
 		uint32_t x;
 		int16_t *fp = data;
 
+		// SIMD
 		for (x = 0; x < samples; x++) {
 			tmp = (int32_t) (fp[x] * newrate);
 			switch_normalize_to_16bit(tmp);
@@ -539,6 +599,7 @@ SWITCH_DECLARE(switch_status_t) switch_agc_feed(switch_agc_t *agc, int16_t *data
 		uint32_t energy = 0;
 		int i;
 
+		// SIMD
 		for (i = 0; i < samples * channels; i++) {
 			energy += abs(data[i]);
 		}
diff --git a/src/switch_rtp.c b/src/switch_rtp.c
index 46e13253b73..b1cd08e7b25 100644
--- a/src/switch_rtp.c
+++ b/src/switch_rtp.c
@@ -37,6 +37,8 @@
 #endif
 #include <switch_stun.h>
 #include <fspr_network_io.h>
+#include <switch_simd.h>
+
 #undef PACKAGE_NAME
 #undef PACKAGE_STRING
 #undef PACKAGE_TARNAME
@@ -8327,6 +8329,7 @@ static int rtp_common_write(switch_rtp_t *rtp_session,
 	if (rtp_session->flags[SWITCH_RTP_FLAG_VAD] &&
 		rtp_session->last_rtp_hdr.pt == rtp_session->vad_data.read_codec->implementation->ianacode) {
 
+		// Candidate for SWITCH_ALIGN if we add SIMD to the energy computation
 		int16_t decoded[SWITCH_RECOMMENDED_BUFFER_SIZE / sizeof(int16_t)] = { 0 };
 		uint32_t rate = 0;
 		uint32_t codec_flags = 0;