Merge pull request #1961 from folkertdev/pmadd-correct-signedness

sayantn · web-flow · commit 50134e10cbd2 · 2025-11-17T15:52:10.000Z
correct signedness of pmadd arguments (`pmaddubsw`/`vpmaddubsw` take unsigned 8-bit `a` and signed 8-bit `b`)
diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
@@ -1773,7 +1773,7 @@ pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpmaddubsw))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
-    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) }
+    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
 }
 
 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
@@ -3702,7 +3702,7 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx2.phsub.sw"]
     fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
-    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
+    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
     fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
     #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
@@ -5955,7 +5955,7 @@ pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddubsw))]
 pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i {
-    unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) }
+    unsafe { transmute(vpmaddubsw(a.as_u8x64(), b.as_i8x64())) }
 }
 
 /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -11688,7 +11688,7 @@ unsafe extern "C" {
     fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
 
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
-    fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
+    fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
 
     #[link_name = "llvm.x86.avx512.packssdw.512"]
     fn vpackssdw(a: i32x16, b: i32x16) -> i16x32;