From 0fc6e243814403036a1cca33bc769d79585a56cb Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sat, 11 Apr 2026 20:34:23 -0400 Subject: [PATCH] NEON-optimize 9-7 IDWT Takes `bin/bench_dwt -I` from 0.865 s to 0.672 s on my system. --- src/lib/openjp2/dwt.c | 90 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 11aae472d..a1b0f6ffe 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -55,6 +55,9 @@ #if (defined(__AVX2__) || defined(__AVX512F__)) #include #endif +#ifdef __ARM_NEON +#include +#endif #if defined(__GNUC__) #pragma GCC poison malloc calloc realloc free @@ -3282,6 +3285,72 @@ static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, } } +#elif defined(__ARM_NEON) + +static void opj_v8dwt_decode_step1_neon(opj_v8_t* w, + OPJ_UINT32 start, + OPJ_UINT32 end, + const OPJ_FLOAT32 c) +{ + OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; + OPJ_UINT32 i; + float32x4_t vc = vdupq_n_f32(c); + /* To be adapted if NB_ELTS_V8 changes */ + fw += 2 * NB_ELTS_V8 * start; + for (i = start; i < end; ++i, fw += 2 * NB_ELTS_V8) { + float32x4_t v0 = vld1q_f32(fw); + float32x4_t v1 = vld1q_f32(fw + 4); + vst1q_f32(fw, vmulq_f32(v0, vc)); + vst1q_f32(fw + 4, vmulq_f32(v1, vc)); + } +} + +static void opj_v8dwt_decode_step2_neon(opj_v8_t* l, opj_v8_t* w, + OPJ_UINT32 start, + OPJ_UINT32 end, + OPJ_UINT32 m, + OPJ_FLOAT32 c) +{ + OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; + OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; + OPJ_UINT32 i; + OPJ_UINT32 imax = opj_uint_min(end, m); + float32x4_t vc; + if (start > 0) { + fw += 2 * NB_ELTS_V8 * start; + fl = fw - 2 * NB_ELTS_V8; + } + /* To be adapted if NB_ELTS_V8 changes */ + vc = vdupq_n_f32(c); + for (i = start; i < imax; ++i) { + float32x4_t fl0 = vld1q_f32(fl); + float32x4_t fl1 = vld1q_f32(fl + 4); + float32x4_t fw0 = vld1q_f32(fw); + float32x4_t fw1 = vld1q_f32(fw + 4); + float32x4_t fwm8 = vld1q_f32(fw - 8); + float32x4_t fwm4 = vld1q_f32(fw - 4); + fwm8 = vmlaq_f32(fwm8, vaddq_f32(fl0, fw0), vc); + fwm4 = vmlaq_f32(fwm4, vaddq_f32(fl1, fw1), vc); + vst1q_f32(fw - 8, fwm8); + vst1q_f32(fw - 4, fwm4); + fl = fw; + fw += 2 * NB_ELTS_V8; + } + if (m < end) { + float32x4_t vc2, fl0, fl1, fwm8, fwm4; + assert(m + 1 == end); + vc2 = vaddq_f32(vc, vc); + fl0 = vld1q_f32(fl); + fl1 = vld1q_f32(fl + 4); + fwm8 = vld1q_f32(fw - 8); + fwm4 = vld1q_f32(fw - 4); + fwm8 = vmlaq_f32(fwm8, fl0, vc2); + fwm4 = vmlaq_f32(fwm4, fl1, vc2); + vst1q_f32(fw - 8, fwm8); + vst1q_f32(fw - 4, fwm4); + } +} + #else static void opj_v8dwt_decode_step1(opj_v8_t* w, @@ -3395,6 +3464,27 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) dwt->win_h_x0, dwt->win_h_x1, (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), _mm_set1_ps(-opj_dwt_alpha)); +#elif defined(__ARM_NEON) + opj_v8dwt_decode_step1_neon(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, + opj_K); + opj_v8dwt_decode_step1_neon(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, + two_invK); + opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1, + dwt->win_l_x0, dwt->win_l_x1, + (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), + -opj_dwt_delta); + opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1, + dwt->win_h_x0, dwt->win_h_x1, + (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), + -opj_dwt_gamma); + opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1, + dwt->win_l_x0, dwt->win_l_x1, + (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), + -opj_dwt_beta); + opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1, + dwt->win_h_x0, dwt->win_h_x1, + (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), + -opj_dwt_alpha); #else opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, opj_K);