Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions src/lib/openjp2/dwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
#if (defined(__AVX2__) || defined(__AVX512F__))
#include <immintrin.h>
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

#if defined(__GNUC__)
#pragma GCC poison malloc calloc realloc free
Expand Down Expand Up @@ -3282,6 +3285,72 @@ static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
}
}

#elif defined(__ARM_NEON)

static void opj_v8dwt_decode_step1_neon(opj_v8_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
const OPJ_FLOAT32 c)
{
OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
OPJ_UINT32 i;
float32x4_t vc = vdupq_n_f32(c);
/* To be adapted if NB_ELTS_V8 changes */
fw += 2 * NB_ELTS_V8 * start;
for (i = start; i < end; ++i, fw += 2 * NB_ELTS_V8) {
float32x4_t v0 = vld1q_f32(fw);
float32x4_t v1 = vld1q_f32(fw + 4);
vst1q_f32(fw, vmulq_f32(v0, vc));
vst1q_f32(fw + 4, vmulq_f32(v1, vc));
}
}

static void opj_v8dwt_decode_step2_neon(opj_v8_t* l, opj_v8_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
OPJ_UINT32 m,
OPJ_FLOAT32 c)
{
OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
OPJ_UINT32 i;
OPJ_UINT32 imax = opj_uint_min(end, m);
float32x4_t vc;
if (start > 0) {
fw += 2 * NB_ELTS_V8 * start;
fl = fw - 2 * NB_ELTS_V8;
}
/* To be adapted if NB_ELTS_V8 changes */
vc = vdupq_n_f32(c);
for (i = start; i < imax; ++i) {
float32x4_t fl0 = vld1q_f32(fl);
float32x4_t fl1 = vld1q_f32(fl + 4);
float32x4_t fw0 = vld1q_f32(fw);
float32x4_t fw1 = vld1q_f32(fw + 4);
float32x4_t fwm8 = vld1q_f32(fw - 8);
float32x4_t fwm4 = vld1q_f32(fw - 4);
fwm8 = vmlaq_f32(fwm8, vaddq_f32(fl0, fw0), vc);
fwm4 = vmlaq_f32(fwm4, vaddq_f32(fl1, fw1), vc);
vst1q_f32(fw - 8, fwm8);
vst1q_f32(fw - 4, fwm4);
fl = fw;
fw += 2 * NB_ELTS_V8;
}
if (m < end) {
float32x4_t vc2, fl0, fl1, fwm8, fwm4;
assert(m + 1 == end);
vc2 = vaddq_f32(vc, vc);
fl0 = vld1q_f32(fl);
fl1 = vld1q_f32(fl + 4);
fwm8 = vld1q_f32(fw - 8);
fwm4 = vld1q_f32(fw - 4);
fwm8 = vmlaq_f32(fwm8, fl0, vc2);
fwm4 = vmlaq_f32(fwm4, fl1, vc2);
vst1q_f32(fw - 8, fwm8);
vst1q_f32(fw - 4, fwm4);
}
}

#else

static void opj_v8dwt_decode_step1(opj_v8_t* w,
Expand Down Expand Up @@ -3395,6 +3464,27 @@ static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
_mm_set1_ps(-opj_dwt_alpha));
#elif defined(__ARM_NEON)
opj_v8dwt_decode_step1_neon(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
opj_K);
opj_v8dwt_decode_step1_neon(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
two_invK);
opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
-opj_dwt_delta);
opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
-opj_dwt_gamma);
opj_v8dwt_decode_step2_neon(dwt->wavelet + b, dwt->wavelet + a + 1,
dwt->win_l_x0, dwt->win_l_x1,
(OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
-opj_dwt_beta);
opj_v8dwt_decode_step2_neon(dwt->wavelet + a, dwt->wavelet + b + 1,
dwt->win_h_x0, dwt->win_h_x1,
(OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
-opj_dwt_alpha);
#else
opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
opj_K);
Expand Down
Loading