Skip to content

Commit bd5a613

Browse files
committed
A little vectorized pure "C" code... ^_^.
Signed-off-by: jpsdr <jpsdr.psx@free.fr>
1 parent 8223dd9 commit bd5a613

1 file changed

Lines changed: 126 additions & 107 deletions

File tree

ResampleMT/resample.cpp

Lines changed: 126 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -321,34 +321,33 @@ static void resize_v_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
321321

322322
for (int y = MinY; y < MaxY; y++)
323323
{
324-
//const int kernel_size = program->kernel_sizes[y];
325324
const BYTE *src_ptr = src + pitch_table[program->pixel_offset[y]];
326325

327-
// perhaps helps vectorizing decision
328-
//const int ksmod4 = (kernel_size >> 2) << 2;
329-
330326
for (int x = 0; x < width; x++)
331327
{
332328
const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + x;
333329

334-
int result = rounder;
330+
int resultx4[4] = {rounder,0,0,0};
335331

336332
for (int i = 0; i < ksmod4; i += 4)
337333
{
338-
result +=((int)*(src2_ptr))*current_coeff[i];
339-
result +=((int)*(src2_ptr+src_pitch1))*current_coeff[i+1];
340-
result +=((int)*(src2_ptr+src_pitch2))*current_coeff[i+2];
341-
result +=((int)*(src2_ptr+src_pitch3))*current_coeff[i+3];
334+
resultx4[0] += ((short)*(src2_ptr))*current_coeff[i];
335+
resultx4[1] += ((short)*(src2_ptr+src_pitch1))*current_coeff[i+1];
336+
resultx4[2] += ((short)*(src2_ptr+src_pitch2))*current_coeff[i+2];
337+
resultx4[3] += ((short)*(src2_ptr+src_pitch3))*current_coeff[i+3];
342338
src2_ptr += src_pitch4;
343339
}
340+
341+
int result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
342+
344343
for (int i = ksmod4; i < kernel_size; i++)
345344
{
346-
result +=((int)*(src2_ptr))*current_coeff[i];
345+
result_single += ((short)*(src2_ptr))*current_coeff[i];
347346
src2_ptr += src_pitch1;
348347
}
349-
result = result >> FPScale8bits;
350-
result = (result>TabMax[x & 0x03]) ? TabMax[x & 0x3] : (result<16) ? 16 : result;
351-
dst[x] = (BYTE) result;
348+
result_single = result_single >> FPScale8bits;
349+
result_single = (result_single>TabMax[x & 0x03]) ? TabMax[x & 0x3] : (result_single<16) ? 16 : result_single;
350+
dst[x] = (BYTE) result_single;
352351
}
353352

354353
dst += dst_pitch1;
@@ -359,34 +358,33 @@ static void resize_v_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
359358
{
360359
for (int y = MinY; y < MaxY; y++)
361360
{
362-
//const int kernel_size = program->kernel_sizes[y];
363361
const BYTE *src_ptr = src + pitch_table[program->pixel_offset[y]];
364362

365-
// perhaps helps vectorizing decision
366-
//const int ksmod4 = (kernel_size >> 2) << 2;
367-
368363
for (int x = 0; x < width; x++)
369364
{
370365
const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + x;
371366

372-
int result = rounder;
367+
int resultx4[4] = {rounder,0,0,0};
373368

374369
for (int i = 0; i < ksmod4; i += 4)
375370
{
376-
result +=((int)*(src2_ptr))*current_coeff[i];
377-
result +=((int)*(src2_ptr+src_pitch1))*current_coeff[i+1];
378-
result +=((int)*(src2_ptr+src_pitch2))*current_coeff[i+2];
379-
result +=((int)*(src2_ptr+src_pitch3))*current_coeff[i+3];
371+
resultx4[0] += ((short)*(src2_ptr))*current_coeff[i];
372+
resultx4[1] += ((short)*(src2_ptr+src_pitch1))*current_coeff[i+1];
373+
resultx4[2] += ((short)*(src2_ptr+src_pitch2))*current_coeff[i+2];
374+
resultx4[3] += ((short)*(src2_ptr+src_pitch3))*current_coeff[i+3];
380375
src2_ptr += src_pitch4;
381376
}
377+
378+
int result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
379+
382380
for (int i = ksmod4; i < kernel_size; i++)
383381
{
384-
result +=((int)*(src2_ptr))*current_coeff[i];
382+
result_single += ((short)*(src2_ptr))*current_coeff[i];
385383
src2_ptr += src_pitch1;
386384
}
387-
result = result >> FPScale8bits;
388-
result = (result>val_max) ? val_max : (result<val_min) ? val_min : result;
389-
dst[x] = (BYTE) result;
385+
result_single = result_single >> FPScale8bits;
386+
result_single = (result_single>val_max) ? val_max : (result_single<val_min) ? val_min : result_single;
387+
dst[x] = (BYTE) result_single;
390388
}
391389

392390
dst += dst_pitch1;
@@ -429,57 +427,59 @@ static void resize_v_c_planar_u16(BYTE* dst8, const BYTE* src8, int dst_pitch, i
429427

430428
for (int y = MinY; y < MaxY; y++)
431429
{
432-
//const int kernel_size = program->kernel_sizes[y];
433430
const uint16_t *src_ptr = src + pitch_table[program->pixel_offset[y]];
434431

435-
// perhaps helps vectorizing decision
436-
//const int ksmod4 = (kernel_size >> 2) << 2;
437-
438432
for (int x = 0; x < width; x++)
439433
{
440434

441435
// theoretically, no need for int64 accumulator,
442436
// sum of coeffs is 1.0 that is (1 << FPScale16bits) in integer arithmetic
443437
const uint16_t* JPSDR_RESTRICT src2_ptr = src_ptr + x;
444-
int result = rounder;
438+
int result_single,resultx4[4] = {rounder,0,0,0};
445439

446440
if JPSDR_CONSTEXPR (!lessthan16bit)
447441
{
448442
for (int i = 0; i < ksmod4; i+=4)
449443
{
450-
result +=((int)*(src2_ptr) + shifttosigned_short)*current_coeff[i];
451-
result +=((int)*(src2_ptr+src_pitch1) + shifttosigned_short)*current_coeff[i+1];
452-
result +=((int)*(src2_ptr+src_pitch2) + shifttosigned_short)*current_coeff[i+2];
453-
result +=((int)*(src2_ptr+src_pitch3) + shifttosigned_short)*current_coeff[i+3];
444+
resultx4[0] += ((short)(*(src2_ptr) + shifttosigned_short))*current_coeff[i];
445+
resultx4[1] += ((short)(*(src2_ptr+src_pitch1) + shifttosigned_short))*current_coeff[i+1];
446+
resultx4[2] += ((short)(*(src2_ptr+src_pitch2) + shifttosigned_short))*current_coeff[i+2];
447+
resultx4[3] += ((short)(*(src2_ptr+src_pitch3) + shifttosigned_short))*current_coeff[i+3];
454448
src2_ptr += src_pitch4;
455449
}
450+
451+
result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
452+
456453
for (int i = ksmod4; i < kernel_size; i++)
457454
{
458-
result +=((int)*(src2_ptr) + shifttosigned_short)*current_coeff[i];
455+
result_single +=((short)(*(src2_ptr) + shifttosigned_short))*current_coeff[i];
459456
src2_ptr += src_pitch1;
460457
}
461-
result += shiftfromsigned_int;
458+
result_single += shiftfromsigned_int;
462459
}
463460
else
464461
{
465462
for (int i = 0; i < ksmod4; i+=4)
466463
{
467-
result +=((int)*(src2_ptr))*current_coeff[i];
468-
result +=((int)*(src2_ptr+src_pitch1))*current_coeff[i+1];
469-
result +=((int)*(src2_ptr+src_pitch2))*current_coeff[i+2];
470-
result +=((int)*(src2_ptr+src_pitch3))*current_coeff[i+3];
464+
resultx4[0] += ((short)*(src2_ptr))*current_coeff[i];
465+
resultx4[1] += ((short)*(src2_ptr+src_pitch1))*current_coeff[i+1];
466+
resultx4[2] += ((short)*(src2_ptr+src_pitch2))*current_coeff[i+2];
467+
resultx4[3] += ((short)*(src2_ptr+src_pitch3))*current_coeff[i+3];
471468
src2_ptr += src_pitch4;
472469
}
470+
471+
result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
472+
473473
for (int i = ksmod4; i < kernel_size; i++)
474474
{
475-
result +=((int)*(src2_ptr))*current_coeff[i];
475+
result_single += ((short)*(src2_ptr))*current_coeff[i];
476476
src2_ptr += src_pitch1;
477477
}
478478
}
479479

480-
result = result >> FPScale16bits;
481-
result = (result>val_max) ? val_max : (result<val_min) ? val_min : result;
482-
dst[x] = (uint16_t)result;
480+
result_single = result_single >> FPScale16bits;
481+
result_single = (result_single>val_max) ? val_max : (result_single<val_min) ? val_min : result_single;
482+
dst[x] = (uint16_t)result_single;
483483
}
484484

485485
dst += dst_pitch1;
@@ -509,31 +509,30 @@ static void resize_v_c_planar_f(BYTE* dst8, const BYTE* src8, int dst_pitch, int
509509

510510
for (int y = MinY; y < MaxY; y++)
511511
{
512-
//const int kernel_size = program->kernel_sizes[y];
513512
const float *src_ptr = src + pitch_table[program->pixel_offset[y]];
514513

515-
// perhaps helps vectorizing decision
516-
//const int ksmod4 = (kernel_size >> 2) << 2;
517-
518514
for (int x = 0; x < width; x++)
519515
{
520516
const float* JPSDR_RESTRICT src2_ptr = src_ptr + x;
521-
float result = 0;
517+
float resultx4[4]={0.0,0.0,0.0,0.0};
522518

523519
for (int i = 0; i < ksmod4; i += 4)
524520
{
525-
result += (*(src2_ptr))*current_coeff[i];
526-
result += (*(src2_ptr+src_pitch1))*current_coeff[i+1];
527-
result += (*(src2_ptr+src_pitch2))*current_coeff[i+2];
528-
result += (*(src2_ptr+src_pitch3))*current_coeff[i+3];
521+
resultx4[0] += (*(src2_ptr))*current_coeff[i];
522+
resultx4[1] += (*(src2_ptr+src_pitch1))*current_coeff[i+1];
523+
resultx4[2] += (*(src2_ptr+src_pitch2))*current_coeff[i+2];
524+
resultx4[3] += (*(src2_ptr+src_pitch3))*current_coeff[i+3];
529525
src2_ptr += src_pitch4;
530526
}
527+
528+
float result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
529+
531530
for (int i = ksmod4; i < kernel_size; i++)
532531
{
533-
result += (*src2_ptr)*current_coeff[i];
532+
result_single += (*src2_ptr)*current_coeff[i];
534533
src2_ptr += src_pitch1;
535534
}
536-
dst[x] = result;
535+
dst[x] = result_single;
537536
}
538537

539538
dst += dst_pitch1;
@@ -593,32 +592,40 @@ static void resize_h_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
593592
for (int x = 0; x < width; x++)
594593
{
595594
const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + program->pixel_offset[x];
596-
int result = rounder;
595+
int resultx8[8] = {rounder,0,0,0,0,0,0,0},resultx4[4] = {0,0,0,0};
597596

598597
for (int i = 0; i < ksmod8; i += 8)
599598
{
600-
result += ((short)src2_ptr[i])*current_coeff[i];
601-
result += ((short)src2_ptr[i+1])*current_coeff[i+1];
602-
result += ((short)src2_ptr[i+2])*current_coeff[i+2];
603-
result += ((short)src2_ptr[i+3])*current_coeff[i+3];
604-
result += ((short)src2_ptr[i+4])*current_coeff[i+4];
605-
result += ((short)src2_ptr[i+5])*current_coeff[i+5];
606-
result += ((short)src2_ptr[i+6])*current_coeff[i+6];
607-
result += ((short)src2_ptr[i+7])*current_coeff[i+7];
599+
resultx8[0] += ((short)src2_ptr[i])*current_coeff[i];
600+
resultx8[1] += ((short)src2_ptr[i+1])*current_coeff[i+1];
601+
resultx8[2] += ((short)src2_ptr[i+2])*current_coeff[i+2];
602+
resultx8[3] += ((short)src2_ptr[i+3])*current_coeff[i+3];
603+
resultx8[4] += ((short)src2_ptr[i+4])*current_coeff[i+4];
604+
resultx8[5] += ((short)src2_ptr[i+5])*current_coeff[i+5];
605+
resultx8[6] += ((short)src2_ptr[i+6])*current_coeff[i+6];
606+
resultx8[7] += ((short)src2_ptr[i+7])*current_coeff[i+7];
608607
}
608+
609+
int result_singlex8 = resultx8[0]+resultx8[1]+resultx8[2]+resultx8[3]+resultx8[4]
610+
+resultx8[5]+resultx8[6]+resultx8[7];
611+
609612
for (int i = ksmod8; i < ksmod4; i += 4)
610613
{
611-
result += ((short)src2_ptr[i])*current_coeff[i];
612-
result += ((short)src2_ptr[i+1])*current_coeff[i+1];
613-
result += ((short)src2_ptr[i+2])*current_coeff[i+2];
614-
result += ((short)src2_ptr[i+3])*current_coeff[i+3];
614+
resultx4[0] += ((short)src2_ptr[i])*current_coeff[i];
615+
resultx4[1] += ((short)src2_ptr[i+1])*current_coeff[i+1];
616+
resultx4[2] += ((short)src2_ptr[i+2])*current_coeff[i+2];
617+
resultx4[3] += ((short)src2_ptr[i+3])*current_coeff[i+3];
615618
}
619+
620+
int result_singlex4 = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
621+
int result_single = result_singlex8 + result_singlex4;
622+
616623
for (int i = ksmod4; i < kernel_size; i++)
617-
result += ((short)src2_ptr[i])*current_coeff[i];
624+
result_single += ((short)src2_ptr[i])*current_coeff[i];
618625

619-
result = result >> FPScale8bits;
620-
result = (result>TabMax[x & 0x03]) ? TabMax[x & 0x3] : (result<16) ? 16 : result;
621-
dst2_ptr[x] = (BYTE) result;
626+
result_single = result_single >> FPScale8bits;
627+
result_single = (result_single>TabMax[x & 0x03]) ? TabMax[x & 0x3] : (result_single<16) ? 16 : result_single;
628+
dst2_ptr[x] = (BYTE)result_single;
622629

623630
current_coeff+=filter_size;
624631
}
@@ -637,32 +644,40 @@ static void resize_h_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
637644
for (int x = 0; x < width; x++)
638645
{
639646
const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + program->pixel_offset[x];
640-
int result = rounder;
647+
int resultx8[8] = {rounder,0,0,0,0,0,0,0},resultx4[4] = {0,0,0,0};
641648

642649
for (int i = 0; i < ksmod8; i += 8)
643650
{
644-
result += ((short)src2_ptr[i])*current_coeff[i];
645-
result += ((short)src2_ptr[i+1])*current_coeff[i+1];
646-
result += ((short)src2_ptr[i+2])*current_coeff[i+2];
647-
result += ((short)src2_ptr[i+3])*current_coeff[i+3];
648-
result += ((short)src2_ptr[i+4])*current_coeff[i+4];
649-
result += ((short)src2_ptr[i+5])*current_coeff[i+5];
650-
result += ((short)src2_ptr[i+6])*current_coeff[i+6];
651-
result += ((short)src2_ptr[i+7])*current_coeff[i+7];
651+
resultx8[0] += ((short)src2_ptr[i])*current_coeff[i];
652+
resultx8[1] += ((short)src2_ptr[i+1])*current_coeff[i+1];
653+
resultx8[2] += ((short)src2_ptr[i+2])*current_coeff[i+2];
654+
resultx8[3] += ((short)src2_ptr[i+3])*current_coeff[i+3];
655+
resultx8[4] += ((short)src2_ptr[i+4])*current_coeff[i+4];
656+
resultx8[5] += ((short)src2_ptr[i+5])*current_coeff[i+5];
657+
resultx8[6] += ((short)src2_ptr[i+6])*current_coeff[i+6];
658+
resultx8[7] += ((short)src2_ptr[i+7])*current_coeff[i+7];
652659
}
660+
661+
int result_singlex8 = resultx8[0]+resultx8[1]+resultx8[2]+resultx8[3]+resultx8[4]
662+
+resultx8[5]+resultx8[6]+resultx8[7];
663+
653664
for (int i = ksmod8; i < ksmod4; i += 4)
654665
{
655-
result += ((short)src2_ptr[i])*current_coeff[i];
656-
result += ((short)src2_ptr[i+1])*current_coeff[i+1];
657-
result += ((short)src2_ptr[i+2])*current_coeff[i+2];
658-
result += ((short)src2_ptr[i+3])*current_coeff[i+3];
666+
resultx4[0] += ((short)src2_ptr[i])*current_coeff[i];
667+
resultx4[1] += ((short)src2_ptr[i+1])*current_coeff[i+1];
668+
resultx4[2] += ((short)src2_ptr[i+2])*current_coeff[i+2];
669+
resultx4[3] += ((short)src2_ptr[i+3])*current_coeff[i+3];
659670
}
671+
672+
int result_singlex4 = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
673+
int result_single = result_singlex8 + result_singlex4;
674+
660675
for (int i = ksmod4; i < kernel_size; i++)
661-
result += ((short)src2_ptr[i])*current_coeff[i];
676+
result_single += ((short)src2_ptr[i])*current_coeff[i];
662677

663-
result = result >> FPScale8bits;
664-
result = (result>val_max) ? val_max : (result<val_min) ? val_min : result;
665-
dst2_ptr[x] = (BYTE) result;
678+
result_single = result_single >> FPScale8bits;
679+
result_single = (result_single>val_max) ? val_max : (result_single<val_min) ? val_min : result_single;
680+
dst2_ptr[x] = (BYTE)result_single;
666681

667682
current_coeff+=filter_size;
668683
}
@@ -710,39 +725,43 @@ static void resize_h_c_planar_u16(BYTE* dst8, const BYTE* src8, int dst_pitch, i
710725

711726
// theoretically, no need for int64 accumulator,
712727
// sum of coeffs is 1.0 that is (1 << FPScale16bits) in integer arithmetic
713-
int result=rounder;
728+
int result_single,resultx4[4] = {rounder,0,0,0};
714729

715730
if JPSDR_CONSTEXPR (!lessthan16bit)
716731
{
717732
for (int i = 0; i < ksmod4; i += 4)
718733
{
719-
result += ((int)src2_ptr[i]+shifttosigned_short)*current_coeff[i];
720-
result += ((int)src2_ptr[i+1]+shifttosigned_short)*current_coeff[i+1];
721-
result += ((int)src2_ptr[i+2]+shifttosigned_short)*current_coeff[i+2];
722-
result += ((int)src2_ptr[i+3]+shifttosigned_short)*current_coeff[i+3];
734+
resultx4[0] += ((short)(src2_ptr[i]+shifttosigned_short))*current_coeff[i];
735+
resultx4[1] += ((short)(src2_ptr[i+1]+shifttosigned_short))*current_coeff[i+1];
736+
resultx4[2] += ((short)(src2_ptr[i+2]+shifttosigned_short))*current_coeff[i+2];
737+
resultx4[3] += ((short)(src2_ptr[i+3]+shifttosigned_short))*current_coeff[i+3];
723738
}
724-
739+
740+
result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
741+
725742
for (int i = ksmod4; i < kernel_size; i++)
726-
result += ((int)src2_ptr[i]+shifttosigned_short)*current_coeff[i];
727-
result += shiftfromsigned_int;
743+
result_single += ((short)(src2_ptr[i]+shifttosigned_short))*current_coeff[i];
744+
result_single += shiftfromsigned_int;
728745
}
729746
else
730747
{
731748
for (int i = 0; i < ksmod4; i += 4)
732749
{
733-
result += ((int)src2_ptr[i])*current_coeff[i];
734-
result += ((int)src2_ptr[i+1])*current_coeff[i+1];
735-
result += ((int)src2_ptr[i+2])*current_coeff[i+2];
736-
result += ((int)src2_ptr[i+3])*current_coeff[i+3];
750+
resultx4[0] += ((short)src2_ptr[i])*current_coeff[i];
751+
resultx4[1] += ((short)src2_ptr[i+1])*current_coeff[i+1];
752+
resultx4[2] += ((short)src2_ptr[i+2])*current_coeff[i+2];
753+
resultx4[3] += ((short)src2_ptr[i+3])*current_coeff[i+3];
737754
}
738-
755+
756+
result_single = resultx4[0]+resultx4[1]+resultx4[2]+resultx4[3];
757+
739758
for (int i = ksmod4; i < kernel_size; i++)
740-
result += ((int)src2_ptr[i])*current_coeff[i];
759+
result_single += ((short)src2_ptr[i])*current_coeff[i];
741760
}
742761

743-
result = result >> FPScale16bits;
744-
result = (result>val_max) ? val_max : (result<val_min) ? val_min : result;
745-
dst2_ptr[x] = (uint16_t)result;
762+
result_single = result_single >> FPScale16bits;
763+
result_single = (result_single>val_max) ? val_max : (result_single<val_min) ? val_min : result_single;
764+
dst2_ptr[x] = (uint16_t)result_single;
746765

747766
current_coeff += filter_size;
748767
}

0 commit comments

Comments
 (0)