@@ -321,34 +321,33 @@ static void resize_v_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
321321
322322 for (int y = MinY; y < MaxY; y++)
323323 {
324- // const int kernel_size = program->kernel_sizes[y];
325324 const BYTE *src_ptr = src + pitch_table[program->pixel_offset [y]];
326325
327- // perhaps helps vectorizing decision
328- // const int ksmod4 = (kernel_size >> 2) << 2;
329-
330326 for (int x = 0 ; x < width; x++)
331327 {
332328 const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + x;
333329
334- int result = rounder;
330+ int resultx4[ 4 ] = { rounder, 0 , 0 , 0 } ;
335331
336332 for (int i = 0 ; i < ksmod4; i += 4 )
337333 {
338- result +=(( int )*(src2_ptr))*current_coeff[i];
339- result +=(( int )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
340- result +=(( int )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
341- result +=(( int )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
334+ resultx4[ 0 ] += (( short )*(src2_ptr))*current_coeff[i];
335+ resultx4[ 1 ] += (( short )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
336+ resultx4[ 2 ] += (( short )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
337+ resultx4[ 3 ] += (( short )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
342338 src2_ptr += src_pitch4;
343339 }
340+
341+ int result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
342+
344343 for (int i = ksmod4; i < kernel_size; i++)
345344 {
346- result +=(( int )*(src2_ptr))*current_coeff[i];
345+ result_single += (( short )*(src2_ptr))*current_coeff[i];
347346 src2_ptr += src_pitch1;
348347 }
349- result = result >> FPScale8bits;
350- result = (result >TabMax[x & 0x03 ]) ? TabMax[x & 0x3 ] : (result <16 ) ? 16 : result ;
351- dst[x] = (BYTE) result ;
348+ result_single = result_single >> FPScale8bits;
349+ result_single = (result_single >TabMax[x & 0x03 ]) ? TabMax[x & 0x3 ] : (result_single <16 ) ? 16 : result_single ;
350+ dst[x] = (BYTE) result_single ;
352351 }
353352
354353 dst += dst_pitch1;
@@ -359,34 +358,33 @@ static void resize_v_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
359358 {
360359 for (int y = MinY; y < MaxY; y++)
361360 {
362- // const int kernel_size = program->kernel_sizes[y];
363361 const BYTE *src_ptr = src + pitch_table[program->pixel_offset [y]];
364362
365- // perhaps helps vectorizing decision
366- // const int ksmod4 = (kernel_size >> 2) << 2;
367-
368363 for (int x = 0 ; x < width; x++)
369364 {
370365 const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + x;
371366
372- int result = rounder;
367+ int resultx4[ 4 ] = { rounder, 0 , 0 , 0 } ;
373368
374369 for (int i = 0 ; i < ksmod4; i += 4 )
375370 {
376- result +=(( int )*(src2_ptr))*current_coeff[i];
377- result +=(( int )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
378- result +=(( int )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
379- result +=(( int )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
371+ resultx4[ 0 ] += (( short )*(src2_ptr))*current_coeff[i];
372+ resultx4[ 1 ] += (( short )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
373+ resultx4[ 2 ] += (( short )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
374+ resultx4[ 3 ] += (( short )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
380375 src2_ptr += src_pitch4;
381376 }
377+
378+ int result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
379+
382380 for (int i = ksmod4; i < kernel_size; i++)
383381 {
384- result +=(( int )*(src2_ptr))*current_coeff[i];
382+ result_single += (( short )*(src2_ptr))*current_coeff[i];
385383 src2_ptr += src_pitch1;
386384 }
387- result = result >> FPScale8bits;
388- result = (result >val_max) ? val_max : (result <val_min) ? val_min : result ;
389- dst[x] = (BYTE) result ;
385+ result_single = result_single >> FPScale8bits;
386+ result_single = (result_single >val_max) ? val_max : (result_single <val_min) ? val_min : result_single ;
387+ dst[x] = (BYTE) result_single ;
390388 }
391389
392390 dst += dst_pitch1;
@@ -429,57 +427,59 @@ static void resize_v_c_planar_u16(BYTE* dst8, const BYTE* src8, int dst_pitch, i
429427
430428 for (int y = MinY; y < MaxY; y++)
431429 {
432- // const int kernel_size = program->kernel_sizes[y];
433430 const uint16_t *src_ptr = src + pitch_table[program->pixel_offset [y]];
434431
435- // perhaps helps vectorizing decision
436- // const int ksmod4 = (kernel_size >> 2) << 2;
437-
438432 for (int x = 0 ; x < width; x++)
439433 {
440434
441435 // theoretically, no need for int64 accumulator,
442436 // sum of coeffs is 1.0 that is (1 << FPScale16bits) in integer arithmetic
443437 const uint16_t * JPSDR_RESTRICT src2_ptr = src_ptr + x;
444- int result = rounder;
438+ int result_single,resultx4[ 4 ] = { rounder, 0 , 0 , 0 } ;
445439
446440 if JPSDR_CONSTEXPR (!lessthan16bit)
447441 {
448442 for (int i = 0 ; i < ksmod4; i+=4 )
449443 {
450- result +=(( int ) *(src2_ptr) + shifttosigned_short)*current_coeff[i];
451- result +=(( int ) *(src2_ptr+src_pitch1) + shifttosigned_short)*current_coeff[i+1 ];
452- result +=(( int ) *(src2_ptr+src_pitch2) + shifttosigned_short)*current_coeff[i+2 ];
453- result +=(( int ) *(src2_ptr+src_pitch3) + shifttosigned_short)*current_coeff[i+3 ];
444+ resultx4[ 0 ] += (( short )( *(src2_ptr) + shifttosigned_short) )*current_coeff[i];
445+ resultx4[ 1 ] += (( short )( *(src2_ptr+src_pitch1) + shifttosigned_short) )*current_coeff[i+1 ];
446+ resultx4[ 2 ] += (( short )( *(src2_ptr+src_pitch2) + shifttosigned_short) )*current_coeff[i+2 ];
447+ resultx4[ 3 ] += (( short )( *(src2_ptr+src_pitch3) + shifttosigned_short) )*current_coeff[i+3 ];
454448 src2_ptr += src_pitch4;
455449 }
450+
451+ result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
452+
456453 for (int i = ksmod4; i < kernel_size; i++)
457454 {
458- result +=((int ) *(src2_ptr) + shifttosigned_short)*current_coeff[i];
455+ result_single +=((short )( *(src2_ptr) + shifttosigned_short) )*current_coeff[i];
459456 src2_ptr += src_pitch1;
460457 }
461- result += shiftfromsigned_int;
458+ result_single += shiftfromsigned_int;
462459 }
463460 else
464461 {
465462 for (int i = 0 ; i < ksmod4; i+=4 )
466463 {
467- result +=(( int )*(src2_ptr))*current_coeff[i];
468- result +=(( int )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
469- result +=(( int )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
470- result +=(( int )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
464+ resultx4[ 0 ] += (( short )*(src2_ptr))*current_coeff[i];
465+ resultx4[ 1 ] += (( short )*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
466+ resultx4[ 2 ] += (( short )*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
467+ resultx4[ 3 ] += (( short )*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
471468 src2_ptr += src_pitch4;
472469 }
470+
471+ result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
472+
473473 for (int i = ksmod4; i < kernel_size; i++)
474474 {
475- result +=(( int )*(src2_ptr))*current_coeff[i];
475+ result_single += (( short )*(src2_ptr))*current_coeff[i];
476476 src2_ptr += src_pitch1;
477477 }
478478 }
479479
480- result = result >> FPScale16bits;
481- result = (result >val_max) ? val_max : (result <val_min) ? val_min : result ;
482- dst[x] = (uint16_t )result ;
480+ result_single = result_single >> FPScale16bits;
481+ result_single = (result_single >val_max) ? val_max : (result_single <val_min) ? val_min : result_single ;
482+ dst[x] = (uint16_t )result_single ;
483483 }
484484
485485 dst += dst_pitch1;
@@ -509,31 +509,30 @@ static void resize_v_c_planar_f(BYTE* dst8, const BYTE* src8, int dst_pitch, int
509509
510510 for (int y = MinY; y < MaxY; y++)
511511 {
512- // const int kernel_size = program->kernel_sizes[y];
513512 const float *src_ptr = src + pitch_table[program->pixel_offset [y]];
514513
515- // perhaps helps vectorizing decision
516- // const int ksmod4 = (kernel_size >> 2) << 2;
517-
518514 for (int x = 0 ; x < width; x++)
519515 {
520516 const float * JPSDR_RESTRICT src2_ptr = src_ptr + x;
521- float result = 0 ;
517+ float resultx4[ 4 ]={ 0.0 , 0.0 , 0.0 , 0.0 } ;
522518
523519 for (int i = 0 ; i < ksmod4; i += 4 )
524520 {
525- result += (*(src2_ptr))*current_coeff[i];
526- result += (*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
527- result += (*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
528- result += (*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
521+ resultx4[ 0 ] += (*(src2_ptr))*current_coeff[i];
522+ resultx4[ 1 ] += (*(src2_ptr+src_pitch1))*current_coeff[i+1 ];
523+ resultx4[ 2 ] += (*(src2_ptr+src_pitch2))*current_coeff[i+2 ];
524+ resultx4[ 3 ] += (*(src2_ptr+src_pitch3))*current_coeff[i+3 ];
529525 src2_ptr += src_pitch4;
530526 }
527+
528+ float result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
529+
531530 for (int i = ksmod4; i < kernel_size; i++)
532531 {
533- result += (*src2_ptr)*current_coeff[i];
532+ result_single += (*src2_ptr)*current_coeff[i];
534533 src2_ptr += src_pitch1;
535534 }
536- dst[x] = result ;
535+ dst[x] = result_single ;
537536 }
538537
539538 dst += dst_pitch1;
@@ -593,32 +592,40 @@ static void resize_h_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
593592 for (int x = 0 ; x < width; x++)
594593 {
595594 const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + program->pixel_offset [x];
596- int result = rounder;
595+ int resultx8[ 8 ] = { rounder, 0 , 0 , 0 , 0 , 0 , 0 , 0 },resultx4[ 4 ] = { 0 , 0 , 0 , 0 } ;
597596
598597 for (int i = 0 ; i < ksmod8; i += 8 )
599598 {
600- result += ((short )src2_ptr[i])*current_coeff[i];
601- result += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
602- result += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
603- result += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
604- result += ((short )src2_ptr[i+4 ])*current_coeff[i+4 ];
605- result += ((short )src2_ptr[i+5 ])*current_coeff[i+5 ];
606- result += ((short )src2_ptr[i+6 ])*current_coeff[i+6 ];
607- result += ((short )src2_ptr[i+7 ])*current_coeff[i+7 ];
599+ resultx8[ 0 ] += ((short )src2_ptr[i])*current_coeff[i];
600+ resultx8[ 1 ] += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
601+ resultx8[ 2 ] += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
602+ resultx8[ 3 ] += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
603+ resultx8[ 4 ] += ((short )src2_ptr[i+4 ])*current_coeff[i+4 ];
604+ resultx8[ 5 ] += ((short )src2_ptr[i+5 ])*current_coeff[i+5 ];
605+ resultx8[ 6 ] += ((short )src2_ptr[i+6 ])*current_coeff[i+6 ];
606+ resultx8[ 7 ] += ((short )src2_ptr[i+7 ])*current_coeff[i+7 ];
608607 }
608+
609+ int result_singlex8 = resultx8[0 ]+resultx8[1 ]+resultx8[2 ]+resultx8[3 ]+resultx8[4 ]
610+ +resultx8[5 ]+resultx8[6 ]+resultx8[7 ];
611+
609612 for (int i = ksmod8; i < ksmod4; i += 4 )
610613 {
611- result += ((short )src2_ptr[i])*current_coeff[i];
612- result += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
613- result += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
614- result += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
614+ resultx4[ 0 ] += ((short )src2_ptr[i])*current_coeff[i];
615+ resultx4[ 1 ] += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
616+ resultx4[ 2 ] += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
617+ resultx4[ 3 ] += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
615618 }
619+
620+ int result_singlex4 = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
621+ int result_single = result_singlex8 + result_singlex4;
622+
616623 for (int i = ksmod4; i < kernel_size; i++)
617- result += ((short )src2_ptr[i])*current_coeff[i];
624+ result_single += ((short )src2_ptr[i])*current_coeff[i];
618625
619- result = result >> FPScale8bits;
620- result = (result >TabMax[x & 0x03 ]) ? TabMax[x & 0x3 ] : (result <16 ) ? 16 : result ;
621- dst2_ptr[x] = (BYTE) result ;
626+ result_single = result_single >> FPScale8bits;
627+ result_single = (result_single >TabMax[x & 0x03 ]) ? TabMax[x & 0x3 ] : (result_single <16 ) ? 16 : result_single ;
628+ dst2_ptr[x] = (BYTE)result_single ;
622629
623630 current_coeff+=filter_size;
624631 }
@@ -637,32 +644,40 @@ static void resize_h_c_planar_u8(BYTE* dst8, const BYTE* src8, int dst_pitch, in
637644 for (int x = 0 ; x < width; x++)
638645 {
639646 const BYTE* JPSDR_RESTRICT src2_ptr = src_ptr + program->pixel_offset [x];
640- int result = rounder;
647+ int resultx8[ 8 ] = { rounder, 0 , 0 , 0 , 0 , 0 , 0 , 0 },resultx4[ 4 ] = { 0 , 0 , 0 , 0 } ;
641648
642649 for (int i = 0 ; i < ksmod8; i += 8 )
643650 {
644- result += ((short )src2_ptr[i])*current_coeff[i];
645- result += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
646- result += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
647- result += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
648- result += ((short )src2_ptr[i+4 ])*current_coeff[i+4 ];
649- result += ((short )src2_ptr[i+5 ])*current_coeff[i+5 ];
650- result += ((short )src2_ptr[i+6 ])*current_coeff[i+6 ];
651- result += ((short )src2_ptr[i+7 ])*current_coeff[i+7 ];
651+ resultx8[ 0 ] += ((short )src2_ptr[i])*current_coeff[i];
652+ resultx8[ 1 ] += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
653+ resultx8[ 2 ] += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
654+ resultx8[ 3 ] += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
655+ resultx8[ 4 ] += ((short )src2_ptr[i+4 ])*current_coeff[i+4 ];
656+ resultx8[ 5 ] += ((short )src2_ptr[i+5 ])*current_coeff[i+5 ];
657+ resultx8[ 6 ] += ((short )src2_ptr[i+6 ])*current_coeff[i+6 ];
658+ resultx8[ 7 ] += ((short )src2_ptr[i+7 ])*current_coeff[i+7 ];
652659 }
660+
661+ int result_singlex8 = resultx8[0 ]+resultx8[1 ]+resultx8[2 ]+resultx8[3 ]+resultx8[4 ]
662+ +resultx8[5 ]+resultx8[6 ]+resultx8[7 ];
663+
653664 for (int i = ksmod8; i < ksmod4; i += 4 )
654665 {
655- result += ((short )src2_ptr[i])*current_coeff[i];
656- result += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
657- result += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
658- result += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
666+ resultx4[ 0 ] += ((short )src2_ptr[i])*current_coeff[i];
667+ resultx4[ 1 ] += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
668+ resultx4[ 2 ] += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
669+ resultx4[ 3 ] += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
659670 }
671+
672+ int result_singlex4 = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
673+ int result_single = result_singlex8 + result_singlex4;
674+
660675 for (int i = ksmod4; i < kernel_size; i++)
661- result += ((short )src2_ptr[i])*current_coeff[i];
676+ result_single += ((short )src2_ptr[i])*current_coeff[i];
662677
663- result = result >> FPScale8bits;
664- result = (result >val_max) ? val_max : (result <val_min) ? val_min : result ;
665- dst2_ptr[x] = (BYTE) result ;
678+ result_single = result_single >> FPScale8bits;
679+ result_single = (result_single >val_max) ? val_max : (result_single <val_min) ? val_min : result_single ;
680+ dst2_ptr[x] = (BYTE)result_single ;
666681
667682 current_coeff+=filter_size;
668683 }
@@ -710,39 +725,43 @@ static void resize_h_c_planar_u16(BYTE* dst8, const BYTE* src8, int dst_pitch, i
710725
711726 // theoretically, no need for int64 accumulator,
712727 // sum of coeffs is 1.0 that is (1 << FPScale16bits) in integer arithmetic
713- int result= rounder;
728+ int result_single,resultx4[ 4 ] = { rounder, 0 , 0 , 0 } ;
714729
715730 if JPSDR_CONSTEXPR (!lessthan16bit)
716731 {
717732 for (int i = 0 ; i < ksmod4; i += 4 )
718733 {
719- result += ((int ) src2_ptr[i]+shifttosigned_short)*current_coeff[i];
720- result += ((int ) src2_ptr[i+1 ]+shifttosigned_short)*current_coeff[i+1 ];
721- result += ((int ) src2_ptr[i+2 ]+shifttosigned_short)*current_coeff[i+2 ];
722- result += ((int ) src2_ptr[i+3 ]+shifttosigned_short)*current_coeff[i+3 ];
734+ resultx4[ 0 ] += ((short )( src2_ptr[i]+shifttosigned_short) )*current_coeff[i];
735+ resultx4[ 1 ] += ((short )( src2_ptr[i+1 ]+shifttosigned_short) )*current_coeff[i+1 ];
736+ resultx4[ 2 ] += ((short )( src2_ptr[i+2 ]+shifttosigned_short) )*current_coeff[i+2 ];
737+ resultx4[ 3 ] += ((short )( src2_ptr[i+3 ]+shifttosigned_short) )*current_coeff[i+3 ];
723738 }
724-
739+
740+ result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
741+
725742 for (int i = ksmod4; i < kernel_size; i++)
726- result += ((int ) src2_ptr[i]+shifttosigned_short)*current_coeff[i];
727- result += shiftfromsigned_int;
743+ result_single += ((short )( src2_ptr[i]+shifttosigned_short) )*current_coeff[i];
744+ result_single += shiftfromsigned_int;
728745 }
729746 else
730747 {
731748 for (int i = 0 ; i < ksmod4; i += 4 )
732749 {
733- result += ((int )src2_ptr[i])*current_coeff[i];
734- result += ((int )src2_ptr[i+1 ])*current_coeff[i+1 ];
735- result += ((int )src2_ptr[i+2 ])*current_coeff[i+2 ];
736- result += ((int )src2_ptr[i+3 ])*current_coeff[i+3 ];
750+ resultx4[ 0 ] += ((short )src2_ptr[i])*current_coeff[i];
751+ resultx4[ 1 ] += ((short )src2_ptr[i+1 ])*current_coeff[i+1 ];
752+ resultx4[ 2 ] += ((short )src2_ptr[i+2 ])*current_coeff[i+2 ];
753+ resultx4[ 3 ] += ((short )src2_ptr[i+3 ])*current_coeff[i+3 ];
737754 }
738-
755+
756+ result_single = resultx4[0 ]+resultx4[1 ]+resultx4[2 ]+resultx4[3 ];
757+
739758 for (int i = ksmod4; i < kernel_size; i++)
740- result += ((int )src2_ptr[i])*current_coeff[i];
759+ result_single += ((short )src2_ptr[i])*current_coeff[i];
741760 }
742761
743- result = result >> FPScale16bits;
744- result = (result >val_max) ? val_max : (result <val_min) ? val_min : result ;
745- dst2_ptr[x] = (uint16_t )result ;
762+ result_single = result_single >> FPScale16bits;
763+ result_single = (result_single >val_max) ? val_max : (result_single <val_min) ? val_min : result_single ;
764+ dst2_ptr[x] = (uint16_t )result_single ;
746765
747766 current_coeff += filter_size;
748767 }
0 commit comments