@@ -479,290 +479,6 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
  return ret;
}

- #pragma push_macro("__INTRINSIC_LOAD")
- #define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
-                          __Clobber) \
-   inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
-     __TmpType __ret; \
-     asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
-     return (__DeclType)__ret; \
-   }
-
- #pragma push_macro("__INTRINSIC_LOAD2")
- #define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
-                           __Clobber) \
-   inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
-     __DeclType __ret; \
-     __TmpType __tmp; \
-     asm(__AsmOp " {%0,%1}, [%2];" \
-         : __AsmType(__tmp.x), __AsmType(__tmp.y) \
-         : "l"(__ptr)__Clobber); \
-     using __ElementType = decltype(__ret.x); \
-     __ret.x = (__ElementType)(__tmp.x); \
-     __ret.y = (__ElementType)__tmp.y; \
-     return __ret; \
-   }
-
- #pragma push_macro("__INTRINSIC_LOAD4")
- #define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
-                           __Clobber) \
-   inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
-     __DeclType __ret; \
-     __TmpType __tmp; \
-     asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
-         : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
-           __AsmType(__tmp.w) \
-         : "l"(__ptr)__Clobber); \
-     using __ElementType = decltype(__ret.x); \
-     __ret.x = (__ElementType)__tmp.x; \
-     __ret.y = (__ElementType)__tmp.y; \
-     __ret.z = (__ElementType)__tmp.z; \
-     __ret.w = (__ElementType)__tmp.w; \
-     return __ret; \
-   }
-
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
-                  "=l", );
-
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l", );
-
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
-                  "=r", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
-                  "=h", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
-                  "=r", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
-                  unsigned long long, "=l", );
-
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
-                   "=l", );
-
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
- __INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
- __INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
- __INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
-
- inline __device__ long __ldcg(const long *__ptr) {
-   unsigned long __ret;
-   if (sizeof(long) == 8) {
-     asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-   } else {
-     asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-   }
-   return (long)__ret;
- }
-
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
-                  "=r", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
-                  "=h", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
-                  "=r", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
-                  unsigned long long, "=l", : "memory");
-
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
-                  "=r", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
-                  "=r", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
-                  "=h", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
-                  "=r", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
-                  "=l", : "memory");
-
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
-                   "=r", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
-                   "=r", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
-                   "=h", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
-                   "=h", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
-                   "=r", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
-                   "=r", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
-                   "=l", : "memory");
-
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
-                   "=h", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
-                   "=h", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
-                   "=l", : "memory");
-
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
- __INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
-
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
-                   "=f", : "memory");
- __INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
-                   "=f", : "memory");
- __INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
-                   "=d", : "memory");
-
- inline __device__ long __ldcv(const long *__ptr) {
-   unsigned long __ret;
-   if (sizeof(long) == 8) {
-     asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-   } else {
-     asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-   }
-   return (long)__ret;
- }
-
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
-                  "=l", );
-
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
-
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
-                  "=r", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
-                  "=h", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
-                  "=r", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
-                  unsigned long long, "=l", );
-
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
-                   "=l", );
-
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
- __INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
- __INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
- __INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
-
- #pragma pop_macro("__INTRINSIC_LOAD")
- #pragma pop_macro("__INTRINSIC_LOAD2")
- #pragma pop_macro("__INTRINSIC_LOAD4")
-
- inline __device__ long __ldcs(const long *__ptr) {
-   unsigned long __ret;
-   if (sizeof(long) == 8) {
-     asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
-   } else {
-     asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
-   }
-   return (long)__ret;
- }
-
- #pragma push_macro("__INTRINSIC_STORE")
- #define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
-   inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
-     __TmpType __tmp = (__TmpType)__value; \
-     asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
-   }
-
- #pragma push_macro("__INTRINSIC_STORE2")
- #define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \
-                            __AsmType) \
-   inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
-     __TmpType __tmp; \
-     using __ElementType = decltype(__tmp.x); \
-     __tmp.x = (__ElementType)(__value.x); \
-     __tmp.y = (__ElementType)(__value.y); \
-     asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
-         __AsmType(__tmp.y) \
-         : "memory"); \
-   }
-
- #pragma push_macro("__INTRINSIC_STORE4")
- #define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \
-                            __AsmType) \
-   inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
-     __TmpType __tmp; \
-     using __ElementType = decltype(__tmp.x); \
-     __tmp.x = (__ElementType)(__value.x); \
-     __tmp.y = (__ElementType)(__value.y); \
-     __tmp.z = (__ElementType)(__value.z); \
-     __tmp.w = (__ElementType)(__value.w); \
-     asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
-         __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
-         : "memory"); \
-   }
-
- __INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
- __INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
- __INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
- __INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
- __INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
-
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
-
- __INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
- __INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
-                   "h");
- __INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
- __INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
-                   unsigned long long, "l");
-
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
-
- __INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
- __INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
- __INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
- __INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
-
- #pragma pop_macro("__INTRINSIC_STORE")
- #pragma pop_macro("__INTRINSIC_STORE2")
- #pragma pop_macro("__INTRINSIC_STORE4")
-
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
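
For reference, a minimal hand-expanded sketch of one of the removed load intrinsics, assuming the `__INTRINSIC_LOAD` macro definition shown above; the `int`/`"=r"` instantiation of `__ldcg` expands to roughly the following (this expansion is illustrative and not itself part of the header):

```cuda
// Sketch: expansion of
//   __INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
// The empty __Clobber argument means no clobber list is appended to the asm.
inline __device__ int __ldcg(const int *__ptr) {
  unsigned int __ret;
  // .cg load: cache the global access at the L2 level, bypassing L1.
  asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
  return (int)__ret;
}
```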