[X86] Move the AVX512 VSELECT(COND, 0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG #145724
base: main
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Initial attempt to move the fold out of the X86ISelLowering combine and into DAGToDAG preprocessing, to avoid the regressions identified in #145473. It still doesn't handle predicate widening, which might not be very pretty.

Patch is 38.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145724.diff

7 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 32c7d2bfea6c2..768f033356959 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1139,24 +1139,51 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
break;
}
case ISD::VSELECT: {
- // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
- EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
- if (EleVT == MVT::i1)
- break;
-
- assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
- "We can't replace VSELECT with BLENDV in vXi16!");
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT CondVT = Cond.getValueType();
+ EVT EleVT = CondVT.getVectorElementType();
SDValue R;
- if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
- EleVT.getSizeInBits()) {
- R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1), N->getOperand(2),
- CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+
+ if (EleVT == MVT::i1) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
+ if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(RHS.getNode()))
+ break;
+ // If this an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ // vselect cond, op1, op2 = vselect not(cond), op2, op1
+ if (Cond.getOpcode() == ISD::SETCC &&
+ !ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
+ R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
+ Cond.getOperand(1), CC);
+ } else if (Cond.getOpcode() == X86ISD::CMPM &&
+ Cond.getConstantOperandVal(2) == 0) {
+ // FLIP FCMP EQ -> (U)NE
+ R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
+ Cond.getOperand(0), Cond.getOperand(1),
+ CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
+ }
+ R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
} else {
- R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1),
- N->getOperand(2));
+ // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
+ "We can't replace VSELECT with BLENDV in vXi16!");
+ if (Subtarget->hasVLX() &&
+ CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
+ R = CurDAG->getNode(
+ X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
+ CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
+ } else {
+ R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ Cond, LHS, RHS);
+ }
}
--I;
CurDAG->ReplaceAllUsesWith(N, R.getNode());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c26dd6e2dc2f..ac9097f8f40af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48049,19 +48049,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- // Check if the first operand is all zeros and Cond type is vXi1.
- // If this an avx512 target we can improve the use of zero masking by
- // swapping the operands and inverting the condition.
- if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
- Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
- ISD::isBuildVectorAllZeros(LHS.getNode()) &&
- !ISD::isBuildVectorAllZeros(RHS.getNode())) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
// get split by legalization.
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 210513fe31783..5843fc11f95d1 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1564,7 +1564,8 @@ define <2 x i32> @narrow_cmp_select_reverse(<2 x i64> %x, <2 x i32> %y) nounwind
;
; SKX-LABEL: narrow_cmp_select_reverse:
; SKX: ## %bb.0:
-; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc8]
+; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc0]
+; SKX-NEXT: knotw %k0, %k1 ## encoding: [0xc5,0xf8,0x44,0xc8]
; SKX-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xc1]
; SKX-NEXT: retq ## encoding: [0xc3]
%mask = icmp eq <2 x i64> %x, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
index 96c8e773d5edd..1997323ed61a6 100644
--- a/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
+++ b/llvm/test/CodeGen/X86/extract-vselect-setcc.ll
@@ -5,7 +5,8 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
; CHECK-LABEL: PR117684:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
+; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
+; CHECK-NEXT: knotb %k0, %k1
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b56..cc3aee4feba2d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -981,9 +981,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX512-LABEL: test14:
; AVX512: # %bb.0: # %vector.ph
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpmovdb %zmm1, %xmm3
; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpsubb %xmm0, %xmm3, %xmm0 {%k1} {z}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
vector.ph:
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index 7f4111e65cc17..067d4c569d276 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
+; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 283c6a303a581..f5287ecad7aef 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLF
@@ -143,27 +143,50 @@ define <4 x i64> @var_shuffle_zero_v4i64(<4 x i64> %v, <4 x i64> %indices) nounw
; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: var_shuffle_zero_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,3,3,3]
-; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
-; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k2
-; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
-; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: var_shuffle_zero_v4i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %k1
+; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: knotw %k1, %k1
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: var_shuffle_zero_v4i64:
+; AVX512DQ:...
[truncated]
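As a rough illustration of the identity the fold above is built on (a toy standalone sketch, not LLVM code — the lane count, types and the `vselect` helper are all made up for the example): `vselect cond, 0, x` selects the same lanes as `vselect not(cond), x, 0`, and the second form is exactly what an AVX512 zero-masked move (`{%k1} {z}`) expresses, which is why the combine swaps the operands and inverts the condition.

```cpp
// Toy 4-lane model of the VSELECT identity exploited by the fold.
// Not LLVM code: "vselect" here is just a scalar loop standing in for
// ISD::VSELECT, and the lane count/element types are arbitrary.
#include <array>
#include <cassert>
#include <cstdint>

using V4 = std::array<uint32_t, 4>;
using M4 = std::array<bool, 4>;

static V4 vselect(const M4 &m, const V4 &a, const V4 &b) {
  V4 r{};
  for (int i = 0; i < 4; ++i)
    r[i] = m[i] ? a[i] : b[i]; // lane-wise select, like ISD::VSELECT
  return r;
}

int main() {
  M4 m{true, false, true, false};
  V4 x{1, 2, 3, 4}, zero{};

  M4 notm{};
  for (int i = 0; i < 4; ++i)
    notm[i] = !m[i];

  // vselect m, 0, x  ==  vselect !m, x, 0.
  // The right-hand form maps directly onto an AVX512 zero-masked move,
  // e.g. "vmovdqa32 %xmm_src, %xmm_dst {%k1} {z}" with !m held in %k1.
  assert(vselect(m, zero, x) == vselect(notm, x, zero));
  return 0;
}
```

How cheaply `not(cond)` can be produced — folded back into the SETCC/CMPM versus an explicit `knot` — is what the test diffs above illustrate.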
; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
; CHECK-NEXT: knotb %k0, %k1
Seems we should take care of scalar select too?
Do we have a helper to invert FCMP condition codes? We have one for swapping/commutation, but I can't find anything for NOT(FCMP(X,Y,I)) -> FCMP(X,Y,NOT(I)).
Doesn't getSetCCInverseImpl work for FCMP?
No, different encodings for CondCode
@topperc IIRC you did some work to clean this up?
I'm going to tentatively just flip using "CC ^ 4" but I'm not certain if all the inf/nan handling will be preserved?
"CC ^ 4" looks correct to me.
@davemgreen this should remove the need for the addition of isTargetCanonicalSelect in #145473 - please can you confirm?
LGTM.
Hi - Yes, I gave this a check and it looks like that will be OK, if the vselect zero DAG combine is removed. Thanks for doing this. I will say that this looks like it creates a new layer of combining between final optimization and selection - one that could get arbitrarily complex in the extreme. It does not feel to me like the new isTargetCanonicalSelect method in #145473 is a "hack", and it would seem like a better design overall to have the target able to control the canonicalizations in DAGCombining. If there is one that it does not want, or wants the opposite of, we shouldn't have a policy of not allowing targets a way to turn them off. But thanks for doing this.
Attempt to remove the fold out of ISel to avoid regressions identified in #145473