Skip to content

Commit

Permalink
Enable AVX512 embedded masking for most other intrinsics (dotnet#101886)
Browse files Browse the repository at this point in the history
* Remove HW_Flag_MultiIns in favor of using HW_Flag_SpecialCodeGen

* Add a new flag HW_Flag_InvalidNodeId

* Change HW_Flag_EmbMaskingIncompatible to be HW_Flag_EmbMaskingCompatible

* Mark various compare intrinsics with HW_Flag_NoEvexSemantics

* Marking various intrinsics as EmbBroadcastCompatible, EmbMaskingCompatible, or Commutative

* Applying formatting patch

* Ensure WithLower/WithUpper are not marked as InvalidNodeId

* Ensure that instOptions are being passed down all relevant hwintrinsic code paths

* Ensure the insOpts are plumbed through for EVEX instructions

* Ensure EVEX instructions are properly annotated with EmbeddedBroadcastSupported

* Ensure that embedded broadcast/masking is displayed in the disassembly

* Applying formatting patch

* Updating the hwintrinsic tests to cover embedded broadcast/masking

* Fix some handling in the JIT related to embedded broadcast/masking

* Fixup some tests where validating embedded masking is non-trivial

* Cleanup some cases found by SPMI

* Ensure that CompareLessThan has its operands swapped back if it's being converted to the AVX512 form

* Don't regress a scenario around op_Equality and TYP_MASK

* Adjusting hardware intrinsic tests to test non-zero masks

* Avoid some messiness around operand swapping

* Ensure embedded masks mark TYP_SIMD16 and TYP_SIMD32 instructions as needing EVEX

* Mark Sse2_r/Sse2_ro as AotIncompatible due to runtime/102037
  • Loading branch information
tannergooding authored and Ruihan-Yin committed May 30, 2024
1 parent ea99b50 commit 6b83110
Show file tree
Hide file tree
Showing 37 changed files with 4,723 additions and 1,943 deletions.
47 changes: 26 additions & 21 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -959,38 +959,35 @@ class CodeGen final : public CodeGenInterface
#ifdef FEATURE_HW_INTRINSICS
void genHWIntrinsic(GenTreeHWIntrinsic* node);
#if defined(TARGET_XARCH)
void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node,
instruction ins,
emitAttr attr,
regNumber reg,
GenTree* rmOp,
insOpts instOptions = INS_OPTS_NONE);
void genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
void genHWIntrinsic_R_RM(
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber reg, GenTree* rmOp, insOpts instOptions);
void genHWIntrinsic_R_RM_I(
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions);
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr);
void genHWIntrinsic_R_R_RM_I(
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, insOpts instOptions);
void genHWIntrinsic_R_R_R_RM(instruction ins,
emitAttr attr,
regNumber targetReg,
regNumber op1Reg,
regNumber op2Reg,
GenTree* op3,
insOpts instOptions = INS_OPTS_NONE);
void genHWIntrinsic_R_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
insOpts instOptions);
void genHWIntrinsic_R_R_R_RM_I(
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival, insOpts instOptions);

void genBaseIntrinsic(GenTreeHWIntrinsic* node);
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node);
void genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSEIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSE2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genAESIntrinsic(GenTreeHWIntrinsic* node);
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genPermuteVar2x(GenTreeHWIntrinsic* node);
void genPermuteVar2x(GenTreeHWIntrinsic* node, insOpts instOptions);
void genLZCNTIntrinsic(GenTreeHWIntrinsic* node);
void genPCLMULQDQIntrinsic(GenTreeHWIntrinsic* node);
void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node);
void genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins);
void genX86SerializeIntrinsic(GenTreeHWIntrinsic* node);
Expand All @@ -1003,6 +1000,8 @@ class CodeGen final : public CodeGenInterface
HWIntrinsicSwitchCaseBody emitSwCase);

void genNonTableDrivenHWIntrinsicsJumpTableFallback(GenTreeHWIntrinsic* node, GenTree* lastOp);

static insOpts AddEmbBroadcastMode(insOpts instOptions);
#endif // defined(TARGET_XARCH)

#ifdef TARGET_ARM64
Expand Down Expand Up @@ -1576,16 +1575,22 @@ class CodeGen final : public CodeGenInterface
void inst_TT(instruction ins, emitAttr size, GenTree* op1);
void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2);
void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival, insOpts instOptions);
void inst_RV_RV_TT(instruction ins,
emitAttr size,
regNumber targetReg,
regNumber op1Reg,
GenTree* op2,
bool isRMW,
insOpts instOptions);
void inst_RV_RV_TT_IV(
instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW);
void inst_RV_RV_TT_IV(instruction ins,
emitAttr size,
regNumber targetReg,
regNumber op1Reg,
GenTree* op2,
int8_t ival,
bool isRMW,
insOpts instOptions);
#endif

void inst_set_SV_var(GenTree* tree);
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3856,7 +3856,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
}
#elif defined(TARGET_XARCH)
// XORPS is the fastest and smallest way to initialize an XMM register to zero.
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg, INS_OPTS_NONE);
dblInitReg = reg;
#elif defined(TARGET_ARM64)
// We will just zero out the entire vector register. This sets it to a double/float zero value
Expand Down Expand Up @@ -3896,7 +3896,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
}
#elif defined(TARGET_XARCH)
// XORPS is the fastest and smallest way to initialize an XMM register to zero.
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg, INS_OPTS_NONE);
fltInitReg = reg;
#elif defined(TARGET_ARM64)
// We will just zero out the entire vector register. This sets it to a double/float zero value
Expand Down
Loading

0 comments on commit 6b83110

Please sign in to comment.