Skip to content

Commit b1cd0f4

Browse files
authored
JIT: Support bitwise field extractions from parameter registers (dotnet#112740)
Recent work now allows us to finally add support for the backend to extract fields out of parameters without spilling them to stack. Previously this was only supported when the fields mapped cleanly to registers. A win-x64 example: ```csharp static int Foo(int? foo) { return foo.HasValue ? foo.Value : 0; } ``` ```diff ; Method Program:Foo(System.Nullable`1[int]):int (FullOpts) G_M19236_IG01: ;; offset=0x0000 - mov qword ptr [rsp+0x08], rcx - ;; size=5 bbWeight=0.50 PerfScore 0.50 + ;; size=0 bbWeight=0.50 PerfScore 0.00 -G_M19236_IG02: ;; offset=0x0005 - movzx rcx, cl - xor eax, eax - test ecx, ecx - cmovne eax, dword ptr [rsp+0x0C] - ;; size=12 bbWeight=0.50 PerfScore 1.38 +G_M19236_IG02: ;; offset=0x0000 + movzx rax, cl + shr rcx, 32 + xor edx, edx + test eax, eax + mov eax, edx + cmovne eax, ecx + ;; size=16 bbWeight=0.50 PerfScore 0.88 -G_M19236_IG03: ;; offset=0x0011 +G_M19236_IG03: ;; offset=0x0010 ret ;; size=1 bbWeight=0.50 PerfScore 0.50 -; Total bytes of code: 18 - +; Total bytes of code: 17 ``` Another win-x64 example: ```csharp static float Sum(PointF p) { return p.X + p.Y; } ``` ```diff ; Method Program:Sum(System.Drawing.PointF):float (FullOpts) G_M48891_IG01: ;; offset=0x0000 - mov qword ptr [rsp+0x08], rcx - ;; size=5 bbWeight=1 PerfScore 1.00 + ;; size=0 bbWeight=1 PerfScore 0.00 -G_M48891_IG02: ;; offset=0x0005 - vmovss xmm0, dword ptr [rsp+0x08] - vaddss xmm0, xmm0, dword ptr [rsp+0x0C] - ;; size=12 bbWeight=1 PerfScore 8.00 +G_M48891_IG02: ;; offset=0x0000 + vmovd xmm0, ecx + shr rcx, 32 + vmovd xmm1, ecx + vaddss xmm0, xmm0, xmm1 + ;; size=16 bbWeight=1 PerfScore 7.50 -G_M48891_IG03: ;; offset=0x0011 +G_M48891_IG03: ;; offset=0x0010 ret ;; size=1 bbWeight=1 PerfScore 1.00 -; Total bytes of code: 18 +; Total bytes of code: 17 ``` An arm64 example: ```csharp static bool Test(Memory<int> mem) { return mem.Length > 10; } ``` ```diff ; Method Program:Test(System.Memory`1[int]):ubyte (FullOpts) G_M53448_IG01: ;; offset=0x0000 - stp fp, lr, [sp, #-0x20]! + stp fp, lr, [sp, #-0x10]! mov fp, sp - stp x0, x1, [fp, #0x10] // [V00 arg0], [V00 arg0+0x08] - ;; size=12 bbWeight=1 PerfScore 2.50 + ;; size=8 bbWeight=1 PerfScore 1.50 -G_M53448_IG02: ;; offset=0x000C - ldr w0, [fp, #0x1C] // [V00 arg0+0x0c] +G_M53448_IG02: ;; offset=0x0008 + lsr x0, x1, dotnet#32 cmp w0, #10 cset x0, gt - ;; size=12 bbWeight=1 PerfScore 3.00 + ;; size=12 bbWeight=1 PerfScore 2.00 -G_M53448_IG03: ;; offset=0x0018 - ldp fp, lr, [sp], #0x20 +G_M53448_IG03: ;; offset=0x0014 + ldp fp, lr, [sp], #0x10 ret lr ;; size=8 bbWeight=1 PerfScore 2.00 -; Total bytes of code: 32 +; Total bytes of code: 28 ``` Float -> float extractions that do not map cleanly is still not supported, but should be doable (via vector register extractions). Float -> int extractions are not supported, but I'm not sure we see these. This is often not a code size improvement, but typically a perfscore improvement. Also this seems to have some bad interactions with call arguments since they do not yet support something similar, but hopefully that can be improved separately.
1 parent fde7c8f commit b1cd0f4

File tree

2 files changed

+108
-89
lines changed

2 files changed

+108
-89
lines changed

src/coreclr/jit/lower.cpp

Lines changed: 93 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -8379,34 +8379,6 @@ void Lowering::WidenSIMD12IfNecessary(GenTreeLclVarCommon* node)
83798379
#ifdef FEATURE_SIMD
83808380
if (node->TypeIs(TYP_SIMD12))
83818381
{
8382-
// Assumption 1:
8383-
// RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off
8384-
// to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
8385-
// reading and writing purposes.
8386-
//
8387-
// Assumption 2:
8388-
// RyuJit backend is making another implicit assumption that Vector3 type args when passed in
8389-
// registers or on stack, the upper most 4-bytes will be zero.
8390-
//
8391-
// For P/Invoke return and Reverse P/Invoke argument passing, native compiler doesn't guarantee
8392-
// that upper 4-bytes of a Vector3 type struct is zero initialized and hence assumption 2 is
8393-
// invalid.
8394-
//
8395-
// RyuJIT x64 Windows: arguments are treated as passed by ref and hence read/written just 12
8396-
// bytes. In case of Vector3 returns, Caller allocates a zero initialized Vector3 local and
8397-
// passes it retBuf arg and Callee method writes only 12 bytes to retBuf. For this reason,
8398-
// there is no need to clear upper 4-bytes of Vector3 type args.
8399-
//
8400-
// RyuJIT x64 Unix: arguments are treated as passed by value and read/writen as if TYP_SIMD16.
8401-
// Vector3 return values are returned two return registers and Caller assembles them into a
8402-
// single xmm reg. Hence RyuJIT explicitly generates code to clears upper 4-bytes of Vector3
8403-
// type args in prolog and Vector3 type return value of a call
8404-
//
8405-
// RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments
8406-
// are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed
8407-
// as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear
8408-
// it either.
8409-
84108382
if (comp->lvaMapSimd12ToSimd16(node->AsLclVarCommon()->GetLclNum()))
84118383
{
84128384
JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
@@ -8706,20 +8678,22 @@ void Lowering::FindInducedParameterRegisterLocals()
87068678
assert(fld->GetLclOffs() <= comp->lvaLclExactSize(fld->GetLclNum()));
87078679
unsigned structAccessedSize =
87088680
min(genTypeSize(fld), comp->lvaLclExactSize(fld->GetLclNum()) - fld->GetLclOffs());
8709-
if ((segment.Offset != fld->GetLclOffs()) || (structAccessedSize > segment.Size) ||
8710-
(varTypeUsesIntReg(fld) != genIsValidIntReg(segment.GetRegister())))
8681+
if ((fld->GetLclOffs() < segment.Offset) ||
8682+
(fld->GetLclOffs() + structAccessedSize > segment.Offset + segment.Size))
87118683
{
87128684
continue;
87138685
}
87148686

8715-
// This is a match, but check if it is already remapped.
8716-
// TODO-CQ: If it is already remapped, we can reuse the value from
8717-
// the remapping.
8718-
if (comp->FindParameterRegisterLocalMappingByRegister(segment.GetRegister()) == nullptr)
8687+
// TODO-CQ: Float -> !float extractions are not supported
8688+
// TODO-CQ: Float -> float extractions with non-zero offset is not supported
8689+
if (genIsValidFloatReg(segment.GetRegister()) &&
8690+
(!varTypeUsesFloatReg(fld) || (fld->GetLclOffs() != segment.Offset)))
87198691
{
8720-
regSegment = &segment;
8692+
continue;
87218693
}
87228694

8695+
// Found a register segment this field is contained in
8696+
regSegment = &segment;
87238697
break;
87248698
}
87258699

@@ -8728,7 +8702,7 @@ void Lowering::FindInducedParameterRegisterLocals()
87288702
continue;
87298703
}
87308704

8731-
JITDUMP("LCL_FLD use [%06u] of unenregisterable parameter corresponds to ", Compiler::dspTreeID(fld));
8705+
JITDUMP("LCL_FLD use [%06u] of unenregisterable parameter is contained in ", Compiler::dspTreeID(fld));
87328706
DBEXEC(VERBOSE, regSegment->Dump());
87338707
JITDUMP("\n");
87348708

@@ -8742,76 +8716,109 @@ void Lowering::FindInducedParameterRegisterLocals()
87428716
continue;
87438717
}
87448718

8745-
unsigned remappedLclNum = TryReuseLocalForParameterAccess(use, storedToLocals);
8719+
const ParameterRegisterLocalMapping* existingMapping =
8720+
comp->FindParameterRegisterLocalMappingByRegister(regSegment->GetRegister());
87468721

8747-
if (remappedLclNum == BAD_VAR_NUM)
8722+
unsigned remappedLclNum = BAD_VAR_NUM;
8723+
if (existingMapping == nullptr)
87488724
{
8749-
// If we have seen register kills then avoid creating a new local.
8750-
// The local is going to have to move from the parameter register
8751-
// into a callee saved register, and the callee saved register will
8752-
// need to be saved/restored to/from stack anyway.
8753-
if (hasRegisterKill)
8725+
remappedLclNum = comp->lvaGrabTemp(
8726+
false DEBUGARG(comp->printfAlloc("V%02u.%s", fld->GetLclNum(), getRegName(regSegment->GetRegister()))));
8727+
8728+
// We always use the full width for integer registers even if the
8729+
// width is shorter, because various places in the JIT will type
8730+
// accesses larger to generate smaller code.
8731+
var_types registerType =
8732+
genIsValidIntReg(regSegment->GetRegister()) ? TYP_I_IMPL : regSegment->GetRegisterType();
8733+
if ((registerType == TYP_I_IMPL) && varTypeIsGC(fld))
87548734
{
8755-
JITDUMP(" ..but use happens after a call and is deemed unprofitable to create a local for\n");
8756-
continue;
8735+
registerType = fld->TypeGet();
87578736
}
87588737

8759-
remappedLclNum = comp->lvaGrabTemp(
8760-
false DEBUGARG(comp->printfAlloc("V%02u.%s", fld->GetLclNum(), getRegName(regSegment->GetRegister()))));
8761-
comp->lvaGetDesc(remappedLclNum)->lvType = fld->TypeGet();
8738+
LclVarDsc* varDsc = comp->lvaGetDesc(remappedLclNum);
8739+
varDsc->lvType = genActualType(registerType);
87628740
JITDUMP("Created new local V%02u for the mapping\n", remappedLclNum);
8741+
8742+
comp->m_paramRegLocalMappings->Emplace(regSegment, remappedLclNum, 0);
8743+
varDsc->lvIsParamRegTarget = true;
8744+
8745+
JITDUMP("New mapping: ");
8746+
DBEXEC(VERBOSE, regSegment->Dump());
8747+
JITDUMP(" -> V%02u\n", remappedLclNum);
87638748
}
87648749
else
87658750
{
8766-
JITDUMP("Reusing local V%02u for store from struct parameter register %s. Store:\n", remappedLclNum,
8767-
getRegName(regSegment->GetRegister()));
8768-
DISPTREERANGE(LIR::AsRange(comp->fgFirstBB), use.User());
8751+
remappedLclNum = existingMapping->LclNum;
8752+
}
87698753

8770-
// The store will be a no-op, so get rid of it
8771-
LIR::AsRange(comp->fgFirstBB).Remove(use.User(), true);
8772-
use = LIR::Use();
8754+
GenTree* value = comp->gtNewLclVarNode(remappedLclNum);
87738755

8774-
#ifdef DEBUG
8775-
LclVarDsc* remappedLclDsc = comp->lvaGetDesc(remappedLclNum);
8776-
remappedLclDsc->lvReason =
8777-
comp->printfAlloc("%s%sV%02u.%s", remappedLclDsc->lvReason == nullptr ? "" : remappedLclDsc->lvReason,
8778-
remappedLclDsc->lvReason == nullptr ? "" : " ", fld->GetLclNum(),
8779-
getRegName(regSegment->GetRegister()));
8756+
if (varTypeUsesFloatReg(value))
8757+
{
8758+
assert(fld->GetLclOffs() == regSegment->Offset);
8759+
8760+
value->gtType = fld->TypeGet();
8761+
8762+
#ifdef FEATURE_SIMD
8763+
// SIMD12s should be widened. We cannot do that with
8764+
// WidenSIMD12IfNecessary as it does not expect to see SIMD12
8765+
// accesses of SIMD16 locals here.
8766+
if (value->TypeIs(TYP_SIMD12))
8767+
{
8768+
value->gtType = TYP_SIMD16;
8769+
}
87808770
#endif
87818771
}
8772+
else
8773+
{
8774+
var_types registerType = value->TypeGet();
87828775

8783-
comp->m_paramRegLocalMappings->Emplace(regSegment, remappedLclNum, 0);
8784-
comp->lvaGetDesc(remappedLclNum)->lvIsParamRegTarget = true;
8776+
if (fld->GetLclOffs() > regSegment->Offset)
8777+
{
8778+
assert(value->TypeIs(TYP_INT, TYP_LONG));
8779+
GenTree* shiftAmount = comp->gtNewIconNode((fld->GetLclOffs() - regSegment->Offset) * 8, TYP_INT);
8780+
value = comp->gtNewOperNode(varTypeIsSmall(fld) && varTypeIsSigned(fld) ? GT_RSH : GT_RSZ,
8781+
value->TypeGet(), value, shiftAmount);
8782+
}
87858783

8786-
JITDUMP("New mapping: ");
8787-
DBEXEC(VERBOSE, regSegment->Dump());
8788-
JITDUMP(" -> V%02u\n", remappedLclNum);
8784+
// Insert explicit normalization for small types (the LCL_FLD we
8785+
// are replacing comes with this normalization). This is only required
8786+
// if we didn't get the normalization via a right shift.
8787+
if (varTypeIsSmall(fld) && (regSegment->Offset + genTypeSize(fld) != genTypeSize(registerType)))
8788+
{
8789+
value = comp->gtNewCastNode(TYP_INT, value, false, fld->TypeGet());
8790+
}
87898791

8790-
// Insert explicit normalization for small types (the LCL_FLD we
8791-
// are replacing comes with this normalization).
8792-
if (varTypeIsSmall(fld))
8793-
{
8794-
GenTree* lcl = comp->gtNewLclvNode(remappedLclNum, genActualType(fld));
8795-
GenTree* normalizeLcl = comp->gtNewCastNode(TYP_INT, lcl, false, fld->TypeGet());
8796-
GenTree* storeNormalizedLcl = comp->gtNewStoreLclVarNode(remappedLclNum, normalizeLcl);
8797-
LIR::AsRange(comp->fgFirstBB).InsertAtBeginning(LIR::SeqTree(comp, storeNormalizedLcl));
8792+
// If the node is still too large then get it to the right size
8793+
if (genTypeSize(value) != genTypeSize(genActualType((fld))))
8794+
{
8795+
assert(genTypeSize(value) == 8);
8796+
assert(genTypeSize(genActualType(fld)) == 4);
87988797

8799-
JITDUMP("Parameter normalization:\n");
8800-
DISPTREERANGE(LIR::AsRange(comp->fgFirstBB), storeNormalizedLcl);
8801-
}
8798+
if (value->OperIsScalarLocal())
8799+
{
8800+
// We can use lower bits directly
8801+
value->gtType = TYP_INT;
8802+
}
8803+
else
8804+
{
8805+
value = comp->gtNewCastNode(TYP_INT, value, false, TYP_INT);
8806+
}
8807+
}
88028808

8803-
// If we still have a valid use, then replace the LCL_FLD with a
8804-
// LCL_VAR of the remapped parameter register local.
8805-
if (use.IsInitialized())
8806-
{
8807-
GenTree* lcl = comp->gtNewLclvNode(remappedLclNum, genActualType(fld));
8808-
LIR::AsRange(comp->fgFirstBB).InsertAfter(fld, lcl);
8809-
use.ReplaceWith(lcl);
8810-
LowerNode(lcl);
8811-
JITDUMP("New user tree range:\n");
8812-
DISPTREERANGE(LIR::AsRange(comp->fgFirstBB), use.User());
8809+
// Finally insert a bitcast if necessary
8810+
if (value->TypeGet() != genActualType(fld))
8811+
{
8812+
value = comp->gtNewBitCastNode(genActualType(fld), value);
8813+
}
88138814
}
88148815

8816+
// Now replace the LCL_FLD.
8817+
LIR::AsRange(comp->fgFirstBB).InsertAfter(fld, LIR::SeqTree(comp, value));
8818+
use.ReplaceWith(value);
8819+
JITDUMP("New user tree range:\n");
8820+
DISPTREERANGE(LIR::AsRange(comp->fgFirstBB), use.User());
8821+
88158822
fld->gtBashToNOP();
88168823
}
88178824
}

src/coreclr/jit/promotion.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3053,11 +3053,23 @@ bool Promotion::MapsToParameterRegister(Compiler* comp, unsigned lclNum, unsigne
30533053

30543054
for (const ABIPassingSegment& seg : abiInfo.Segments())
30553055
{
3056-
if ((offset == seg.Offset) && (genTypeSize(accessType) == seg.Size) &&
3057-
(varTypeUsesIntReg(accessType) == genIsValidIntReg(seg.GetRegister())))
3056+
// This code corresponds to code in Lower::FindInducedParameterRegisterLocals
3057+
if ((offset < seg.Offset) || (offset + genTypeSize(accessType) > seg.Offset + seg.Size))
30583058
{
3059-
return true;
3059+
continue;
3060+
}
3061+
3062+
if (!genIsValidIntReg(seg.GetRegister()) && varTypeUsesFloatReg(accessType))
3063+
{
3064+
continue;
30603065
}
3066+
3067+
if (genIsValidFloatReg(seg.GetRegister()) && (offset != seg.Offset))
3068+
{
3069+
continue;
3070+
}
3071+
3072+
return true;
30613073
}
30623074

30633075
return false;

0 commit comments

Comments
 (0)