diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index bd15513f199d71..6247d3437ad7ff 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -438,6 +438,7 @@ class CodeGen final : public CodeGenInterface #if defined(TARGET_ARM64) void genUnknownSizeFrame(); + void genZeroInitializeUnknownSizeFrame(); #endif #elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) @@ -464,6 +465,9 @@ class CodeGen final : public CodeGenInterface void genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn); void genPoisonFrame(regMaskTP bbRegLiveIn); +#ifdef TARGET_ARM64 + void genPoisonUnknownSizeVariable(int varNum, char poisonVal); +#endif #if defined(TARGET_ARM) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 47ecfbea7dc7de..288629f254761d 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -6115,4 +6115,20 @@ BasicBlock* CodeGen::genGetThrowHelper(SpecialCodeKind codeKind) return excpRaisingBlock; } +// +void CodeGen::genPoisonUnknownSizeVariable(int varNum, char poisonVal) +{ + assert(varNum >= 0); + LclVarDsc* varDsc = m_compiler->lvaGetDesc(varNum); + + // Expect an address-exposed SIMD local: mask locals should never be address exposed. 
+ assert(varDsc->IsAddressExposed()); + noway_assert(varDsc->TypeGet() == TYP_SIMD); + + // mov z9.b, #poisonVal + GetEmitter()->emitIns_R_I(INS_sve_mov, EA_SCALABLE, REG_SCRATCH_V, (ssize_t)poisonVal, INS_OPTS_SCALABLE_B); + // str z9, [x19, $index MUL VL] + GetEmitter()->emitIns_S_R(INS_sve_str, EA_SCALABLE, REG_SCRATCH_V, varNum, 0); +} + #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 699e1b103fe1d3..2432c297efc06d 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -4917,6 +4917,54 @@ void CodeGen::genUnknownSizeFrame() GetEmitter()->emitIns_R_R_R_R(INS_msub, EA_8BYTE, REG_SP, rsvd, REG_SCRATCH, REG_SP); } } + +//---------------------------------------------------------------------------- +// +// genZeroInitializeUnknownSizeFrame: Zero-initialize the UnknownSizeFrame stack space. +// +// Remarks: +// This function emits code that assumes the state of sp has not been modified since +// establishing the UnknownSizeFrame. sp must point to the end of the UnknownSizeFrame. +// +void CodeGen::genZeroInitializeUnknownSizeFrame() +{ + assert(m_compiler->compUsesUnknownSizeFrame); + + unsigned vectorCount = m_compiler->unkSizeFrame.FrameSizeInVectors(); + + assert(vectorCount > 0); + + // z9 <== {0, 0, ...} + GetEmitter()->emitIns_R_I(INS_sve_mov, EA_SCALABLE, REG_SCRATCH_V, 0, INS_OPTS_SCALABLE_B); + + // For small vector counts, emit unrolled loop of vector stores. + // Unrolling to a maximum of 5 stores optimizes for code size rather than performance. + // TODO-SVE: Does unrolling further improve performance? 
+ if (vectorCount <= 5) + { + for (unsigned i = 0; i < vectorCount; i++) + { + // str z9, [sp, #i MUL VL] + GetEmitter()->emitIns_R_R_I(INS_sve_str, EA_SCALABLE, REG_SCRATCH_V, REG_SP, i); + } + } + else + { + // $cursor <== x19 + inst_Mov(TYP_BYREF, REG_SCRATCH, REG_UNKBASE, false); + BasicBlock* loop = genCreateTempLabel(); + // loop: + genDefineInlineTempLabel(loop); + // addvl $cursor, $cursor, #-1 + GetEmitter()->emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, REG_SCRATCH, REG_SCRATCH, -1); + // str z9, [$cursor] + GetEmitter()->emitIns_R_R(INS_sve_str, EA_SCALABLE, REG_SCRATCH_V, REG_SCRATCH); + // cmp sp, $cursor + GetEmitter()->emitIns_R_R(INS_cmp, EA_8BYTE, REG_SP, REG_SCRATCH, INS_OPTS_UXTX); + // b.ne loop + GetEmitter()->emitIns_J(INS_bne, loop); + } +} #endif /***************************************************************************** diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 790bf2b96a9d8e..30df7edf8d78ca 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -8318,6 +8318,14 @@ void CodeGen::genPoisonFrame(regMaskTP regLiveIn) assert(varDsc->lvOnFrame); +#ifdef TARGET_ARM64 + if (m_compiler->lvaIsUnknownSizeLocal(varNum)) + { + genPoisonUnknownSizeVariable(varNum, (char)poisonVal); + continue; + } +#endif + unsigned int size = m_compiler->lvaLclStackHomeSize(varNum); if ((size / TARGET_POINTER_SIZE) > 16) { diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index b93ca108bb29b4..bdefc8aa3da8db 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -468,14 +468,24 @@ void CodeGen::genCodeForBlock(BasicBlock* block) } #endif -#ifndef TARGET_WASM // TODO-WASM: enable genPoisonFrame - // Emit poisoning into the init BB that comes right after prolog. - // We cannot emit this code in the prolog as it might make the prolog too large. 
- if (m_compiler->compShouldPoisonFrame() && block->IsFirst()) + // Emit any code that needs to occur straight after the prolog, but does not want + // to be part of the prolog itself. + if (block->IsFirst()) { - genPoisonFrame(newLiveRegSet); - } +#ifdef TARGET_ARM64 + if (m_compiler->compUsesUnknownSizeFrame) + { + genZeroInitializeUnknownSizeFrame(); + } +#endif + +#ifndef TARGET_WASM // TODO-WASM: enable genPoisonFrame + if (m_compiler->compShouldPoisonFrame()) + { + genPoisonFrame(newLiveRegSet); + } #endif // !TARGET_WASM + } // Traverse the block in linear order, generating code for each node as we // as we encounter it. diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 25add148df4844..ab42a7f0e4a198 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -429,7 +429,7 @@ void emitter::emitInsSanityCheck(instrDesc* id) case IF_DR_2A: // DR_2A X..........mmmmm ......nnnnn..... Rn Rm assert(isValidGeneralDatasize(id->idOpSize())); - assert(isGeneralRegister(id->idReg1())); + assert(isGeneralRegisterOrZR(id->idReg1())); assert(isGeneralRegister(id->idReg2())); break; @@ -2497,13 +2497,6 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) return (imm >= -256) && (imm <= 255); } -// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction -/*static*/ bool emitter::emitIns_valid_imm_for_scaled_sve_ldst_offset(INT64 imm) -{ - // TODO-SVE: This assumes 128bit SVE. - return ((imm % 16) == 0 && (imm / 16) <= 255 && (imm / 16) >= -256); -} - // true if this 'imm' can be encoded as the offset in a ldr/str instruction /*static*/ bool emitter::emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr attr) { @@ -8240,8 +8233,23 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va case INS_lea: // We shouldn't be materializing the address of a mask. 
assert(m_compiler->lvaGetActualType(varx) != TYP_MASK); - // addvl reg1, x19, #imm - emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, reg1, REG_UNKBASE, imm); + if (isValidSimm<6>(imm)) + { + // addvl reg1, x19, #imm + emitIns_R_R_I(INS_sve_addvl, EA_8BYTE, reg1, REG_UNKBASE, imm); + } + else + { + // Cannot encode immediate, generate `addr = x19 + imm * VL`. + // + // set reg1 = imm + // rdvl rsvd, #1 + // madd reg1, reg1, rsvd, x19 + regNumber rsvd = codeGen->rsGetRsvdReg(); + codeGen->instGen_Set_Reg_To_Imm(EA_8BYTE, reg1, imm); + emitIns_R_I(INS_sve_rdvl, EA_8BYTE, rsvd, 1); + emitIns_R_R_R_R(INS_madd, EA_8BYTE, reg1, reg1, rsvd, REG_UNKBASE); + } return; case INS_sve_ldr: diff --git a/src/coreclr/jit/emitarm64sve.cpp b/src/coreclr/jit/emitarm64sve.cpp index 2246bfcb29bcfe..95f4a2932be8cb 100644 --- a/src/coreclr/jit/emitarm64sve.cpp +++ b/src/coreclr/jit/emitarm64sve.cpp @@ -2741,27 +2741,12 @@ void emitter::emitInsSve_R_R_I(instruction ins, case INS_sve_ldr: assert(insOptsNone(opt)); assert(isScalableVectorSize(size)); - assert(isGeneralRegister(reg2)); // nnnnn + assert(isGeneralRegisterOrSP(reg2)); // nnnnn assert(insScalableOptsNone(sopt)); - - // imm is the number of bytes to offset by. The instruction requires a multiple of the - // vector length ([#imm mul vl]). If it doesn't fit then stash the resulting address - // into a register. - if (emitIns_valid_imm_for_scaled_sve_ldst_offset(imm)) - { - // TODO-SVE: This assumes 128bit SVE. 
- imm = imm / 16; - } - else - { - regNumber rsvdReg = codeGen->rsGetRsvdReg(); - codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); - reg2 = rsvdReg; - imm = 0; - } - assert(isValidSimm<9>(imm)); + reg2 = encodingSPtoZR(reg2); + if (isVectorRegister(reg1)) { fmt = IF_SVE_IE_2A; @@ -2776,27 +2761,12 @@ void emitter::emitInsSve_R_R_I(instruction ins, case INS_sve_str: assert(insOptsNone(opt)); assert(isScalableVectorSize(size)); - assert(isGeneralRegister(reg2)); // nnnnn + assert(isGeneralRegisterOrSP(reg2)); // nnnnn assert(insScalableOptsNone(sopt)); - - // imm is the number of bytes to offset by. The instruction requires a multiple of the - // vector length ([#imm mul vl]). If it doesn't fit then stash the resulting address - // into a register. - if (emitIns_valid_imm_for_scaled_sve_ldst_offset(imm)) - { - // TODO-SVE: This assumes 128bit SVE. - imm = imm / 16; - } - else - { - regNumber rsvdReg = codeGen->rsGetRsvdReg(); - codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); - reg2 = rsvdReg; - imm = 0; - } - assert(isValidSimm<9>(imm)); + reg2 = encodingSPtoZR(reg2); + if (isVectorRegister(reg1)) { fmt = IF_SVE_JH_2A; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3ff89b315e0c63..a15b502e959f9b 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -2488,6 +2488,18 @@ void LinearScan::buildIntervals() currentLoc += 2; } +#ifdef TARGET_ARM64 + if (m_compiler->compUsesUnknownSizeFrame && (block == m_compiler->fgFirstBB)) + { + regMaskTP killed; + killed.AddRegNumInMask(REG_SCRATCH); + killed.AddRegNumInMask(REG_SCRATCH_V); + + addKillForRegs(killed, currentLoc + 1); + currentLoc += 2; + } +#endif + LIR::Range& blockRange = LIR::AsRange(block); for (GenTree* node : blockRange) { diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 34ebcf5afebcce..121e030a4ef432 100644 --- a/src/coreclr/jit/targetarm64.h +++ 
b/src/coreclr/jit/targetarm64.h @@ -138,8 +138,9 @@ #define REG_SHIFT REG_NA #define RBM_SHIFT RBM_ALLINT -// This is a general scratch register that does not conflict with the argument registers +// Scratch registers that do not conflict with the argument registers, usually for use in function prolog #define REG_SCRATCH REG_R9 +#define REG_SCRATCH_V REG_V9 // This is a general register that can be optionally reserved for other purposes during codegen #define REG_OPT_RSVD REG_IP1