diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 33f3570deae2e..0b58393f1f304 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -782,17 +782,18 @@ class emitter
         unsigned _idNoGC : 1;       // Some helpers don't get recorded in GC tables
 
 #ifdef TARGET_ARM64
-        opSize   _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
-        insOpts  _idInsOpt : 6; // options for instructions
-        unsigned _idLclVar : 1; // access a local on stack
+        opSize   _idOpSize : 3;    // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
+        insOpts  _idInsOpt : 6;    // options for instructions
+        unsigned _idLclVar : 1;    // access a local on stack
+        unsigned _idLclVarPair : 1 // carries information for 2 GC lcl vars.
 #endif
 
 #ifdef TARGET_LOONGARCH64
-        // TODO-LoongArch64: maybe delete on future.
-        opSize  _idOpSize : 3;  // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
-        insOpts _idInsOpt : 6;  // loongarch options for special: placeholders. e.g emitIns_R_C, also identifying the
-                                // accessing a local on stack.
-        unsigned _idLclVar : 1; // access a local on stack.
+            // TODO-LoongArch64: maybe delete on future.
+            opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
+        insOpts    _idInsOpt : 6; // loongarch options for special: placeholders. e.g emitIns_R_C, also identifying the
+                                  // accessing a local on stack.
+        unsigned _idLclVar : 1;   // access a local on stack.
 #endif
 
 #ifdef TARGET_RISCV64
@@ -815,7 +816,7 @@ class emitter
         // x86:   46 bits
         // amd64: 46 bits
         // arm:   48 bits
-        // arm64: 49 bits
+        // arm64: 50 bits
         // loongarch64: 46 bits
 
         //
@@ -827,7 +828,7 @@ class emitter
 #if defined(TARGET_ARM)
 #define ID_EXTRA_BITFIELD_BITS (16)
 #elif defined(TARGET_ARM64)
-#define ID_EXTRA_BITFIELD_BITS (17)
+#define ID_EXTRA_BITFIELD_BITS (18)
 #elif defined(TARGET_XARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 #define ID_EXTRA_BITFIELD_BITS (14)
 #else
@@ -867,7 +868,7 @@ class emitter
         // x86:   52/48 bits
         // amd64: 53/48 bits
         // arm:   54/50 bits
-        // arm64: 56/51 bits
+        // arm64: 57/52 bits
         // loongarch64: 53/48 bits
         CLANG_FORMAT_COMMENT_ANCHOR;
 
@@ -885,7 +886,7 @@ class emitter
         // x86:   12/16 bits
         // amd64: 11/16 bits
         // arm:   10/14 bits
-        // arm64: 8/13 bits
+        // arm64: 7/12 bits
         // loongarch64: 11/16 bits
 
         unsigned _idSmallCns : ID_BIT_SMALL_CNS;
@@ -1432,6 +1433,16 @@ class emitter
         {
             _idLclVar = 1;
         }
+#ifdef TARGET_ARM64
+        bool idIsLclVarPair() const
+        {
+            return _idLclVarPair != 0;
+        }
+        void idSetIsLclVarPair()
+        {
+            _idLclVarPair = 1;
+        }
+#endif // TARGET_ARM64
 #endif // TARGET_ARMARCH
 
 #if defined(TARGET_ARM)
@@ -1819,6 +1830,22 @@ class emitter
 
 #endif // TARGET_XARCH
 
+#ifdef TARGET_ARM64
+    struct instrDescLclVarPair : instrDesc // contains 2 gc vars to be tracked
+    {
+        instrDescLclVarPair() = delete;
+
+        emitLclVarAddr iiaLclVar2;
+    };
+
+    struct instrDescLclVarPairCns : instrDescCns // contains 2 gc vars to be tracked, with large cons
+    {
+        instrDescLclVarPairCns() = delete;
+
+        emitLclVarAddr iiaLclVar2;
+    };
+#endif
+
     struct instrDescCGCA : instrDesc // call with ...
     {
         instrDescCGCA() = delete;
@@ -2600,7 +2627,26 @@ class emitter
 #endif // EMITTER_STATS
         return (instrDescLbl*)emitAllocAnyInstr(sizeof(instrDescLbl), EA_4BYTE);
     }
-#endif // !TARGET_ARM64
+#endif // TARGET_ARM64
+
+#if defined(TARGET_ARM64)
+    instrDescLclVarPair* emitAllocInstrLclVarPair(emitAttr attr)
+    {
+        instrDescLclVarPair* result = (instrDescLclVarPair*)emitAllocAnyInstr(sizeof(instrDescLclVarPair), attr);
+        result->idSetIsLclVarPair();
+        return result;
+    }
+
+    instrDescLclVarPairCns* emitAllocInstrLclVarPairCns(emitAttr attr, cnsval_size_t cns)
+    {
+        instrDescLclVarPairCns* result =
+            (instrDescLclVarPairCns*)emitAllocAnyInstr(sizeof(instrDescLclVarPairCns), attr);
+        result->idSetIsLargeCns();
+        result->idSetIsLclVarPair();
+        result->idcCnsVal = cns;
+        return result;
+    }
+#endif // TARGET_ARM64
 
     instrDescCns* emitAllocInstrCns(emitAttr attr)
     {
@@ -2686,6 +2732,8 @@ class emitter
 
 #if !defined(TARGET_ARM64)
     instrDescLbl* emitNewInstrLbl();
+#else
+    instrDesc* emitNewInstrLclVarPair(emitAttr attr, cnsval_ssize_t cns);
 #endif // !TARGET_ARM64
 
     static const BYTE emitFmtToOps[];
@@ -3249,6 +3297,36 @@ inline emitter::instrDescLbl* emitter::emitNewInstrLbl()
 {
     return emitAllocInstrLbl();
 }
+#else
+inline emitter::instrDesc* emitter::emitNewInstrLclVarPair(emitAttr attr, cnsval_ssize_t cns)
+{
+#if EMITTER_STATS
+    emitTotalIDescCnt++;
+    emitTotalIDescCnsCnt++;
+#endif // EMITTER_STATS
+
+    if (instrDesc::fitsInSmallCns(cns))
+    {
+        instrDescLclVarPair* id = emitAllocInstrLclVarPair(attr);
+        id->idSmallCns(cns);
+#if EMITTER_STATS
+        emitSmallCnsCnt++;
+        if ((cns - ID_MIN_SMALL_CNS) >= (SMALL_CNS_TSZ - 1))
+            emitSmallCns[SMALL_CNS_TSZ - 1]++;
+        else
+            emitSmallCns[cns - ID_MIN_SMALL_CNS]++;
+#endif
+        return id;
+    }
+    else
+    {
+        instrDescLclVarPairCns* id = emitAllocInstrLclVarPairCns(attr, cns);
+#if EMITTER_STATS
+        emitLargeCnsCnt++;
+#endif
+        return id;
+    }
+}
 #endif // !TARGET_ARM64
 
 inline emitter::instrDesc* emitter::emitNewInstrDsp(emitAttr attr, target_ssize_t dsp)
@@ -3329,10 +3407,22 @@ inline size_t emitter::emitGetInstrDescSize(const instrDesc* id)
     }
     else if (id->idIsLargeCns())
     {
+#ifdef TARGET_ARM64
+        if (id->idIsLclVarPair())
+        {
+            return sizeof(instrDescLclVarPairCns);
+        }
+#endif
         return sizeof(instrDescCns);
     }
     else
     {
+#ifdef TARGET_ARM64
+        if (id->idIsLclVarPair())
+        {
+            return sizeof(instrDescLclVarPair);
+        }
+#endif
         return sizeof(instrDesc);
     }
 }
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 657f9645de04c..b537146be5b40 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -128,15 +128,29 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id)
 
     if (id->idIsLargeCns())
     {
-        if (id->idIsLargeDsp())
+        if (id->idIsLclVarPair())
+        {
+            return sizeof(instrDescLclVarPairCns);
+        }
+        else if (id->idIsLargeDsp())
+        {
             return sizeof(instrDescCnsDsp);
+        }
         else
+        {
             return sizeof(instrDescCns);
+        }
     }
     else
     {
-        if (id->idIsLargeDsp())
+        if (id->idIsLclVarPair())
+        {
+            return sizeof(instrDescLclVarPair);
+        }
+        else if (id->idIsLargeDsp())
+        {
             return sizeof(instrDescDsp);
+        }
         else
         {
 #if FEATURE_LOOP_ALIGN
@@ -2399,18 +2413,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
 /*static*/ unsigned emitter::NaturalScale_helper(emitAttr size)
 {
     assert(size == EA_1BYTE || size == EA_2BYTE || size == EA_4BYTE || size == EA_8BYTE || size == EA_16BYTE);
-
-    unsigned result = 0;
-    unsigned utemp  = (unsigned)size;
-
-    // Compute log base 2 of utemp (aka 'size')
-    while (utemp > 1)
-    {
-        result++;
-        utemp >>= 1;
-    }
-
-    return result;
+    return BitOperations::Log2((unsigned)size);
 }
 
 /************************************************************************
@@ -5686,7 +5689,8 @@ void emitter::emitIns_R_R_I(
         }
 
         // Try to optimize a load/store with an alternative instruction.
-        if (isLdrStr && emitComp->opts.OptimizationEnabled() && OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt))
+        if (isLdrStr && emitComp->opts.OptimizationEnabled() &&
+            OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, false, -1, -1 DEBUG_ARG(false)))
         {
             return;
         }
@@ -6493,6 +6497,132 @@ void emitter::emitIns_R_R_R(
     appendToCurIG(id);
 }
 
+//-----------------------------------------------------------------------------------
+// emitIns_R_R_R_I_LdStPair: Add an instruction storing 2 registers into a memory
+//                     (pointed by reg3) and the offset (immediate).
+//
+// Arguments:
+//     ins      - The instruction code
+//     attr     - The emit attribute for register 1
+//     attr2    - The emit attribute for register 2
+//     reg1     - Register 1
+//     reg2     - Register 2
+//     reg3     - Register 3
+//     imm      - Immediate offset, prior to scaling by operand size
+//     varx1    - LclVar number 1
+//     varx2    - LclVar number 2
+//     offs1    - Memory offset of lclvar number 1
+//     offs2    - Memory offset of lclvar number 2
+//
+void emitter::emitIns_R_R_R_I_LdStPair(instruction ins,
+                                       emitAttr    attr,
+                                       emitAttr    attr2,
+                                       regNumber   reg1,
+                                       regNumber   reg2,
+                                       regNumber   reg3,
+                                       ssize_t     imm,
+                                       int         varx1,
+                                       int         varx2,
+                                       int         offs1,
+                                       int         offs2)
+{
+    assert((ins == INS_stp) || (ins == INS_ldp));
+    emitAttr  size  = EA_SIZE(attr);
+    insFormat fmt   = IF_NONE;
+    unsigned  scale = 0;
+
+    // Is the target a vector register?
+    if (isVectorRegister(reg1))
+    {
+        assert(isValidVectorLSPDatasize(size));
+        assert(isVectorRegister(reg2));
+
+        scale = NaturalScale_helper(size);
+        assert((scale >= 2) && (scale <= 4));
+    }
+    else
+    {
+        assert(isValidGeneralDatasize(size));
+        assert(isGeneralRegisterOrZR(reg2));
+        scale = (size == EA_8BYTE) ? 3 : 2;
+    }
+
+    reg3 = encodingSPtoZR(reg3);
+
+    fmt          = IF_LS_3C;
+    ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+    if (imm == 0)
+    {
+        fmt = IF_LS_3B;
+    }
+    else
+    {
+        if ((imm & mask) == 0)
+        {
+            imm >>= scale; // The immediate is scaled by the size of the ld/st
+        }
+        else
+        {
+            // Unlike emitIns_S_S_R_R(), we would never come here when
+            // (imm & mask) != 0.
+            unreached();
+        }
+    }
+
+    bool validVar1 = varx1 != -1;
+    bool validVar2 = varx2 != -1;
+
+    instrDesc* id;
+
+    if (validVar1 && validVar2)
+    {
+        id = emitNewInstrLclVarPair(attr, imm);
+        id->idAddr()->iiaLclVar.initLclVarAddr(varx1, offs1);
+        id->idSetIsLclVar();
+
+        emitGetLclVarPairLclVar2(id)->initLclVarAddr(varx2, offs2);
+    }
+    else
+    {
+        id = emitNewInstrCns(attr, imm);
+        if (validVar1)
+        {
+            id->idAddr()->iiaLclVar.initLclVarAddr(varx1, offs1);
+            id->idSetIsLclVar();
+        }
+        if (validVar2)
+        {
+            id->idAddr()->iiaLclVar.initLclVarAddr(varx2, offs2);
+            id->idSetIsLclVar();
+        }
+    }
+
+    id->idIns(ins);
+    id->idInsFmt(fmt);
+    id->idInsOpt(INS_OPTS_NONE);
+
+    id->idReg1(reg1);
+    id->idReg2(reg2);
+    id->idReg3(reg3);
+
+    // Record the attribute for the second register in the pair
+    if (EA_IS_GCREF(attr2))
+    {
+        id->idGCrefReg2(GCT_GCREF);
+    }
+    else if (EA_IS_BYREF(attr2))
+    {
+        id->idGCrefReg2(GCT_BYREF);
+    }
+    else
+    {
+        id->idGCrefReg2(GCT_NONE);
+    }
+
+    dispIns(id);
+    appendToCurIG(id);
+}
+
 /*****************************************************************************
  *
  *  Add an instruction referencing three registers and a constant.
@@ -7610,8 +7740,9 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
     disp = base + offs;
     assert((scale >= 0) && (scale <= 4));
 
-    regNumber reg2 = FPbased ? REG_FPBASE : REG_SPBASE;
-    reg2           = encodingSPtoZR(reg2);
+    bool      useRegForImm = false;
+    regNumber reg2         = FPbased ? REG_FPBASE : REG_SPBASE;
+    reg2                   = encodingSPtoZR(reg2);
 
     if (ins == INS_lea)
     {
@@ -7639,9 +7770,8 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
     }
     else
     {
-        bool    useRegForImm = false;
-        ssize_t mask         = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
-        imm                  = disp;
+        ssize_t mask = (1 << scale) - 1; // the mask of low bits that must be zero to encode the immediate
+        imm          = disp;
         if (imm == 0)
         {
             fmt = IF_LS_2A;
@@ -7683,7 +7813,7 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
 
     // Try to optimize a load/store with an alternative instruction.
     if (isLdrStr && emitComp->opts.OptimizationEnabled() &&
-        OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs))
+        OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs DEBUG_ARG(useRegForImm)))
     {
         return;
     }
@@ -7917,7 +8047,7 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
 
     // Try to optimize a store with an alternative instruction.
     if (isStr && emitComp->opts.OptimizationEnabled() &&
-        OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs))
+        OptimizeLdrStr(ins, attr, reg1, reg2, imm, size, fmt, true, varx, offs DEBUG_ARG(useRegForImm)))
     {
         return;
     }
@@ -11731,31 +11861,81 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 vt              = tmpDsc->tdTempType();
             }
             if (vt == TYP_REF || vt == TYP_BYREF)
+            {
                 emitGCvarDeadUpd(adr + ofs, dst DEBUG_ARG(varNum));
+            }
         }
         if (emitInsWritesToLclVarStackLocPair(id))
         {
-            unsigned ofs2 = ofs + TARGET_POINTER_SIZE;
+            int      varNum2 = varNum;
+            int      adr2    = adr;
+            unsigned ofs2    = ofs;
+            unsigned ofs2Dist;
+
+            if (id->idIsLclVarPair())
+            {
+                bool FPbased2;
+
+                emitLclVarAddr* lclVarAddr2 = emitGetLclVarPairLclVar2(id);
+                varNum2                     = lclVarAddr2->lvaVarNum();
+                ofs2                        = lclVarAddr2->lvaOffset();
+
+                // If there are 2 GC vars in this instrDesc, get the 2nd variable
+                // that should be tracked.
+                adr2     = emitComp->lvaFrameAddress(varNum2, &FPbased2);
+                ofs2Dist = EA_SIZE_IN_BYTES(size);
+#ifdef DEBUG
+                assert(FPbased == FPbased2);
+                if (FPbased)
+                {
+                    assert(id->idReg3() == REG_FP);
+                }
+                else
+                {
+                    assert(id->idReg3() == REG_SP);
+                }
+                assert(varNum2 != -1);
+#endif // DEBUG
+            }
+            else
+            {
+                ofs2Dist = TARGET_POINTER_SIZE;
+                ofs2 += ofs2Dist;
+            }
+
+            ofs2 = AlignDown(ofs2, ofs2Dist);
+
             if (id->idGCrefReg2() != GCT_NONE)
             {
-                emitGCvarLiveUpd(adr + ofs2, varNum, id->idGCrefReg2(), dst DEBUG_ARG(varNum));
+#ifdef DEBUG
+                if (id->idGCref() != GCT_NONE)
+                {
+                    // If 1st register was a gc-var, then make sure the offset
+                    // are correctly set for the 2nd register that is holding
+                    // another gc-var.
+                    assert((adr + ofs + ofs2Dist) == (adr2 + ofs2));
+                }
+#endif
+                emitGCvarLiveUpd(adr2 + ofs2, varNum2, id->idGCrefReg2(), dst DEBUG_ARG(varNum2));
             }
             else
             {
                 // If the type of the local is a gc ref type, update the liveness.
                 var_types vt;
-                if (varNum >= 0)
+                if (varNum2 >= 0)
                 {
                     // "Regular" (non-spill-temp) local.
-                    vt = var_types(emitComp->lvaTable[varNum].lvType);
+                    vt = var_types(emitComp->lvaTable[varNum2].lvType);
                 }
                 else
                 {
-                    TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum);
+                    TempDsc* tmpDsc = codeGen->regSet.tmpFindNum(varNum2);
                     vt              = tmpDsc->tdTempType();
                 }
                 if (vt == TYP_REF || vt == TYP_BYREF)
-                    emitGCvarDeadUpd(adr + ofs2, dst DEBUG_ARG(varNum));
+                {
+                    emitGCvarDeadUpd(adr2 + ofs2, dst DEBUG_ARG(varNum2));
+                }
             }
         }
     }
@@ -16180,6 +16360,71 @@ bool emitter::IsRedundantLdStr(
     return false;
 }
 
+//-----------------------------------------------------------------------------------
+// OptimizeLdrStr: Try to optimize "ldr" or "str" instruction with an alternative
+//                 instruction.
+//
+// Arguments:
+//     ins      - The instruction code
+//     reg1Attr - The emit attribute for register 1
+//     reg1     - Register 1
+//     reg2     - Register 2
+//     imm      - Immediate offset, prior to scaling by operand size
+//     size     - Operand size
+//     fmt               - Instruction format
+//     localVar          - If current instruction has local var
+//     varx     - LclVarNum if this instruction contains local variable
+//     offs     - Stack offset where it is accessed (loaded / stored).
+//     useRsvdReg     - If this instruction needs reserved register.
+//
+// Return Value:
+//    "true" if the previous instruction has been overwritten.
+//
+bool emitter::OptimizeLdrStr(instruction ins,
+                             emitAttr    reg1Attr,
+                             regNumber   reg1,
+                             regNumber   reg2,
+                             ssize_t     imm,
+                             emitAttr    size,
+                             insFormat   fmt,
+                             bool        localVar,
+                             int         varx,
+                             int offs DEBUG_ARG(bool useRsvdReg))
+{
+    assert(ins == INS_ldr || ins == INS_str);
+
+    if (!emitCanPeepholeLastIns() || (emitLastIns->idIns() != ins))
+    {
+        return false;
+    }
+
+    // Is the ldr/str even necessary?
+    if (IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt))
+    {
+        return true;
+    }
+
+    // Register 2 needs conversion to unencoded value for following optimisation checks.
+    reg2 = encodingZRtoSP(reg2);
+
+    // If the previous instruction was a matching load/store, then try to replace it instead of emitting.
+    //
+    if (ReplaceLdrStrWithPairInstr(ins, reg1Attr, reg1, reg2, imm, size, fmt, localVar, varx, offs))
+    {
+        assert(!useRsvdReg);
+        return true;
+    }
+
+    // If we have a second LDR instruction from the same source, then try to replace it with a MOV.
+    if (IsOptimizableLdrToMov(ins, reg1, reg2, imm, size, fmt))
+    {
+        emitIns_Mov(INS_mov, reg1Attr, reg1, emitLastIns->idReg1(), true);
+        return true;
+    }
+
+    return false;
+}
+
 //-----------------------------------------------------------------------------------
 // ReplaceLdrStrWithPairInstr: Potentially, overwrite a previously-emitted "ldr" or "str"
 //                             instruction with an "ldp" or "stp" instruction.
@@ -16191,66 +16436,92 @@ bool emitter::IsRedundantLdStr(
 //     reg2     - Register 2
 //     imm      - Immediate offset, prior to scaling by operand size
 //     size     - Operand size
-//     fmt      - Instruction format
+//     fmt               - Instruction format
+//     localVar          - If current instruction has local var
+//     currLclVarNum     - LclVarNum if this instruction contains local variable
+//     offs     - Stack offset where it is accessed (loaded / stored).
 //
 // Return Value:
 //    "true" if the previous instruction has been overwritten.
 //
-bool emitter::ReplaceLdrStrWithPairInstr(
-    instruction ins, emitAttr reg1Attr, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt)
+bool emitter::ReplaceLdrStrWithPairInstr(instruction ins,
+                                         emitAttr    reg1Attr,
+                                         regNumber   reg1,
+                                         regNumber   reg2,
+                                         ssize_t     imm,
+                                         emitAttr    size,
+                                         insFormat   fmt,
+                                         bool        isCurrLclVar,
+                                         int         varx,
+                                         int         offs)
 {
     RegisterOrder optimizationOrder = IsOptimizableLdrStrWithPair(ins, reg1, reg2, imm, size, fmt);
 
-    if (optimizationOrder != eRO_none)
+    if (optimizationOrder == eRO_none)
     {
-        regNumber oldReg1 = emitLastIns->idReg1();
+        return false;
+    }
 
-        ssize_t     oldImm = emitGetInsSC(emitLastIns);
-        instruction optIns = (ins == INS_ldr) ? INS_ldp : INS_stp;
+    regNumber prevReg1 = emitLastIns->idReg1();
 
-        emitAttr oldReg1Attr;
-        switch (emitLastIns->idGCref())
-        {
-            case GCT_GCREF:
-                oldReg1Attr = EA_GCREF;
-                break;
-            case GCT_BYREF:
-                oldReg1Attr = EA_BYREF;
-                break;
-            default:
-                oldReg1Attr = emitLastIns->idOpSize();
-                break;
-        }
+    ssize_t     prevImm = emitGetInsSC(emitLastIns);
+    instruction optIns  = (ins == INS_ldr) ? INS_ldp : INS_stp;
 
-        // Remove the last instruction written.
-        emitRemoveLastInstruction();
+    emitAttr prevReg1Attr;
+    ssize_t  prevImmSize   = prevImm * size;
+    ssize_t  newImmSize    = imm * size;
+    bool     isLastLclVar  = emitLastIns->idIsLclVar();
+    int      prevOffset    = -1;
+    int      prevLclVarNum = -1;
 
-        // Combine two 32 bit stores of value zero into one 64 bit store
-        if (ins == INS_str && reg1 == REG_ZR && oldReg1 == REG_ZR && size == EA_4BYTE)
-        {
+    if (emitLastIns->idIsLclVar())
+    {
+        prevOffset    = emitLastIns->idAddr()->iiaLclVar.lvaOffset();
+        prevLclVarNum = emitLastIns->idAddr()->iiaLclVar.lvaVarNum();
+    }
 
-            // The first register is at the lower offset for the ascending order
-            ssize_t offset = (optimizationOrder == eRO_ascending ? oldImm : imm) * size;
-            emitIns_R_R_I(INS_str, EA_8BYTE, REG_ZR, reg2, offset, INS_OPTS_NONE);
-            return true;
-        }
+    if (!isCurrLclVar)
+    {
+        assert((varx == -1) && (offs == -1));
+    }
 
-        // Emit the new instruction. Make sure to scale the immediate value by the operand size.
-        if (optimizationOrder == eRO_ascending)
-        {
-            // The FIRST register is at the lower offset
-            emitIns_R_R_R_I(optIns, oldReg1Attr, oldReg1, reg1, reg2, oldImm * size, INS_OPTS_NONE, reg1Attr);
-        }
-        else
-        {
-            // The SECOND register is at the lower offset
-            emitIns_R_R_R_I(optIns, reg1Attr, reg1, oldReg1, reg2, imm * size, INS_OPTS_NONE, oldReg1Attr);
-        }
+    switch (emitLastIns->idGCref())
+    {
+        case GCT_GCREF:
+            prevReg1Attr = EA_GCREF;
+            break;
+        case GCT_BYREF:
+            prevReg1Attr = EA_BYREF;
+            break;
+        default:
+            prevReg1Attr = emitLastIns->idOpSize();
+            break;
+    }
+
+    // Remove the last instruction written.
+    emitRemoveLastInstruction();
 
+    // Combine two 32 bit stores of value zero into one 64 bit store
+    if ((ins == INS_str) && (reg1 == REG_ZR) && (prevReg1 == REG_ZR) && (size == EA_4BYTE))
+    {
+        // The first register is at the lower offset for the ascending order
+        ssize_t offset = (optimizationOrder == eRO_ascending ? prevImm : imm) * size;
+        emitIns_R_R_I(INS_str, EA_8BYTE, REG_ZR, reg2, offset, INS_OPTS_NONE);
         return true;
     }
 
-    return false;
+    if (optimizationOrder == eRO_ascending)
+    {
+        emitIns_R_R_R_I_LdStPair(optIns, prevReg1Attr, reg1Attr, prevReg1, reg1, reg2, prevImmSize, prevLclVarNum, varx,
+                                 prevOffset, offs);
+    }
+    else
+    {
+        emitIns_R_R_R_I_LdStPair(optIns, reg1Attr, prevReg1Attr, reg1, prevReg1, reg2, newImmSize, varx, prevLclVarNum,
+                                 offs, prevOffset);
+    }
+
+    return true;
 }
 
 //-----------------------------------------------------------------------------------
@@ -16308,7 +16579,7 @@ emitter::RegisterOrder emitter::IsOptimizableLdrStrWithPair(
     // For LDR/ STR, there are 9 bits, so we need to limit the range explicitly in software.
     if ((imm < -64) || (imm > 63) || (prevImm < -64) || (prevImm > 63))
     {
-        // Then one or more of the immediate values is out of range, so we cannot optimise.
+        // Then one or more of the immediate values is out of range, so we cannot optimize.
         return eRO_none;
     }
 
diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h
index ee0e029c84b68..240f4e03dd585 100644
--- a/src/coreclr/jit/emitarm64.h
+++ b/src/coreclr/jit/emitarm64.h
@@ -128,66 +128,39 @@ bool IsRedundantMov(instruction ins, emitAttr size, regNumber dst, regNumber src
 bool IsRedundantLdStr(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
 RegisterOrder IsOptimizableLdrStrWithPair(
     instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
-bool ReplaceLdrStrWithPairInstr(
-    instruction ins, emitAttr reg1Attr, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
+bool ReplaceLdrStrWithPairInstr(instruction ins,
+                                emitAttr    reg1Attr,
+                                regNumber   reg1,
+                                regNumber   reg2,
+                                ssize_t     imm,
+                                emitAttr    size,
+                                insFormat   fmt,
+                                bool        localVar = false,
+                                int         varx     = -1,
+                                int         offs     = -1);
 bool IsOptimizableLdrToMov(instruction ins, regNumber reg1, regNumber reg2, ssize_t imm, emitAttr size, insFormat fmt);
-
-// Try to optimize a Ldr or Str with an alternative instruction.
-inline bool OptimizeLdrStr(instruction ins,
-                           emitAttr    reg1Attr,
-                           regNumber   reg1,
-                           regNumber   reg2,
-                           ssize_t     imm,
-                           emitAttr    size,
-                           insFormat   fmt,
-                           bool        localVar = false,
-                           int         varx     = 0,
-                           int         offs     = 0)
+FORCEINLINE bool OptimizeLdrStr(instruction ins,
+                                emitAttr    reg1Attr,
+                                regNumber   reg1,
+                                regNumber   reg2,
+                                ssize_t     imm,
+                                emitAttr    size,
+                                insFormat   fmt,
+                                bool        localVar = false,
+                                int         varx     = -1,
+                                int offs = -1 DEBUG_ARG(bool useRsvdReg = false));
+
+emitLclVarAddr* emitGetLclVarPairLclVar2(instrDesc* id)
 {
-    assert(ins == INS_ldr || ins == INS_str);
-
-    if (!emitCanPeepholeLastIns() || (emitLastIns->idIns() != ins))
-    {
-        return false;
-    }
-
-    // Is the ldr/str even necessary?
-    if (IsRedundantLdStr(ins, reg1, reg2, imm, size, fmt))
+    assert(id->idIsLclVarPair());
+    if (id->idIsLargeCns())
     {
-        return true;
+        return &(((instrDescLclVarPairCns*)id)->iiaLclVar2);
     }
-
-    // Register 2 needs conversion to unencoded value for following optimisation checks.
-    reg2 = encodingZRtoSP(reg2);
-
-    // If the previous instruction was a matching load/store, then try to replace it instead of emitting.
-    //
-    bool canReplaceWithPair = true;
-    if (ins == INS_str)
+    else
     {
-        // For INS_str, don't do this if either instruction had a local GC variable.
-        // For INS_ldr, it is fine to perform this optimization because the output code already handles the code of
-        // updating the gc refs. We do not need offset tracking for load cases.
-        if ((localVar && EA_IS_GCREF_OR_BYREF(reg1Attr)) ||
-            (emitLastIns->idIsLclVar() && (emitLastIns->idGCref() != GCT_NONE)))
-        {
-            canReplaceWithPair = false;
-        }
+        return &(((instrDescLclVarPair*)id)->iiaLclVar2);
     }
-
-    if (canReplaceWithPair && ReplaceLdrStrWithPairInstr(ins, reg1Attr, reg1, reg2, imm, size, fmt))
-    {
-        return true;
-    }
-
-    // If we have a second LDR instruction from the same source, then try to replace it with a MOV.
-    if (IsOptimizableLdrToMov(ins, reg1, reg2, imm, size, fmt))
-    {
-        emitIns_Mov(INS_mov, reg1Attr, reg1, emitLastIns->idReg1(), true);
-        return true;
-    }
-
-    return false;
 }
 
 /************************************************************************
@@ -890,6 +863,18 @@ void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int o
 void emitIns_S_S_R_R(
     instruction ins, emitAttr attr, emitAttr attr2, regNumber ireg, regNumber ireg2, int varx, int offs);
 
+void emitIns_R_R_R_I_LdStPair(instruction ins,
+                              emitAttr    attr,
+                              emitAttr    attr2,
+                              regNumber   reg1,
+                              regNumber   reg2,
+                              regNumber   reg3,
+                              ssize_t     imm,
+                              int         varx1 = -1,
+                              int         varx2 = -1,
+                              int         offs1 = -1,
+                              int         offs2 = -1);
+
 void emitIns_R_S(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs);
 
 void emitIns_R_R_S_S(