diff --git a/llvm/lib/Target/AIE/AIECombine.td b/llvm/lib/Target/AIE/AIECombine.td
index e3eec3b38247..c99f4acda299 100644
--- a/llvm/lib/Target/AIE/AIECombine.td
+++ b/llvm/lib/Target/AIE/AIECombine.td
@@ -361,6 +361,20 @@ def combine_add_vector_elt_undef : GICombineRule <
   (apply [{ applyAddVecEltUndef(*${root}, MRI, B); }] )
 >;
 
+def combine_insert_extract_vector_elt_to_copy : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_AIE_INSERT_VECTOR_ELT):$root,
+    [{ return matchInsertExtractVectorEltToCopy(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
+>;
+
+def combine_broadcast_extract_to_copy : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_AIE_BROADCAST_VECTOR):$root,
+    [{ return matchBroadcastExtractToCopy(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
+>;
+
 def combine_load_store_split_matchdata: GIDefMatchData<"unsigned">;
 def combine_load_store_split : GICombineRule<
   (defs root:$root, combine_load_store_split_matchdata:$matchinfo),
@@ -395,7 +409,8 @@ def combine_global_load_store_increment : GICombineRule <
 
 def AIE2PostLegalizerCustomCombiner
-    : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_global_load_store_increment,
+    : GICombiner<"AIE2PostLegalizerCustomCombinerImpl", [ combine_broadcast_extract_to_copy,
+                                                          combine_global_load_store_increment,
                                                           combine_load_store_split,
                                                           ptr_add_immed_chain,
                                                           combine_load_store_increment,
@@ -404,12 +419,14 @@ def AIE2PostLegalizerCustomCombiner
                                                           combine_add_vector_elt_undef,
                                                           combine_extract_concat,
                                                           combine_unmerge_concat,
-                                                          combine_upd_to_concat,
+                                                          combine_upd_to_concat,
+                                                          combine_insert_extract_vector_elt_to_copy
 ]> {
 }
 
 def AIE2PPostLegalizerCustomCombiner
     : GICombiner<"AIE2PPostLegalizerCustomCombinerImpl", [
+        combine_broadcast_extract_to_copy,
         combine_extract_vector_assert_combine,
         combine_global_load_store_increment,
         combine_load_store_increment,
@@ -417,6 +434,6 @@ def AIE2PPostLegalizerCustomCombiner
         combine_offset_load_store_ptradd,
         combine_offset_load_store_share_ptradd,
         combine_add_vector_elt_undef,
+        combine_insert_extract_vector_elt_to_copy
 ]> {
 }
-
diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.cpp b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
index d78623c7effb..2e2c28400abc 100644
--- a/llvm/lib/Target/AIE/AIECombinerHelper.cpp
+++ b/llvm/lib/Target/AIE/AIECombinerHelper.cpp
@@ -66,6 +66,95 @@ cl::opt<bool> MemsetOptimizations(
     "aie-optimize-memsets", cl::init(true), cl::Hidden,
     cl::desc("Apply memset optimizations (peeling/align/etc.)."));
 
+namespace {
+
+bool isGenericExtractOpcode(unsigned Opc, const AIEBaseInstrInfo &TII) {
+  // Check if it's either a SEXT or a ZEXT extract.
+  const unsigned ExtractSextOpc = TII.getGenericExtractVectorEltOpcode(true);
+  if (Opc == ExtractSextOpc) {
+    return true;
+  }
+  const unsigned ExtractZextOpc = TII.getGenericExtractVectorEltOpcode(false);
+  return Opc == ExtractZextOpc;
+}
+
+/// Return true if \p MI may move data across vector lanes. We conservatively
+/// implement only known lane-stable cases; any unrecognized operation is
+/// assumed to shift elements.
+bool mayMIShiftElements(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+    return false;
+  case TargetOpcode::G_INTRINSIC:
+  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: {
+    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
+    case Intrinsic::aie2_v16accfloat_to_v16bf16:
+    case Intrinsic::aie2p_v16accfloat_to_v16bf16:
+    case Intrinsic::aie2p_v32accfloat_to_v32bf16:
+    case Intrinsic::aie2p_I512_I512_ACC1024_bf_mul_conf:
+      return false;
+    }
+  }
+  default:
+    return true;
+  }
+}
+
+/// Verify that all uses of a broadcast vector through a chain of operations
+/// only extract from position 0. The chain may include G_CONCAT_VECTORS,
+/// G_UNMERGE_VALUES, and vector operations.
+/// \param Reg The register to verify uses for
+/// \param MRI Machine register info
+/// \param TII Target instruction info
+/// \return true if all uses only extract position 0
+bool verifyBroadcastUsesOnlyExtractZero(Register Reg, MachineRegisterInfo &MRI,
+                                        const AIEBaseInstrInfo &TII) {
+  if (!MRI.hasOneNonDBGUser(Reg))
+    return false;
+
+  MachineInstr *UserMI = &*MRI.use_nodbg_instructions(Reg).begin();
+  unsigned Opcode = UserMI->getOpcode();
+
+  // For concat, Reg should be the first src operand.
+  if (Opcode == TargetOpcode::G_CONCAT_VECTORS) {
+    if (UserMI->getOperand(1).getReg() != Reg)
+      return false;
+    return verifyBroadcastUsesOnlyExtractZero(UserMI->getOperand(0).getReg(),
+                                              MRI, TII);
+    // For unmerge, only the first def may be used; all the other defs must
+    // be dead.
+  } else if (Opcode == TargetOpcode::G_UNMERGE_VALUES) {
+    unsigned OpCount = 0;
+    for (auto &MO : UserMI->defs()) {
+      Register DefReg = MO.getReg();
+      if (OpCount == 0 && !MRI.hasOneUse(DefReg))
+        return false;
+      else if (OpCount && !MRI.use_empty(DefReg))
+        return false;
+      OpCount++;
+    }
+    return verifyBroadcastUsesOnlyExtractZero(UserMI->getOperand(0).getReg(),
+                                              MRI, TII);
+    // If we extract from position zero, we succeed; otherwise we fail.
+  } else if (isGenericExtractOpcode(Opcode, TII)) {
+    const Register UseIdxReg = UserMI->getOperand(2).getReg();
+    auto UseIdx = getIConstantVRegValWithLookThrough(UseIdxReg, MRI);
+    return UseIdx && UseIdx->Value.getZExtValue() == 0;
+    // A bitcast may need lanes other than lane 0.
+  } else if (Opcode == TargetOpcode::G_BITCAST) {
+    return false;
+  } else {
+    if (mayMIShiftElements(UserMI))
+      return false;
+    return verifyBroadcastUsesOnlyExtractZero(UserMI->getOperand(0).getReg(),
+                                              MRI, TII);
+  }
+
+  return false;
+}
+
+} // namespace
+
 static unsigned getNumMaskUndefs(const ArrayRef<int> &Mask,
                                  unsigned StartIndex) {
   unsigned Count = 0;
@@ -4226,8 +4315,7 @@ namespace {
 MachineInstr *getBcstFeedByAssertExtVecExtr(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             const AIEBaseInstrInfo &TII) {
-  assert(MI.getOpcode() == TII.getGenericExtractVectorEltOpcode(false) ||
-         MI.getOpcode() == TII.getGenericExtractVectorEltOpcode(true));
+  assert(isGenericExtractOpcode(MI.getOpcode(), TII));
 
   /// Get single NonDebug User of \p MI with the opcode \p UseMIOpcode
   auto GetSingleNonDbgUser = [&MRI](MachineInstr &MI,
@@ -4266,8 +4354,7 @@ bool llvm::matchExtractVecEltAssertBcst(MachineInstr &MI,
                                         const AIEBaseInstrInfo &TII,
                                         GISelChangeObserver &Observer,
                                         BuildFnTy &MatchInfo) {
-  assert((MI.getOpcode() == TII.getGenericExtractVectorEltOpcode(false) ||
-          MI.getOpcode() == TII.getGenericExtractVectorEltOpcode(true)) &&
+  assert(isGenericExtractOpcode(MI.getOpcode(), TII) &&
          "Expected a extract_vector_elt");
   const MachineInstr *BcstMI = getBcstFeedByAssertExtVecExtr(MI, MRI, TII);
   if (!BcstMI)
@@ -4323,3 +4410,129 @@ bool llvm::matchMsbScalar(Register ScalarReg, Register BroadcastReg,
 
   return false;
 }
+
+/// Match a pattern where:
+///   %18:_(<16 x s32>) = COPY $x0
+///   %10:_(<16 x s32>) = G_IMPLICIT_DEF
+///   %9:_(s32) = G_CONSTANT i32 0
+///   %8:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %18(<16 x s32>), %9(s32)
+///   %22:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %10, %8(s32), %9(s32)
+///
+/// This can be simplified to:
+///   %22:_(<16 x s32>) = COPY %18
+bool llvm::matchInsertExtractVectorEltToCopy(MachineInstr &MI,
+                                             MachineRegisterInfo &MRI,
+                                             const AIEBaseInstrInfo &TII,
+                                             BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TII.getGenericInsertVectorEltOpcode() &&
+         "Expected G_AIE_INSERT_VECTOR_ELT");
+
+  // Get the insert operands.
+  const Register InsertDstReg = MI.getOperand(0).getReg();
+  const Register InsertSrcVecReg = MI.getOperand(1).getReg();
+  const Register InsertedEltReg = MI.getOperand(2).getReg();
+  const Register InsertIdxReg = MI.getOperand(3).getReg();
+
+  // Check that the insert source vector is G_IMPLICIT_DEF.
+  const MachineInstr *InsertSrcMI = MRI.getVRegDef(InsertSrcVecReg);
+  if (!InsertSrcMI || InsertSrcMI->getOpcode() != TargetOpcode::G_IMPLICIT_DEF)
+    return false;
+
+  // Get the definition of the inserted element.
+  const MachineInstr *ExtractMI = MRI.getVRegDef(InsertedEltReg);
+  if (!ExtractMI)
+    return false;
+
+  // Check if it's either a SEXT or a ZEXT extract.
+  if (!isGenericExtractOpcode(ExtractMI->getOpcode(), TII))
+    return false;
+
+  // Get the extract operands.
+  const Register ExtractSrcVecReg = ExtractMI->getOperand(1).getReg();
+  const Register ExtractIdxReg = ExtractMI->getOperand(2).getReg();
+
+  // Verify that the insert destination vector type matches the extract source
+  // vector type.
+  const LLT InsertDstTy = MRI.getType(InsertDstReg);
+  const LLT ExtractSrcTy = MRI.getType(ExtractSrcVecReg);
+
+  if (InsertDstTy != ExtractSrcTy)
+    return false;
+
+  // Check that the insert and extract indices are the same. They can be the
+  // same register, or both constants with the same value.
+  if (InsertIdxReg != ExtractIdxReg) {
+    auto InsertIdxCst = getIConstantVRegValWithLookThrough(InsertIdxReg, MRI);
+    auto ExtractIdxCst = getIConstantVRegValWithLookThrough(ExtractIdxReg, MRI);
+    if (!InsertIdxCst || !ExtractIdxCst ||
+        InsertIdxCst->Value != ExtractIdxCst->Value)
+      return false;
+  }
+
+  // Copy the extract source vector (the real vector) to the insert
+  // destination.
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildCopy(InsertDstReg, ExtractSrcVecReg);
+  };
+
+  return true;
+}
+
+/// Match a pattern where a broadcast is fed by an extract from position 0,
+/// and all uses of the broadcast through a chain of operations only extract
+/// from position 0. This allows us to replace the broadcast with a copy of
+/// the original vector.
+///
+/// Pattern:
+///   %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<16 x s32>), %3(s32) // pos 0
+///   %5:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR %200(s32)
+///   ... (chain of concat/unmerge/vector ops)
+///   %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %result(<16 x s32>), %3(s32) // pos 0
+///
+/// Transforms to:
+///   %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<16 x s32>), %3(s32)
+///   %5:_(<16 x s32>) = COPY %50(<16 x s32>) // copy the source vector instead of broadcasting
+///   ... (chain of operations)
+///   %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %result(<16 x s32>), %3(s32)
+bool llvm::matchBroadcastExtractToCopy(MachineInstr &MI,
+                                       MachineRegisterInfo &MRI,
+                                       const AIEBaseInstrInfo &TII,
+                                       BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TII.getGenericBroadcastVectorOpcode() &&
+         "Expected G_AIE_BROADCAST_VECTOR");
+
+  // 1. Verify that the broadcast source is an extract from position 0.
+  const Register BroadcastSrcReg = MI.getOperand(1).getReg();
+  const MachineInstr *ExtractMI = MRI.getVRegDef(BroadcastSrcReg);
+
+  if (!ExtractMI || !isGenericExtractOpcode(ExtractMI->getOpcode(), TII))
+    return false;
+
+  // Verify that the extraction is from position 0.
+  const Register ExtractIdxReg = ExtractMI->getOperand(2).getReg();
+  auto ExtractIdx = getIConstantVRegValWithLookThrough(ExtractIdxReg, MRI);
+  if (!ExtractIdx || ExtractIdx->Value.getZExtValue() != 0)
+    return false;
+
+  // Get the source vector that was extracted from.
+  const Register ExtractSrcVecReg = ExtractMI->getOperand(1).getReg();
+  const LLT ExtractSrcVecTy = MRI.getType(ExtractSrcVecReg);
+  const LLT BroadcastDstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  // Types must match exactly.
+  if (ExtractSrcVecTy != BroadcastDstTy)
+    return false;
+
+  // 2.
Verify all uses through the chain only extract position 0 + // using the helper function with single-use checks + const Register BroadcastDstReg = MI.getOperand(0).getReg(); + if (!verifyBroadcastUsesOnlyExtractZero(BroadcastDstReg, MRI, TII)) + return false; + + MatchInfo = [ExtractSrcVecReg, BroadcastDstReg](MachineIRBuilder &B) { + B.buildCopy(BroadcastDstReg, ExtractSrcVecReg); + }; + + return true; +} diff --git a/llvm/lib/Target/AIE/AIECombinerHelper.h b/llvm/lib/Target/AIE/AIECombinerHelper.h index 84ec9ed75bc6..315dfbaffa5a 100644 --- a/llvm/lib/Target/AIE/AIECombinerHelper.h +++ b/llvm/lib/Target/AIE/AIECombinerHelper.h @@ -316,6 +316,15 @@ bool matchExtractVecEltAssertBcst(MachineInstr &MI, MachineRegisterInfo &MRI, bool matchMsbScalar(Register ScalarReg, Register BroadcastReg, MachineRegisterInfo &MRI); +bool matchInsertExtractVectorEltToCopy(MachineInstr &MI, + MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, + BuildFnTy &MatchInfo); + +bool matchBroadcastExtractToCopy(MachineInstr &MI, MachineRegisterInfo &MRI, + const AIEBaseInstrInfo &TII, + BuildFnTy &MatchInfo); + } // namespace llvm #endif diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-broadcast-extract.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-broadcast-extract.mir new file mode 100644 index 000000000000..fe3e9c7c8ccd --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-broadcast-extract.mir @@ -0,0 +1,522 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple=aie2p -run-pass=aie2p-postlegalizer-custom-combiner -global-isel %s -verify-machineinstrs -o - | FileCheck %s + +# Check that the combiner correctly optimizes a simple extract→broadcast→extract pattern. +# The broadcast is fed by an extract from position 0, and only position 0 is extracted +# from the broadcast result. The combiner should replace the broadcast with a COPY of +# the source vector, eliminating the unnecessary broadcast operation. +--- +name: test_broadcast_extract_to_copy_simple +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_to_copy_simple + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY [[COPY]](<32 x s16>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY1]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %1(s32) + %3:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %2(s32) + %4:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %3(<32 x s16>), %1(s32) + $r0 = COPY %4(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# Check that the combiner handles patterns with G_FMUL operations (whitelisted in mayMIShiftElements). +# Similar to the FADD test, but using G_FMUL to verify the whitelist includes this operation. 
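G_FMUL, G_FADD, and G_FSUB are safe to look through because they are purely element-wise: lane 0 of the result depends only on lane 0 of the inputs, so replacing the broadcast with a copy of the source vector cannot change what a later extract of lane 0 observes. Before the test itself, a minimal standalone sketch of that invariant (plain C++ with toy 4-lane vectors, illustrative only, not part of the patch):

```cpp
#include <array>
#include <cassert>
#include <cstdio>

using Vec = std::array<float, 4>;

Vec broadcast(float X) { return {X, X, X, X}; }

// Element-wise: lane I of the result only reads lane I of the inputs.
Vec fmul(const Vec &A, const Vec &B) {
  Vec R;
  for (unsigned I = 0; I < 4; ++I)
    R[I] = A[I] * B[I];
  return R;
}

// Lane-shifting: lane 0 of the result reads lane 1 of the input. Ops like
// this must not be looked through, hence the conservative whitelist.
Vec rotate(const Vec &A) { return {A[1], A[2], A[3], A[0]}; }

int main() {
  Vec Src = {7.0f, 1.0f, 2.0f, 3.0f};   // plays the role of %0 = COPY $x0
  Vec Other = {2.0f, 4.0f, 6.0f, 8.0f}; // plays the role of %1 = COPY $x1

  // Models G_AIE_BROADCAST_VECTOR of extract(Src, 0).
  Vec Bcst = broadcast(Src[0]);

  // Element-wise op: lane 0 agrees whether we feed the broadcast or Src,
  // so COPY Src is a valid replacement for the broadcast here.
  assert(fmul(Bcst, Other)[0] == fmul(Src, Other)[0]);

  // Lane-shifting op: lane 0 differs, so the rewrite would be wrong.
  assert(rotate(Bcst)[0] != rotate(Src)[0]);

  std::puts("lane-0 equivalence holds only for element-wise ops");
  return 0;
}
```

The rotate case is exactly why mayMIShiftElements defaults to returning true for unrecognized opcodes.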
+--- +name: test_broadcast_extract_with_fmul +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + + ; CHECK-LABEL: name: test_broadcast_extract_with_fmul + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<32 x s16>) = COPY [[COPY]](<32 x s16>) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<32 x s16>) = G_FMUL [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[FMUL]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(<32 x s16>) = COPY $x1 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %2(s32) + %4:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %3(s32) + %5:_(<32 x s16>) = G_FMUL %4, %1 + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %5(<32 x s16>), %2(s32) + $r0 = COPY %6(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# Check that the combiner handles patterns with G_FSUB operations (whitelisted in mayMIShiftElements). +# Similar to the FADD test, but using G_FSUB to verify the whitelist includes this operation. +--- +name: test_broadcast_extract_with_fsub +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + + + ; CHECK-LABEL: name: test_broadcast_extract_with_fsub + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<32 x s16>) = COPY [[COPY]](<32 x s16>) + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(<32 x s16>) = G_FSUB [[COPY2]], [[COPY1]] + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[FSUB]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(<32 x s16>) = COPY $x1 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %2(s32) + %4:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %3(s32) + %5:_(<32 x s16>) = G_FSUB %4, %1 + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %5(<32 x s16>), %2(s32) + $r0 = COPY %6(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# Check that the combiner handles patterns with aie2p_v32accfloat_to_v32bf16 intrinsic. +# This intrinsic is whitelisted in mayMIShiftElements because it doesn't shift elements. +# The intrinsic converts accumulator floats (<32 x s32>) to bfloat16 (<32 x s16>). 
+--- +name: test_broadcast_extract_with_v32accfloat_to_v32bf16 +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + + ; CHECK-LABEL: name: test_broadcast_extract_with_v32accfloat_to_v32bf16 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY1]](<16 x s32>), [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<32 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.v32accfloat.to.v32bf16), [[CONCAT_VECTORS]](<32 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[INT]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<16 x s32>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<16 x s32>), %1(s32) + %3:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR %2(s32) + %8:_(<32 x s32>) = G_CONCAT_VECTORS %3(<16 x s32>), %3(<16 x s32>) + %4:_(<32 x s16>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.v32accfloat.to.v32bf16), %8(<32 x s32>) + %5:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %4(<32 x s16>), %1(s32) + $r0 = COPY %5(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# Check that the combiner handles complex chains with AIE2P intrinsics and unmerge operations. +# The broadcast flows through a G_INTRINSIC_W_SIDE_EFFECTS (bf.mul.conf) that transforms +# the type from <32 x s16> to <32 x s32>, then G_UNMERGE_VALUES extracts the first part, +# and we extract from position 0. The combiner should apply because the intrinsic is in +# the whitelist (mayMIShiftElements) and only the first unmerge output is used. 
+--- +name: test_broadcast_extract_with_intrinsic +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $r1, $r2, $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_with_intrinsic + ; CHECK: liveins: $r1, $r2, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r2 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<32 x s16>) = COPY [[COPY1]](<32 x s16>) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[COPY]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 60 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), [[COPY2]](<32 x s16>), [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[INT]](<32 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(s32) = COPY $r1 + %1:_(s32) = COPY $r2 + %8:_(s32) = G_CONSTANT i32 0 + %50:_(<32 x s16>) = COPY $x0 + %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<32 x s16>), %8(s32) + %10:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %200(s32) + %12:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %1(s32) + %14:_(s32) = G_CONSTANT i32 60 + %13:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), %10(<32 x s16>), %12(<32 x s16>), %14(s32) + %15:_(<16 x s32>), %16:_(<16 x s32>) = G_UNMERGE_VALUES %13(<32 x s32>) + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %15(<16 x s32>), %8(s32) + $r0 = COPY %6(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# Check that the combiner handles patterns with G_FADD operations through concat/unmerge chains. +# The broadcast is in the first position of concat operations, flows through G_FADD (which is +# in the whitelist), then unmerge extracts the first part, and we extract from position 0. +# The combiner should apply the optimization. 
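Conceptually, verifyBroadcastUsesOnlyExtractZero walks a single-user chain and applies one rule per link: a concat is only allowed with the tracked value as its first source, an unmerge only with a dead tail, whitelisted element-wise ops are looked through, bitcasts are rejected, and the chain must terminate in an extract of lane 0. A standalone sketch of that walk over a pre-linearized toy chain (plain C++; the Node struct and names are illustrative, the real code recurses over MachineRegisterInfo use lists instead):

```cpp
#include <cstdio>
#include <vector>

enum class Kind { Concat, Unmerge, ElementWise, Bitcast, ExtractLane };

struct Node {
  Kind K;
  unsigned OperandIndexOfValue; // which input the tracked value feeds
  unsigned ExtractLane = 0;     // only meaningful for Kind::ExtractLane
  bool TailDefsDead = true;     // only meaningful for Kind::Unmerge
};

// One rule per link; mirrors the recursion in the patch.
bool onlyLaneZeroEscapes(const std::vector<Node> &Chain) {
  for (const Node &N : Chain) {
    switch (N.K) {
    case Kind::Concat:
      if (N.OperandIndexOfValue != 0) // value must be the first source
        return false;
      break;
    case Kind::Unmerge:
      if (!N.TailDefsDead) // only the first result may be used
        return false;
      break;
    case Kind::ElementWise:
      break; // lane 0 stays lane 0
    case Kind::Bitcast:
      return false; // may remix lanes; conservatively reject
    case Kind::ExtractLane:
      return N.ExtractLane == 0; // must end in an extract of lane 0
    }
  }
  return false; // no terminating extract found
}

int main() {
  // broadcast -> concat(first) -> fadd -> unmerge(first) -> extract lane 0
  std::vector<Node> Good = {{Kind::Concat, 0},
                            {Kind::ElementWise, 0},
                            {Kind::Unmerge, 0},
                            {Kind::ExtractLane, 0, 0}};
  std::vector<Node> Bad = Good; // same chain, but read lane 1 at the end
  Bad.back().ExtractLane = 1;

  std::printf("good chain: %d, bad chain: %d\n", onlyLaneZeroEscapes(Good),
              onlyLaneZeroEscapes(Bad));
  return 0;
}
```

Running it prints "good chain: 1, bad chain: 0", matching the positive test below and the negative tests further down.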
+---
+name: test_broadcast_extract_fadd
+alignment: 1
+exposesReturnsTwice: false
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r6, $r7, $x0
+
+    ; CHECK-LABEL: name: test_broadcast_extract_fadd
+    ; CHECK: liveins: $r6, $r7, $x0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r7
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<16 x s32>) = COPY [[COPY1]](<16 x s32>)
+    ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[COPY]](s32)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY2]](<16 x s32>), [[COPY3]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY3]](<16 x s32>), [[COPY3]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[CONCAT_VECTORS]](<32 x s32>), [[CONCAT_VECTORS1]](<32 x s32>)
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[AIE_BROADCAST_VECTOR]](<16 x s32>), [[COPY4]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS4:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY4]](<16 x s32>), [[COPY4]](<16 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS5:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[CONCAT_VECTORS3]](<32 x s32>), [[CONCAT_VECTORS4]](<32 x s32>)
+    ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(<64 x s32>) = G_FADD [[CONCAT_VECTORS2]], [[CONCAT_VECTORS5]]
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<32 x s32>), [[UV1:%[0-9]+]]:_(<32 x s32>) = G_UNMERGE_VALUES [[FADD]](<64 x s32>)
+    ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<16 x s32>), [[UV3:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[UV]](<32 x s32>)
+    ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV2]](<16 x s32>), [[C]](s32)
+    ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0
+    %0:_(s32) = COPY $r6
+    %1:_(s32) = COPY $r7
+    %3:_(s32) = G_CONSTANT i32 0
+    %50:_(<16 x s32>) = COPY $x0
+    %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<16 x s32>), %3(s32)
+    %4:_(<16 x s32>) = G_IMPLICIT_DEF
+    %5:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR %200(s32)
+    %6:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR %1(s32)
+    %7:_(<16 x s32>) = COPY %4(<16 x s32>)
+    %16:_(<32 x s32>) = G_CONCAT_VECTORS %5(<16 x s32>), %7(<16 x s32>)
+    %17:_(<32 x s32>) = G_CONCAT_VECTORS %7(<16 x s32>), %7(<16 x s32>)
+    %8:_(<64 x s32>) = G_CONCAT_VECTORS %16(<32 x s32>), %17(<32 x s32>)
+    %9:_(<16 x s32>) = COPY %4(<16 x s32>)
+    %18:_(<32 x s32>) = G_CONCAT_VECTORS %6(<16 x s32>), %9(<16 x s32>)
+    %19:_(<32 x s32>) = G_CONCAT_VECTORS %9(<16 x s32>), %9(<16 x s32>)
+    %10:_(<64 x s32>) = G_CONCAT_VECTORS %18(<32 x s32>), %19(<32 x s32>)
+    %11:_(<64 x s32>) = G_FADD %8, %10
+    %20:_(<32 x s32>), %21:_(<32 x s32>) = G_UNMERGE_VALUES %11(<64 x s32>)
+    %12:_(<16 x s32>), %13:_(<16 x s32>) = G_UNMERGE_VALUES %20(<32 x s32>)
+    %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %12(<16 x s32>), %3(s32)
+    $r0 = COPY %2(s32)
+    PseudoRET implicit $lr, implicit $r0
+
+...
+
+# NEGATIVE: Check that the combiner rejects patterns where the initial extraction is from position 1.
+# The broadcast source is extracted from position 1 (not position 0), so the combiner's check +# for "ExtractIdx->Value.getZExtValue() != 0" should reject this pattern. +--- +name: test_broadcast_extract_negative_not_position_zero +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_negative_not_position_zero + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[C1]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %1(s32) + %3:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %2(s32) + %4:_(s32) = G_CONSTANT i32 0 + %5:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %3(<32 x s16>), %4(s32) + $r0 = COPY %5(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns with G_BITCAST in the use chain. +# After the broadcast, there's a G_BITCAST which may require access to lanes beyond position 0. +# The combiner's check "if (Opcode == TargetOpcode::G_BITCAST) return false" should reject this. +--- +name: test_broadcast_extract_negative_bitcast +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_negative_bitcast + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[AIE_BROADCAST_VECTOR]](<32 x s16>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %1(s32) + %3:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %2(s32) + %4:_(<16 x s32>) = G_BITCAST %3(<32 x s16>) + %5:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %4(<16 x s32>), %1(s32) + $r0 = COPY %5(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns with type mismatch. +# The extract source is <16 x s32> but the broadcast destination is <32 x s16>. +# Even though both are 512 bits, the types don't match exactly, so the combiner's +# check "if (ExtractSrcVecTy != BroadcastDstTy)" should reject this. 
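The exact-type guard matters even when the bit widths agree: <16 x s32> and <32 x s16> are both 512 bits, but once the lane geometry changes, lane 0 of one type is not lane 0 of the other. A toy illustration of the comparison (hypothetical VecTy stand-in for llvm::LLT, not LLVM code), shown before the negative test below:

```cpp
#include <cstdio>

// Toy stand-in for llvm::LLT vector types (illustrative only).
struct VecTy {
  unsigned NumElements;
  unsigned ScalarBits;
  bool operator==(const VecTy &O) const {
    return NumElements == O.NumElements && ScalarBits == O.ScalarBits;
  }
  unsigned sizeInBits() const { return NumElements * ScalarBits; }
};

int main() {
  VecTy V16S32{16, 32}; // extract source type in the negative test
  VecTy V32S16{32, 16}; // broadcast destination type in the negative test
  std::printf("same size: %d, same type: %d\n",
              V16S32.sizeInBits() == V32S16.sizeInBits(), V16S32 == V32S16);
  // Prints "same size: 1, same type: 0". The ExtractSrcVecTy !=
  // BroadcastDstTy guard rejects exactly this case, since a COPY between
  // differently-laned types would reinterpret what lane 0 contains.
  return 0;
}
```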
+--- +name: test_broadcast_extract_negative_type_mismatch +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_negative_type_mismatch + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<16 x s32>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<16 x s32>), %1(s32) + %3:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %2(s32) + %4:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %3(<32 x s16>), %1(s32) + $r0 = COPY %4(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns with multiple different extracts. +# The broadcast is used to extract from both position 0 AND position 1, meaning we need +# more than just the first element. The combiner's single-use check "!MRI.hasOneUse(Reg)" +# should reject this because the broadcast has two extract uses. +--- +name: test_broadcast_extract_negative_multiple_extracts +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_negative_multiple_extracts + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AIE_SEXT_EXTRACT_VECTOR_ELT1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT2]] + ; CHECK-NEXT: $r0 = COPY [[ADD]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(<32 x s16>) = COPY $x0 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %0(<32 x s16>), %1(s32) + %3:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %2(s32) + %4:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %3(<32 x s16>), %1(s32) + %5:_(s32) = G_CONSTANT i32 1 + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %3(<32 x s16>), %5(s32) + %7:_(s32) = G_ADD %4, %6 + $r0 = COPY %7(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns where broadcast is not the first concat operand. +# The G_CONCAT_VECTORS has G_IMPLICIT_DEF as the first operand and the broadcast as the second. 
+# The combiner's check "if (UserMI->getOperand(1).getReg() != Reg) return false" should reject +# this because the broadcast must be in position 0 of the concat for the optimization to be valid. +--- +name: test_broadcast_extract_negative_concat_not_first +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $r7, $x0 + + ; CHECK-LABEL: name: test_broadcast_extract_negative_concat_not_first + ; CHECK: liveins: $r7, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[AIE_BROADCAST_VECTOR]](<16 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<32 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(s32) = COPY $r7 + %1:_(<16 x s32>) = COPY $x0 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %1(<16 x s32>), %2(s32) + %4:_(<16 x s32>) = G_IMPLICIT_DEF + %5:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR %3(s32) + %6:_(<32 x s32>) = G_CONCAT_VECTORS %4(<16 x s32>), %5(<16 x s32>) + %7:_(<16 x s32>), %8:_(<16 x s32>) = G_UNMERGE_VALUES %6(<32 x s32>) + %9:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %7(<16 x s32>), %2(s32) + $r0 = COPY %9(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns where we extract from the second unmerge output. +# The G_UNMERGE_VALUES produces two outputs, but we extract from the second one (%16) instead +# of the first (%15). The combiner's check "if (OpCount && !MRI.use_empty(DefReg)) return false" +# should reject this because only the first unmerge output should be used. 
+--- +name: test_broadcast_extract_negative_second_unmerge +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $r1, $r2, $x0 + + + ; CHECK-LABEL: name: test_broadcast_extract_negative_second_unmerge + ; CHECK: liveins: $r1, $r2, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r2 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY1]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[COPY]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 60 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[AIE_BROADCAST_VECTOR1]](<32 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[INT]](<32 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV1]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0 + %0:_(s32) = COPY $r1 + %1:_(s32) = COPY $r2 + %8:_(s32) = G_CONSTANT i32 0 + %50:_(<32 x s16>) = COPY $x0 + %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<32 x s16>), %8(s32) + %10:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %200(s32) + %12:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %1(s32) + %14:_(s32) = G_CONSTANT i32 60 + %13:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), %10(<32 x s16>), %12(<32 x s16>), %14(s32) + %15:_(<16 x s32>), %16:_(<16 x s32>) = G_UNMERGE_VALUES %13(<32 x s32>) + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %16(<16 x s32>), %8(s32) + $r0 = COPY %6(s32) + PseudoRET implicit $lr, implicit $r0 + +... + +# NEGATIVE: Check that the combiner rejects patterns where the first unmerge output has multiple uses. +# The first unmerge output (%15) is both extracted from AND returned implicitly by PseudoRET. +# The combiner's check "if (OpCount == 0 && !MRI.hasOneUse(DefReg)) return false" should reject +# this because the first unmerge output must have exactly one use. 
+--- +name: test_broadcast_extract_negative_first_unmerge_multiple_uses +alignment: 1 +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $r1, $r2, $x0 + + + ; CHECK-LABEL: name: test_broadcast_extract_negative_first_unmerge_multiple_uses + ; CHECK: liveins: $r1, $r2, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $r2 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[COPY1]](<32 x s16>), [[C]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR [[COPY]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 60 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), [[AIE_BROADCAST_VECTOR]](<32 x s16>), [[AIE_BROADCAST_VECTOR1]](<32 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[INT]](<32 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: $r0 = COPY [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $r0, implicit [[UV]](<16 x s32>) + %0:_(s32) = COPY $r1 + %1:_(s32) = COPY $r2 + %8:_(s32) = G_CONSTANT i32 0 + %50:_(<32 x s16>) = COPY $x0 + %200:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %50(<32 x s16>), %8(s32) + %10:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %200(s32) + %12:_(<32 x s16>) = G_AIE_BROADCAST_VECTOR %1(s32) + %14:_(s32) = G_CONSTANT i32 60 + %13:_(<32 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aie2p.I512.I512.ACC1024.bf.mul.conf), %10(<32 x s16>), %12(<32 x s16>), %14(s32) + %15:_(<16 x s32>), %16:_(<16 x s32>) = G_UNMERGE_VALUES %13(<32 x s32>) + %6:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %15(<16 x s32>), %8(s32) + $r0 = COPY %6(s32) + PseudoRET implicit $lr, implicit $r0, implicit %15 + +... diff --git a/llvm/test/CodeGen/AIE/GlobalISel/combine-insert-extract-vector-elt.mir b/llvm/test/CodeGen/AIE/GlobalISel/combine-insert-extract-vector-elt.mir new file mode 100644 index 000000000000..bace1143ee46 --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/combine-insert-extract-vector-elt.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -mtriple=aie2 -run-pass=aie2-postlegalizer-custom-combiner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aie2p -run-pass=aie2p-postlegalizer-custom-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_insert_extract_s32_vector +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_insert_extract_s32_vector + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: %result:_(<16 x s32>) = COPY %src_vec(<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<16 x s32>) + %src_vec:_(<16 x s32>) = COPY $x0 + %idx:_(s32) = G_CONSTANT i32 0 + %undef_vec:_(<16 x s32>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec, %idx + %result:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx + PseudoRET implicit $lr, implicit %result +... +--- +name: test_insert_extract_s16_vector +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + + ; CHECK-LABEL: name: test_insert_extract_s16_vector + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: %result:_(<32 x s16>) = COPY %src_vec(<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<32 x s16>) + %src_vec:_(<32 x s16>) = COPY $x0 + %idx:_(s32) = G_CONSTANT i32 0 + %undef_vec:_(<32 x s16>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_ZEXT_EXTRACT_VECTOR_ELT %src_vec, %idx + %result:_(<32 x s16>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx + PseudoRET implicit $lr, implicit %result +... +--- +name: test_same_constant_non_zero +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_same_constant_non_zero + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: %result:_(<16 x s32>) = COPY %src_vec(<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<16 x s32>) + %src_vec:_(<16 x s32>) = COPY $x0 + %idx:_(s32) = G_CONSTANT i32 5 + %undef_vec:_(<16 x s32>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec, %idx + %result:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx + PseudoRET implicit $lr, implicit %result +... +--- +name: test_same_register_dynamic_index +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0, $x0 + ; CHECK-LABEL: name: test_same_register_dynamic_index + ; CHECK: liveins: $r0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: %result:_(<16 x s32>) = COPY %src_vec(<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<16 x s32>) + %idx:_(s32) = COPY $r0 + %src_vec:_(<16 x s32>) = COPY $x0 + %undef_vec:_(<16 x s32>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_ZEXT_EXTRACT_VECTOR_ELT %src_vec, %idx + %result:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx + PseudoRET implicit $lr, implicit %result +... 
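The positive tests above and the negative tests below hinge on one rule in matchInsertExtractVectorEltToCopy: the insert and extract indices match if they are literally the same virtual register (even for a dynamic, non-constant index) or if both independently fold to the same constant. A compact sketch of that rule (toy register numbering; foldConstant is a hypothetical stand-in for getIConstantVRegValWithLookThrough):

```cpp
#include <cstdio>
#include <optional>

using Register = unsigned;

// Hypothetical stand-in for getIConstantVRegValWithLookThrough: returns the
// constant a register is known to hold, if any.
std::optional<long> foldConstant(Register R) {
  switch (R) {
  case 1: return 5;             // %idx1 = G_CONSTANT i32 5
  case 2: return 5;             // %idx2 = G_CONSTANT i32 5 (distinct vreg)
  case 3: return 2;             // %idx3 = G_CONSTANT i32 2
  default: return std::nullopt; // e.g. a dynamic index copied from $r0
  }
}

bool indicesMatch(Register InsertIdx, Register ExtractIdx) {
  if (InsertIdx == ExtractIdx) // same vreg: matches even when not constant
    return true;
  auto A = foldConstant(InsertIdx);
  auto B = foldConstant(ExtractIdx);
  return A && B && *A == *B; // otherwise both must fold to the same value
}

int main() {
  std::printf("%d %d %d\n",
              indicesMatch(7, 7),  // same dynamic register -> 1
              indicesMatch(1, 2),  // equal constants       -> 1
              indicesMatch(1, 3)); // different constants   -> 0
  return 0;
}
```

This is why test_same_constant_non_zero combines at index 5, test_same_register_dynamic_index combines without any constant at all, and test_no_combine_different_constants below must not combine.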
+--- +name: test_no_combine_different_constants +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_no_combine_different_constants + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: %idx1:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %idx2:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: %undef_vec:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec(<16 x s32>), %idx1(s32) + ; CHECK-NEXT: %result:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt(s32), %idx2(s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<16 x s32>) + %src_vec:_(<16 x s32>) = COPY $x0 + %idx1:_(s32) = G_CONSTANT i32 1 + %idx2:_(s32) = G_CONSTANT i32 2 + %undef_vec:_(<16 x s32>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec, %idx1 + %result:_(<16 x s32>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx2 + PseudoRET implicit $lr, implicit %result +... +--- +name: test_no_combine_type_mismatch +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: test_no_combine_type_mismatch + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src_vec:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: %idx:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: %undef_vec:_(<32 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec(<16 x s32>), %idx(s32) + ; CHECK-NEXT: %result:_(<32 x s16>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt(s32), %idx(s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit %result(<32 x s16>) + %src_vec:_(<16 x s32>) = COPY $x0 + %idx:_(s32) = G_CONSTANT i32 0 + %undef_vec:_(<32 x s16>) = G_IMPLICIT_DEF + %extracted_elt:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %src_vec, %idx + %result:_(<32 x s16>) = G_AIE_INSERT_VECTOR_ELT %undef_vec, %extracted_elt, %idx + PseudoRET implicit $lr, implicit %result +...
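Finally, why the rewrite these tests lock in is sound: in insert(undef, extract(v, i), i), lane i of the result equals lane i of v by construction, and every other lane is undef, so materializing the whole result as COPY v merely refines the undef lanes. A closing standalone sketch (plain C++ model of undef lanes, not LLVM code):

```cpp
#include <array>
#include <cassert>
#include <optional>

using Lane = std::optional<int>; // nullopt models an undef lane
using Vec = std::array<Lane, 4>;

// Models %r = G_AIE_INSERT_VECTOR_ELT %undef, (extract %v, I), I.
Vec insertIntoUndef(const Vec &V, unsigned I) {
  Vec R{};     // all lanes undef
  R[I] = V[I]; // the inserted lane equals lane I of V by construction
  return R;
}

// Partial refines Full if every defined lane of Partial agrees with Full;
// undef lanes may legally take any value, including Full's.
bool refines(const Vec &Full, const Vec &Partial) {
  for (unsigned I = 0; I < 4; ++I)
    if (Partial[I] && Partial[I] != Full[I])
      return false;
  return true;
}

int main() {
  Vec V = {10, 20, 30, 40};
  // COPY %v is a legal refinement for any index, which is why the combine
  // fires for index 0, a non-zero constant index, and a dynamic index alike.
  for (unsigned I = 0; I < 4; ++I)
    assert(refines(V, insertIntoUndef(V, I)));
  return 0;
}
```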