diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index c15263e0b06f8..7d7b5364d6b68 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -693,18 +693,19 @@ class CombinerHelper { /// feeding a G_AND instruction \p MI. bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo) const; - /// Given an G_UDIV \p MI expressing a divide by constant, return an - /// expression that implements it by multiplying by a magic number. + /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant, + /// return an expression that implements it by multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildUDivUsingMul(MachineInstr &MI) const; - /// Combine G_UDIV by constant into a multiply by magic constant. - bool matchUDivByConst(MachineInstr &MI) const; - void applyUDivByConst(MachineInstr &MI) const; + MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const; + /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant. + bool matchUDivorURemByConst(MachineInstr &MI) const; + void applyUDivorURemByConst(MachineInstr &MI) const; /// Given an G_SDIV \p MI expressing a signed divide by constant, return an /// expression that implements it by multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". MachineInstr *buildSDivUsingMul(MachineInstr &MI) const; + /// Combine G_SDIV by constant into a multiply by magic constant. 
bool matchSDivByConst(MachineInstr &MI) const; void applySDivByConst(MachineInstr &MI) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 4a92dc16c1bf4..6033d80e717d3 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1132,8 +1132,8 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg, def udiv_by_const : GICombineRule< (defs root:$root), (match (wip_match_opcode G_UDIV):$root, - [{ return Helper.matchUDivByConst(*${root}); }]), - (apply [{ Helper.applyUDivByConst(*${root}); }])>; + [{ return Helper.matchUDivorURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; def sdiv_by_const : GICombineRule< (defs root:$root), @@ -1156,6 +1156,14 @@ def udiv_by_pow2 : GICombineRule< def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const, sdiv_by_pow2, udiv_by_pow2]>; +def urem_by_const : GICombineRule< + (defs root:$root), + (match (G_UREM $dst, $x, $y):$root, + [{ return Helper.matchUDivorURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + +def intrem_combines : GICombineGroup<[urem_by_const]>; + def reassoc_ptradd : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), (match (wip_match_opcode G_PTR_ADD):$root, @@ -2048,7 +2056,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, constant_fold_cast_op, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, and_or_disjoint_mask, fma_combines, fold_binop_into_select, - sub_add_reg, select_to_minmax, + intrem_combines, sub_add_reg, select_to_minmax, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
index 05dd269d48921..3b11d0848d300 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5295,12 +5295,13 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI, return false; } -MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_UDIV); - auto &UDiv = cast<GenericMachineInstr>(MI); - Register Dst = UDiv.getReg(0); - Register LHS = UDiv.getReg(1); - Register RHS = UDiv.getReg(2); +MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); + auto &UDivorRem = cast<GenericMachineInstr>(MI); + Register Dst = UDivorRem.getReg(0); + Register LHS = UDivorRem.getReg(1); + Register RHS = UDivorRem.getReg(2); LLT Ty = MRI.getType(Dst); LLT ScalarTy = Ty.getScalarType(); const unsigned EltBits = ScalarTy.getScalarSizeInBits(); @@ -5453,11 +5454,18 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) const { auto IsOne = MIB.buildICmp( CmpInst::Predicate::ICMP_EQ, Ty.isScalar() ?
LLT::scalar(1) : Ty.changeElementSize(1), RHS, One); - return MIB.buildSelect(Ty, IsOne, LHS, Q); + auto ret = MIB.buildSelect(Ty, IsOne, LHS, Q); + + if (Opcode == TargetOpcode::G_UREM) { + auto Prod = MIB.buildMul(Ty, ret, RHS); + return MIB.buildSub(Ty, LHS, Prod); + } + return ret; } -bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_UDIV); +bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); Register Dst = MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); @@ -5474,7 +5482,8 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const { if (MF.getFunction().hasMinSize()) return false; - if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + if (Opcode == TargetOpcode::G_UDIV && + MI.getFlag(MachineInstr::MIFlag::IsExact)) { return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } @@ -5494,14 +5503,17 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) const { {DstTy.isVector() ? 
DstTy.changeElementSize(1) : LLT::scalar(1), DstTy}})) return false; + if (Opcode == TargetOpcode::G_UREM && + !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}})) + return false; } return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applyUDivByConst(MachineInstr &MI) const { - auto *NewMI = buildUDivUsingMul(MI); +void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const { + auto *NewMI = buildUDivorURemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll index 88bab4af95d64..467ceb062f249 100644 --- a/llvm/test/CodeGen/AArch64/pr58431.ll +++ b/llvm/test/CodeGen/AArch64/pr58431.ll @@ -4,10 +4,12 @@ define i32 @f(i64 %0) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #10 // =0xa +; CHECK-NEXT: mov x8, #-7378697629483820647 // =0x9999999999999999 ; CHECK-NEXT: mov w9, w0 -; CHECK-NEXT: udiv x10, x9, x8 -; CHECK-NEXT: msub x0, x10, x8, x9 +; CHECK-NEXT: mov w10, #10 // =0xa +; CHECK-NEXT: eor x8, x8, #0x8000000000000003 +; CHECK-NEXT: umulh x8, x9, x8 +; CHECK-NEXT: msub x0, x8, x10, x9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %2 = trunc i64 %0 to i32 diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll new file mode 100644 index 0000000000000..1376f5d9a380d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -0,0 +1,3616 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-eabi -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define i8 @si8_7(i8 %a, i8 %b) { +; CHECK-SD-LABEL: si8_7: +; CHECK-SD: // 
%bb.0: // %entry +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: mov w9, #-109 // =0xffffff93 +; CHECK-SD-NEXT: mul w8, w8, w9 +; CHECK-SD-NEXT: add w8, w0, w8, lsr #8 +; CHECK-SD-NEXT: sbfx w9, w8, #2, #6 +; CHECK-SD-NEXT: and w8, w8, #0x80 +; CHECK-SD-NEXT: add w8, w9, w8, lsr #7 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = srem i8 %a, 7 + ret i8 %s +} + +define i8 @si8_100(i8 %a, i8 %b) { +; CHECK-SD-LABEL: si8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: mov w9, #41 // =0x29 +; CHECK-SD-NEXT: mul w8, w8, w9 +; CHECK-SD-NEXT: asr w9, w8, #12 +; CHECK-SD-NEXT: add w8, w9, w8, lsr #31 +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i8 %a, 100 + ret i8 %s +} + +define i8 @ui8_7(i8 %a, i8 %b) { +; CHECK-SD-LABEL: ui8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #37 // =0x25 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: mul w8, w9, w8 +; CHECK-SD-NEXT: lsr w8, w8, #8 +; CHECK-SD-NEXT: sub w9, w0, w8 +; CHECK-SD-NEXT: and w9, w9, #0xfe +; CHECK-SD-NEXT: add w8, w8, w9, lsr #1 +; CHECK-SD-NEXT: lsr w8, w8, #2 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #37 // =0x25 +; CHECK-GI-NEXT: and w9, w0, #0xff +; CHECK-GI-NEXT: mul w8, w9, 
w8 +; CHECK-GI-NEXT: lsr w8, w8, #8 +; CHECK-GI-NEXT: sub w9, w0, w8 +; CHECK-GI-NEXT: ubfx w9, w9, #1, #7 +; CHECK-GI-NEXT: add w8, w9, w8 +; CHECK-GI-NEXT: ubfx w8, w8, #2, #6 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i8 %a, 7 + ret i8 %s +} + +define i8 @ui8_100(i8 %a, i8 %b) { +; CHECK-SD-LABEL: ui8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #41 // =0x29 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: mul w8, w9, w8 +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: lsr w8, w8, #12 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #41 // =0x29 +; CHECK-GI-NEXT: and w9, w0, #0xff +; CHECK-GI-NEXT: mul w8, w9, w8 +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: lsr w8, w8, #8 +; CHECK-GI-NEXT: lsr w8, w8, #4 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret +entry: + %s = urem i8 %a, 100 + ret i8 %s +} + +define i16 @si16_7(i16 %a, i16 %b) { +; CHECK-SD-LABEL: si16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: mov w9, #18725 // =0x4925 +; CHECK-SD-NEXT: mul w8, w8, w9 +; CHECK-SD-NEXT: asr w9, w8, #17 +; CHECK-SD-NEXT: add w8, w9, w8, lsr #31 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = srem i16 %a, 7 + ret i16 %s +} + +define i16 @si16_100(i16 %a, i16 %b) { +; CHECK-SD-LABEL: si16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: mov w9, #5243 // =0x147b +; CHECK-SD-NEXT: mul w8, w8, w9 +; 
CHECK-SD-NEXT: asr w9, w8, #19 +; CHECK-SD-NEXT: add w8, w9, w8, lsr #31 +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i16 %a, 100 + ret i16 %s +} + +define i16 @ui16_7(i16 %a, i16 %b) { +; CHECK-SD-LABEL: ui16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: and w9, w0, #0xffff +; CHECK-SD-NEXT: mul w8, w9, w8 +; CHECK-SD-NEXT: lsr w8, w8, #16 +; CHECK-SD-NEXT: sub w9, w0, w8 +; CHECK-SD-NEXT: and w9, w9, #0xfffe +; CHECK-SD-NEXT: add w8, w8, w9, lsr #1 +; CHECK-SD-NEXT: lsr w8, w8, #2 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: and w9, w0, #0xffff +; CHECK-GI-NEXT: mul w8, w9, w8 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: sub w9, w0, w8 +; CHECK-GI-NEXT: ubfx w9, w9, #1, #15 +; CHECK-GI-NEXT: add w8, w9, w8 +; CHECK-GI-NEXT: ubfx w8, w8, #2, #14 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i16 %a, 7 + ret i16 %s +} + +define i16 @ui16_100(i16 %a, i16 %b) { +; CHECK-SD-LABEL: ui16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ubfx w8, w0, #2, #14 +; CHECK-SD-NEXT: mov w9, #5243 // =0x147b +; CHECK-SD-NEXT: mul w8, w8, w9 +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: lsr w8, w8, #17 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ubfx w8, w0, #2, #14 +; CHECK-GI-NEXT: mov w9, #5243 // =0x147b +; CHECK-GI-NEXT: mul w8, w8, w9 +; 
CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: lsr w8, w8, #1 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret +entry: + %s = urem i16 %a, 100 + ret i16 %s +} + +define i32 @si32_7(i32 %a, i32 %b) { +; CHECK-SD-LABEL: si32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: smull x8, w0, w8 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: add w8, w8, w0 +; CHECK-SD-NEXT: asr w9, w8, #2 +; CHECK-SD-NEXT: add w8, w9, w8, lsr #31 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sdiv w8, w0, w8 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = srem i32 %a, 7 + ret i32 %s +} + +define i32 @si32_100(i32 %a, i32 %b) { +; CHECK-SD-LABEL: si32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: smull x8, w0, w8 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: sdiv w9, w0, w8 +; CHECK-GI-NEXT: msub w0, w9, w8, w0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i32 %a, 100 + ret i32 %s +} + +define i32 @ui32_7(i32 %a, i32 %b) { +; CHECK-SD-LABEL: ui32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: umull x8, w0, w8 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: sub w9, w0, w8 +; CHECK-SD-NEXT: add w8, w8, w9, lsr #1 +; CHECK-SD-NEXT: lsr w8, w8, #2 +; 
CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w0, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #18725 // =0x4925 +; CHECK-GI-NEXT: movk w8, #9362, lsl #16 +; CHECK-GI-NEXT: umull x8, w0, w8 +; CHECK-GI-NEXT: lsr x8, x8, #32 +; CHECK-GI-NEXT: sub w9, w0, w8 +; CHECK-GI-NEXT: add w8, w8, w9, lsr #1 +; CHECK-GI-NEXT: lsr w8, w8, #2 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w0, w0, w8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i32 %a, 7 + ret i32 %s +} + +define i32 @ui32_100(i32 %a, i32 %b) { +; CHECK-SD-LABEL: ui32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: umull x8, w0, w8 +; CHECK-SD-NEXT: lsr x8, x8, #37 +; CHECK-SD-NEXT: msub w0, w8, w9, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; CHECK-GI-NEXT: umull x8, w0, w8 +; CHECK-GI-NEXT: lsr x8, x8, #32 +; CHECK-GI-NEXT: lsr w8, w8, #5 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 +; CHECK-GI-NEXT: ret +entry: + %s = urem i32 %a, 100 + ret i32 %s +} + +define i64 @si64_7(i64 %a, i64 %b) { +; CHECK-SD-LABEL: si64_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #18725 // =0x4925 +; CHECK-SD-NEXT: movk x8, #9362, lsl #16 +; CHECK-SD-NEXT: movk x8, #37449, lsl #32 +; CHECK-SD-NEXT: movk x8, #18724, lsl #48 +; CHECK-SD-NEXT: smulh x8, x0, x8 +; CHECK-SD-NEXT: asr x9, x8, #1 +; CHECK-SD-NEXT: add x8, x9, x8, lsr #63 +; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3 +; CHECK-SD-NEXT: add x0, x0, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si64_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sdiv x8, x0, x8 +; CHECK-GI-NEXT: lsl x9, x8, #3 +; 
CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: sub x0, x0, x8 +; CHECK-GI-NEXT: ret +entry: + %s = srem i64 %a, 7 + ret i64 %s +} + +define i64 @si64_100(i64 %a, i64 %b) { +; CHECK-SD-LABEL: si64_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #55051 // =0xd70b +; CHECK-SD-NEXT: movk x8, #28835, lsl #16 +; CHECK-SD-NEXT: movk x8, #2621, lsl #32 +; CHECK-SD-NEXT: movk x8, #41943, lsl #48 +; CHECK-SD-NEXT: smulh x8, x0, x8 +; CHECK-SD-NEXT: add x8, x8, x0 +; CHECK-SD-NEXT: asr x9, x8, #6 +; CHECK-SD-NEXT: add x8, x9, x8, lsr #63 +; CHECK-SD-NEXT: mov w9, #100 // =0x64 +; CHECK-SD-NEXT: msub x0, x8, x9, x0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: si64_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: sdiv x9, x0, x8 +; CHECK-GI-NEXT: msub x0, x9, x8, x0 +; CHECK-GI-NEXT: ret +entry: + %s = srem i64 %a, 100 + ret i64 %s +} + +define i64 @ui64_7(i64 %a, i64 %b) { +; CHECK-SD-LABEL: ui64_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #9363 // =0x2493 +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: movk x8, #18724, lsl #32 +; CHECK-SD-NEXT: movk x8, #9362, lsl #48 +; CHECK-SD-NEXT: umulh x8, x0, x8 +; CHECK-SD-NEXT: sub x9, x0, x8 +; CHECK-SD-NEXT: add x8, x8, x9, lsr #1 +; CHECK-SD-NEXT: lsr x8, x8, #2 +; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3 +; CHECK-SD-NEXT: add x0, x0, x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui64_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, #9363 // =0x2493 +; CHECK-GI-NEXT: movk x8, #37449, lsl #16 +; CHECK-GI-NEXT: movk x8, #18724, lsl #32 +; CHECK-GI-NEXT: movk x8, #9362, lsl #48 +; CHECK-GI-NEXT: umulh x8, x0, x8 +; CHECK-GI-NEXT: sub x9, x0, x8 +; CHECK-GI-NEXT: add x8, x8, x9, lsr #1 +; CHECK-GI-NEXT: lsr x8, x8, #2 +; CHECK-GI-NEXT: lsl x9, x8, #3 +; CHECK-GI-NEXT: sub x8, x9, x8 +; CHECK-GI-NEXT: sub x0, x0, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i64 %a, 7 + ret i64 %s +} + +define i64 @ui64_100(i64 %a, i64 %b) { +; 
CHECK-LABEL: ui64_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x9, #62915 // =0xf5c3 +; CHECK-NEXT: lsr x8, x0, #2 +; CHECK-NEXT: movk x9, #23592, lsl #16 +; CHECK-NEXT: movk x9, #49807, lsl #32 +; CHECK-NEXT: movk x9, #10485, lsl #48 +; CHECK-NEXT: umulh x8, x8, x9 +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: lsr x8, x8, #2 +; CHECK-NEXT: msub x0, x8, x9, x0 +; CHECK-NEXT: ret +entry: + %s = urem i64 %a, 100 + ret i64 %s +} + +define i128 @si128_7(i128 %a, i128 %b) { +; CHECK-LABEL: si128_7: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov w2, #7 // =0x7 +; CHECK-NEXT: mov x3, xzr +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = srem i128 %a, 7 + ret i128 %s +} + +define i128 @si128_100(i128 %a, i128 %b) { +; CHECK-LABEL: si128_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov w2, #100 // =0x64 +; CHECK-NEXT: mov x3, xzr +; CHECK-NEXT: bl __modti3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %s = srem i128 %a, 100 + ret i128 %s +} + +define i128 @ui128_7(i128 %a, i128 %b) { +; CHECK-SD-LABEL: ui128_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui128_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, #18725 // =0x4925 +; CHECK-GI-NEXT: mov x10, #9362 // =0x2492 +; CHECK-GI-NEXT: movk x8, #9362, lsl #16 +; CHECK-GI-NEXT: movk x10, #37449, lsl #16 +; CHECK-GI-NEXT: movk x8, #37449, lsl #32 +; CHECK-GI-NEXT: movk x10, #18724, lsl #32 +; CHECK-GI-NEXT: movk x8, #18724, lsl #48 +; CHECK-GI-NEXT: movk x10, #9362, lsl #48 +; CHECK-GI-NEXT: mul x9, x1, x8 +; CHECK-GI-NEXT: mul x11, x0, x10 +; CHECK-GI-NEXT: umulh x12, x0, x8 +; CHECK-GI-NEXT: mul x13, x1, x10 +; CHECK-GI-NEXT: adds x9, x9, x11 +; CHECK-GI-NEXT: umulh x14, x1, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: cmn x9, x12 +; CHECK-GI-NEXT: and x9, x11, #0x1 +; CHECK-GI-NEXT: sub x12, x0, x0 +; CHECK-GI-NEXT: umulh x15, x0, x10 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: and x11, x11, #0x1 +; CHECK-GI-NEXT: add x12, x13, x12 +; CHECK-GI-NEXT: and x13, xzr, #0x1 +; CHECK-GI-NEXT: umulh x8, xzr, x8 +; CHECK-GI-NEXT: add x9, x9, x11 +; CHECK-GI-NEXT: and x11, xzr, #0x1 +; CHECK-GI-NEXT: adds x12, x12, x14 +; CHECK-GI-NEXT: add x11, x11, x13 +; CHECK-GI-NEXT: umulh x10, x1, x10 +; CHECK-GI-NEXT: cset w13, hs +; CHECK-GI-NEXT: adds x12, x12, x15 +; CHECK-GI-NEXT: and x13, x13, #0x1 +; CHECK-GI-NEXT: umulh x14, x0, xzr +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: adds x9, x12, x9 +; CHECK-GI-NEXT: add x11, x11, x13 +; CHECK-GI-NEXT: and x12, x15, #0x1 +; CHECK-GI-NEXT: cset w13, hs +; CHECK-GI-NEXT: add x11, x11, x12 +; CHECK-GI-NEXT: and x12, x13, #0x1 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: add x10, x11, x12 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: subs x10, x0, x9 +; 
CHECK-GI-NEXT: sbc x11, x1, x8 +; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: lsr x11, x11, #1 +; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 +; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: lsr x8, x8, #2 +; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 +; CHECK-GI-NEXT: lsl x12, x8, #3 +; CHECK-GI-NEXT: umulh x10, x9, x10 +; CHECK-GI-NEXT: lsl x11, x9, #3 +; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: sbc x1, x1, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i128 %a, 7 + ret i128 %s +} + +define i128 @ui128_100(i128 %a, i128 %b) { +; CHECK-SD-LABEL: ui128_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ui128_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, #23593 // =0x5c29 +; CHECK-GI-NEXT: mov x10, #62914 // =0xf5c2 +; CHECK-GI-NEXT: movk x8, #49807, lsl #16 +; CHECK-GI-NEXT: movk x10, #23592, lsl #16 +; CHECK-GI-NEXT: movk x8, #10485, lsl #32 +; CHECK-GI-NEXT: movk x10, #49807, lsl #32 +; CHECK-GI-NEXT: movk x8, #36700, lsl #48 +; CHECK-GI-NEXT: movk x10, #10485, lsl #48 +; CHECK-GI-NEXT: mul x9, x1, x8 +; CHECK-GI-NEXT: mul x11, x0, x10 +; CHECK-GI-NEXT: umulh x12, x0, x8 +; CHECK-GI-NEXT: mul x13, x1, x10 +; CHECK-GI-NEXT: adds x9, x9, x11 +; CHECK-GI-NEXT: umulh x14, x1, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: cmn x9, x12 +; CHECK-GI-NEXT: and x9, x11, #0x1 +; CHECK-GI-NEXT: sub x12, x0, x0 +; CHECK-GI-NEXT: umulh x15, x0, x10 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: and x11, x11, 
#0x1 +; CHECK-GI-NEXT: add x12, x13, x12 +; CHECK-GI-NEXT: and x13, xzr, #0x1 +; CHECK-GI-NEXT: umulh x8, xzr, x8 +; CHECK-GI-NEXT: add x9, x9, x11 +; CHECK-GI-NEXT: and x11, xzr, #0x1 +; CHECK-GI-NEXT: adds x12, x12, x14 +; CHECK-GI-NEXT: add x11, x11, x13 +; CHECK-GI-NEXT: umulh x10, x1, x10 +; CHECK-GI-NEXT: cset w13, hs +; CHECK-GI-NEXT: adds x12, x12, x15 +; CHECK-GI-NEXT: and x13, x13, #0x1 +; CHECK-GI-NEXT: umulh x14, x0, xzr +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: adds x9, x12, x9 +; CHECK-GI-NEXT: add x11, x11, x13 +; CHECK-GI-NEXT: and x12, x15, #0x1 +; CHECK-GI-NEXT: cset w13, hs +; CHECK-GI-NEXT: add x11, x11, x12 +; CHECK-GI-NEXT: and x12, x13, #0x1 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: add x10, x11, x12 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x10 +; CHECK-GI-NEXT: lsl x10, x8, #60 +; CHECK-GI-NEXT: lsr x8, x8, #4 +; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: umulh x11, x9, x10 +; CHECK-GI-NEXT: mul x9, x9, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x11 +; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: sbc x1, x1, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem i128 %a, 100 + ret i128 %s +} + +define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: sv2i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.2s, v0.2s, #24 +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: movi v3.2s, #7 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: sshr v0.2s, v1.2s, #24 +; CHECK-SD-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #32 +; CHECK-SD-NEXT: ssra v2.2s, v1.2s, #24 +; CHECK-SD-NEXT: sshr v1.2s, v2.2s, #2 +; CHECK-SD-NEXT: usra v1.2s, v2.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: 
fmov s1, w8 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: smov w11, v1.h[1] +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: fmov s1, w10 +; CHECK-GI-NEXT: mov v1.s[1], w11 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i8> %d, <i8 7, i8 7> + ret <2 x i8> %s +} + +define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: sv2i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: smov w11, v1.h[1] +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: fmov s1, w10 +; CHECK-GI-NEXT: mov v1.s[1], w11 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def
$d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i8> %d, <i8 100, i8 100> + ret <2 x i8> %s +} + +define <3 x i8> @sv3i8_7(<3 x i8> %d, <3 x i8> %e) { +; CHECK-SD-LABEL: sv3i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: sxtb x8, w0 +; CHECK-SD-NEXT: mov x9, #-56173 // =0xffffffffffff2493 +; CHECK-SD-NEXT: sxtb x10, w1 +; CHECK-SD-NEXT: sxtb x11, w2 +; CHECK-SD-NEXT: movk x9, #37449, lsl #16 +; CHECK-SD-NEXT: sxtb w12, w1 +; CHECK-SD-NEXT: smull x8, w8, w9 +; CHECK-SD-NEXT: sxtb w13, w0 +; CHECK-SD-NEXT: smull x10, w10, w9 +; CHECK-SD-NEXT: smull x9, w11, w9 +; CHECK-SD-NEXT: sxtb w11, w2 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: lsr x10, x10, #32 +; CHECK-SD-NEXT: lsr x9, x9, #32 +; CHECK-SD-NEXT: add w8, w8, w13 +; CHECK-SD-NEXT: add w10, w10, w12 +; CHECK-SD-NEXT: asr w14, w8, #2 +; CHECK-SD-NEXT: add w9, w9, w11 +; CHECK-SD-NEXT: asr w15, w10, #2 +; CHECK-SD-NEXT: asr w16, w9, #2 +; CHECK-SD-NEXT: add w8, w14, w8, lsr #31 +; CHECK-SD-NEXT: add w10, w15, w10, lsr #31 +; CHECK-SD-NEXT: add w9, w16, w9, lsr #31 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: sub w10, w10, w10, lsl #3 +; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 +; CHECK-SD-NEXT: add w0, w13, w8 +; CHECK-SD-NEXT: add w1, w12, w10 +; CHECK-SD-NEXT: add w2, w11, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: sxtb w11, w1 +; CHECK-GI-NEXT: sxtb w13, w2 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w12, w11, w9 +; CHECK-GI-NEXT: lsl w14, w10, #3 +; CHECK-GI-NEXT: sub w10, w14, w10 +; CHECK-GI-NEXT: sub w0, w8, w10 +; CHECK-GI-NEXT: sdiv w9, w13, w9 +; CHECK-GI-NEXT: lsl w15, w12, #3 +; CHECK-GI-NEXT: sub w12, w15, w12 +; CHECK-GI-NEXT: sub w1, w11, w12 +; CHECK-GI-NEXT: lsl
w16, w9, #3 +; CHECK-GI-NEXT: sub w9, w16, w9 +; CHECK-GI-NEXT: sub w2, w13, w9 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i8> %d, <i8 7, i8 7, i8 7> + ret <3 x i8> %s +} + +define <3 x i8> @sv3i8_100(<3 x i8> %d, <3 x i8> %e) { +; CHECK-SD-LABEL: sv3i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: sxtb x8, w0 +; CHECK-SD-NEXT: mov w9, #34079 // =0x851f +; CHECK-SD-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: sxtb x10, w1 +; CHECK-SD-NEXT: movk w9, #20971, lsl #16 +; CHECK-SD-NEXT: sxtb x11, w2 +; CHECK-SD-NEXT: sxtb w12, w0 +; CHECK-SD-NEXT: smull x8, w8, w9 +; CHECK-SD-NEXT: smull x10, w10, w9 +; CHECK-SD-NEXT: smull x9, w11, w9 +; CHECK-SD-NEXT: mov w11, #100 // =0x64 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: asr x10, x10, #37 +; CHECK-SD-NEXT: asr x9, x9, #37 +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: add w10, w10, w10, lsr #31 +; CHECK-SD-NEXT: add w9, w9, w9, lsr #31 +; CHECK-SD-NEXT: msub w0, w8, w11, w12 +; CHECK-SD-NEXT: sxtb w8, w1 +; CHECK-SD-NEXT: msub w1, w10, w11, w8 +; CHECK-SD-NEXT: sxtb w8, w2 +; CHECK-SD-NEXT: msub w2, w9, w11, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: sxtb w11, w1 +; CHECK-GI-NEXT: sxtb w13, w2 +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: sdiv w12, w11, w9 +; CHECK-GI-NEXT: msub w0, w10, w9, w8 +; CHECK-GI-NEXT: sdiv w14, w13, w9 +; CHECK-GI-NEXT: msub w1, w12, w9, w11 +; CHECK-GI-NEXT: msub w2, w14, w9, w13 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i8> %d, <i8 100, i8 100, i8 100> + ret <3 x i8> %s +} + +define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: sv4i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: mov x8, #-56173 // =0xffffffffffff2493 +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +;
CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: smov x9, v0.h[0] +; CHECK-SD-NEXT: smov x10, v0.h[1] +; CHECK-SD-NEXT: smov w11, v0.h[0] +; CHECK-SD-NEXT: smov x12, v0.h[2] +; CHECK-SD-NEXT: smov w13, v0.h[1] +; CHECK-SD-NEXT: smov x14, v0.h[3] +; CHECK-SD-NEXT: smov w16, v0.h[2] +; CHECK-SD-NEXT: smull x9, w9, w8 +; CHECK-SD-NEXT: smull x10, w10, w8 +; CHECK-SD-NEXT: smull x12, w12, w8 +; CHECK-SD-NEXT: lsr x9, x9, #32 +; CHECK-SD-NEXT: smull x8, w14, w8 +; CHECK-SD-NEXT: smov w14, v0.h[3] +; CHECK-SD-NEXT: lsr x10, x10, #32 +; CHECK-SD-NEXT: add w9, w9, w11 +; CHECK-SD-NEXT: lsr x12, x12, #32 +; CHECK-SD-NEXT: asr w15, w9, #2 +; CHECK-SD-NEXT: add w10, w10, w13 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: asr w17, w10, #2 +; CHECK-SD-NEXT: add w12, w12, w16 +; CHECK-SD-NEXT: add w9, w15, w9, lsr #31 +; CHECK-SD-NEXT: asr w15, w12, #2 +; CHECK-SD-NEXT: add w8, w8, w14 +; CHECK-SD-NEXT: add w10, w17, w10, lsr #31 +; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 +; CHECK-SD-NEXT: sub w10, w10, w10, lsl #3 +; CHECK-SD-NEXT: add w9, w11, w9 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: add w10, w13, w10 +; CHECK-SD-NEXT: add w9, w15, w12, lsr #31 +; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: asr w10, w8, #2 +; CHECK-SD-NEXT: add w9, w16, w9 +; CHECK-SD-NEXT: add w8, w10, w8, lsr #31 +; CHECK-SD-NEXT: mov v0.h[2], w9 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w8, w14, w8 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: movi v3.4h, #7 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: 
fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w9, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i8> %d, + ret <4 x i8> %s +} + +define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: sv4i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w14, #100 // =0x64 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: sshr v1.4h, v0.4h, #8 +; CHECK-SD-NEXT: smov x9, v1.h[0] +; CHECK-SD-NEXT: smov x10, v1.h[1] +; CHECK-SD-NEXT: smov x11, v1.h[2] +; CHECK-SD-NEXT: smov w12, v1.h[0] +; CHECK-SD-NEXT: smov x13, v1.h[3] +; CHECK-SD-NEXT: smov w15, v1.h[1] +; CHECK-SD-NEXT: smull x9, w9, w8 +; CHECK-SD-NEXT: smull x10, w10, w8 +; CHECK-SD-NEXT: smull x11, w11, w8 +; CHECK-SD-NEXT: asr x9, x9, #37 +; CHECK-SD-NEXT: smull x8, w13, w8 +; CHECK-SD-NEXT: asr x10, x10, #37 +; CHECK-SD-NEXT: add w9, w9, w9, lsr #31 +; CHECK-SD-NEXT: asr x11, x11, #37 +; CHECK-SD-NEXT: add w10, w10, w10, lsr #31 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: msub w9, w9, w14, w12 +; CHECK-SD-NEXT: msub w10, w10, w14, w15 +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: add w9, w11, w11, lsr #31 +; CHECK-SD-NEXT: smov w11, v1.h[2] +; CHECK-SD-NEXT: msub w9, w9, w14, w11 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: smov w10, v1.h[3] +; CHECK-SD-NEXT: msub w8, w8, w14, w10 +; CHECK-SD-NEXT: mov v0.h[2], w9 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 
killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: movi v3.4h, #100 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w9, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i8> %d, + ret <4 x i8> %s +} + +define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) { +; CHECK-SD-LABEL: sv8i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.8b, #147 +; CHECK-SD-NEXT: movi v2.8b, #7 +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-SD-NEXT: add v1.8b, v1.8b, v0.8b +; CHECK-SD-NEXT: sshr v1.8b, v1.8b, #2 +; CHECK-SD-NEXT: usra v1.8b, v1.8b, #7 +; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: movi v4.8b, #7 +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w14, v0.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w15, v0.s[2] 
+; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: mov w16, v0.s[3] +; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v2.s[2], w11 +; CHECK-GI-NEXT: sdiv w12, w12, w8 +; CHECK-GI-NEXT: mov v3.s[2], w15 +; CHECK-GI-NEXT: sdiv w8, w16, w8 +; CHECK-GI-NEXT: mov v2.s[3], w12 +; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i8> %d, + ret <8 x i8> %s +} + +define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { +; CHECK-SD-LABEL: sv8i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.8b, #41 +; CHECK-SD-NEXT: movi v2.8b, #100 +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-SD-NEXT: sshr v1.8b, v1.8b, #4 +; CHECK-SD-NEXT: usra v1.8b, v1.8b, #7 +; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: movi v4.8b, #100 +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w14, v0.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w15, v0.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: mov w16, v0.s[3] +; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 +; CHECK-GI-NEXT: 
sdiv w9, w9, w8 +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v2.s[2], w11 +; CHECK-GI-NEXT: sdiv w12, w12, w8 +; CHECK-GI-NEXT: mov v3.s[2], w15 +; CHECK-GI-NEXT: sdiv w8, w16, w8 +; CHECK-GI-NEXT: mov v2.s[3], w12 +; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i8> %d, + ret <8 x i8> %s +} + +define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) { +; CHECK-SD-LABEL: sv16i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.16b, #147 +; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: movi v2.16b, #7 +; CHECK-SD-NEXT: add v1.16b, v1.16b, v0.16b +; CHECK-SD-NEXT: sshr v1.16b, v1.16b, #2 +; CHECK-SD-NEXT: usra v1.16b, v1.16b, #7 +; CHECK-SD-NEXT: mls v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv16i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: movi v16.8b, #7 +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s2 +; CHECK-GI-NEXT: fmov w17, s0 +; CHECK-GI-NEXT: fmov w2, s3 +; CHECK-GI-NEXT: mov w14, v2.s[1] +; CHECK-GI-NEXT: mov w18, v0.s[1] +; CHECK-GI-NEXT: mov w3, v3.s[1] +; 
CHECK-GI-NEXT: mov w15, v2.s[2] +; CHECK-GI-NEXT: mov w0, v0.s[2] +; CHECK-GI-NEXT: sdiv w11, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov w4, v3.s[2] +; CHECK-GI-NEXT: mov w16, v2.s[3] +; CHECK-GI-NEXT: mov w1, v0.s[3] +; CHECK-GI-NEXT: mov w5, v3.s[3] +; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: sdiv w17, w17, w8 +; CHECK-GI-NEXT: fmov s5, w13 +; CHECK-GI-NEXT: sdiv w2, w2, w8 +; CHECK-GI-NEXT: fmov s6, w17 +; CHECK-GI-NEXT: sdiv w12, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[2] +; CHECK-GI-NEXT: fmov s7, w2 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v4.s[1], w12 +; CHECK-GI-NEXT: sdiv w18, w18, w8 +; CHECK-GI-NEXT: mov v5.s[1], w14 +; CHECK-GI-NEXT: sdiv w3, w3, w8 +; CHECK-GI-NEXT: mov v6.s[1], w18 +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[3] +; CHECK-GI-NEXT: mov v7.s[1], w3 +; CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: sdiv w0, w0, w8 +; CHECK-GI-NEXT: mov v5.s[2], w15 +; CHECK-GI-NEXT: sdiv w4, w4, w8 +; CHECK-GI-NEXT: mov v6.s[2], w0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: mov v7.s[2], w4 +; CHECK-GI-NEXT: sdiv w16, w16, w8 +; CHECK-GI-NEXT: mov v4.s[3], w9 +; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s +; CHECK-GI-NEXT: sdiv w1, w1, w8 +; CHECK-GI-NEXT: mov v5.s[3], w16 +; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s +; CHECK-GI-NEXT: sdiv w8, w5, w8 +; CHECK-GI-NEXT: mov v6.s[3], w1 +; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: mov v7.s[3], w8 +; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %s = srem <16 x i8> %d, + ret <16 x i8> %s +} + +define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) { +; CHECK-SD-LABEL: sv16i8_100: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: movi v1.16b, #41 +; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-SD-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: movi v2.16b, #100 +; CHECK-SD-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-SD-NEXT: usra v1.16b, v1.16b, #7 +; CHECK-SD-NEXT: mls v0.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv16i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: movi v16.8b, #100 +; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 +; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s2 +; CHECK-GI-NEXT: fmov w17, s0 +; CHECK-GI-NEXT: fmov w2, s3 +; CHECK-GI-NEXT: mov w14, v2.s[1] +; CHECK-GI-NEXT: mov w18, v0.s[1] +; CHECK-GI-NEXT: mov w3, v3.s[1] +; CHECK-GI-NEXT: mov w15, v2.s[2] +; CHECK-GI-NEXT: mov w0, v0.s[2] +; CHECK-GI-NEXT: sdiv w11, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov w4, v3.s[2] +; CHECK-GI-NEXT: mov w16, v2.s[3] +; CHECK-GI-NEXT: mov w1, v0.s[3] +; CHECK-GI-NEXT: mov w5, v3.s[3] +; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 +; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s4, w11 +; CHECK-GI-NEXT: sdiv w17, w17, w8 +; CHECK-GI-NEXT: fmov s5, w13 +; CHECK-GI-NEXT: sdiv w2, w2, w8 +; CHECK-GI-NEXT: fmov s6, w17 +; CHECK-GI-NEXT: sdiv w12, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[2] +; CHECK-GI-NEXT: fmov s7, w2 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v4.s[1], w12 +; CHECK-GI-NEXT: sdiv w18, w18, w8 +; CHECK-GI-NEXT: mov v5.s[1], w14 +; CHECK-GI-NEXT: sdiv w3, w3, w8 +; CHECK-GI-NEXT: mov v6.s[1], w18 +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: mov w9, v1.s[3] +; CHECK-GI-NEXT: mov v7.s[1], w3 +; 
CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v4.s[2], w10 +; CHECK-GI-NEXT: sdiv w0, w0, w8 +; CHECK-GI-NEXT: mov v5.s[2], w15 +; CHECK-GI-NEXT: sdiv w4, w4, w8 +; CHECK-GI-NEXT: mov v6.s[2], w0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: mov v7.s[2], w4 +; CHECK-GI-NEXT: sdiv w16, w16, w8 +; CHECK-GI-NEXT: mov v4.s[3], w9 +; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s +; CHECK-GI-NEXT: sdiv w1, w1, w8 +; CHECK-GI-NEXT: mov v5.s[3], w16 +; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s +; CHECK-GI-NEXT: sdiv w8, w5, w8 +; CHECK-GI-NEXT: mov v6.s[3], w1 +; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: mov v7.s[3], w8 +; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h +; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %s = srem <16 x i8> %d, + ret <16 x i8> %s +} + +define <2 x i8> @uv2i8_7(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: uv2i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: movi v2.2s, #7 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi d1, #0x0000ff000000ff +; CHECK-GI-NEXT: movi v2.2s, #37 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: and v1.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; 
CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sub v2.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov w9, v2.s[1] +; CHECK-GI-NEXT: mov v2.b[1], w9 +; CHECK-GI-NEXT: ushl v2.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: umov w8, v2.b[0] +; CHECK-GI-NEXT: umov w9, v2.b[1] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: add v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: ushl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: movi v2.2s, #7 +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w9, v1.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i8> %d, + ret <2 x i8> %s +} + +define <2 x i8> @uv2i8_100(<2 x i8> %d, <2 x i8> %e) { +; CHECK-SD-LABEL: uv2i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff +; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 +; CHECK-SD-NEXT: movk w8, #655, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi d1, #0x0000ff000000ff +; CHECK-GI-NEXT: movi v2.2s, #41 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: and v1.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; 
CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: ushl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: movi v2.2s, #100 +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w9, v1.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i8> %d, + ret <2 x i8> %s +} + +define <3 x i8> @uv3i8_7(<3 x i8> %d, <3 x i8> %e) { +; CHECK-SD-LABEL: uv3i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: and w10, w1, #0xff +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: and w12, w2, #0xff +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x13, w10, w8 +; CHECK-SD-NEXT: umull x8, w12, w8 +; CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: lsr x13, x13, #32 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: sub w11, w11, w11, lsl #3 +; CHECK-SD-NEXT: sub w13, w13, w13, lsl #3 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: add w0, w9, w11 +; CHECK-SD-NEXT: add w1, w10, w13 +; CHECK-SD-NEXT: add w2, w12, w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: mov w10, #37 // =0x25 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v2.h[1], w10 +; CHECK-GI-NEXT: and w9, w2, #0xff +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[1], w1 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: mov v2.h[2], w10 +; CHECK-GI-NEXT: mov v3.h[2], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov v0.h[2], w2 +; CHECK-GI-NEXT: mul v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: ushl 
v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: sub v2.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: uzp1 v2.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; CHECK-GI-NEXT: ushl v2.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: mov b3, v2.b[1] +; CHECK-GI-NEXT: mov b4, v2.b[2] +; CHECK-GI-NEXT: fmov w8, s3 +; CHECK-GI-NEXT: fmov w9, s4 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: ushl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: mov b2, v1.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[2] +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: umov w0, v0.h[0] +; CHECK-GI-NEXT: umov w1, v0.h[1] +; CHECK-GI-NEXT: umov w2, v0.h[2] +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i8> %d, + ret <3 x i8> %s +} + +define <3 x i8> @uv3i8_100(<3 x i8> %d, <3 x i8> %e) { +; CHECK-SD-LABEL: uv3i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 +; CHECK-SD-NEXT: and w9, w0, #0xff +; CHECK-SD-NEXT: and w10, w1, #0xff +; CHECK-SD-NEXT: movk w8, #655, lsl #16 +; CHECK-SD-NEXT: and w12, w2, #0xff +; CHECK-SD-NEXT: mov w14, #100 // =0x64 +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x13, w10, w8 +; CHECK-SD-NEXT: umull x8, w12, w8 +; CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: lsr x13, x13, #32 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: msub w0, w11, w14, w9 +; CHECK-SD-NEXT: msub w1, w13, w14, w10 +; CHECK-SD-NEXT: msub w2, w8, w14, w12 +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: uv3i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: mov w10, #41 // =0x29 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: fmov s1, w10 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v1.h[1], w10 +; CHECK-GI-NEXT: and w9, w2, #0xff +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: mov v1.h[2], w10 +; CHECK-GI-NEXT: mov v2.h[2], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: neg v1.4h, v2.4h +; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; CHECK-GI-NEXT: neg v1.8b, v3.8b +; CHECK-GI-NEXT: fmov s3, w0 +; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: mov v3.h[1], w1 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: mov v3.h[2], w2 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: mov v0.h[2], w9 +; CHECK-GI-NEXT: mov v1.h[2], w8 +; CHECK-GI-NEXT: mls v3.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: umov w0, v3.h[0] +; CHECK-GI-NEXT: umov w1, v3.h[1] +; CHECK-GI-NEXT: umov w2, v3.h[2] +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i8> %d, + ret <3 x i8> %s +} + +define <4 x i8> @uv4i8_7(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: uv4i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: umov w13, v0.h[2] +; CHECK-SD-NEXT: 
umov w15, v0.h[3] +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x12, w10, w8 +; CHECK-SD-NEXT: umull x14, w13, w8 +; CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: umull x8, w15, w8 +; CHECK-SD-NEXT: lsr x12, x12, #32 +; CHECK-SD-NEXT: sub w11, w11, w11, lsl #3 +; CHECK-SD-NEXT: sub w12, w12, w12, lsl #3 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: add w9, w9, w11 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: add w10, w10, w12 +; CHECK-SD-NEXT: lsr x9, x14, #32 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: add w8, w15, w8 +; CHECK-SD-NEXT: add w9, w13, w9 +; CHECK-SD-NEXT: mov v0.h[2], w9 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #37 // =0x25 +; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: ushr v2.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: sub v2.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; CHECK-GI-NEXT: mov v4.b[1], w8 +; CHECK-GI-NEXT: uzp1 v2.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: mov v4.b[2], w8 +; CHECK-GI-NEXT: ushl v2.8b, v2.8b, v3.8b +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: mov v4.b[3], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: usra v2.4h, v1.4h, #8 +; CHECK-GI-NEXT: uzp1 v1.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v1.8b, 
v1.8b, v2.8b +; CHECK-GI-NEXT: dup v2.4h, w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i8> %d, + ret <4 x i8> %s +} + +define <4 x i8> @uv4i8_100(<4 x i8> %d, <4 x i8> %e) { +; CHECK-SD-LABEL: uv4i8_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 +; CHECK-SD-NEXT: mov w14, #100 // =0x64 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: movk w8, #655, lsl #16 +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: umov w12, v0.h[2] +; CHECK-SD-NEXT: umov w15, v0.h[3] +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x13, w10, w8 +; CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: lsr x13, x13, #32 +; CHECK-SD-NEXT: msub w9, w11, w14, w9 +; CHECK-SD-NEXT: umull x11, w12, w8 +; CHECK-SD-NEXT: msub w10, w13, w14, w10 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: umull x8, w15, w8 +; CHECK-SD-NEXT: lsr x9, x11, #32 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: msub w9, w9, w14, w12 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: msub w8, w8, w14, w15 +; CHECK-SD-NEXT: mov v0.h[2], w9 +; CHECK-SD-NEXT: mov v0.h[3], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i8_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #41 // =0x29 +; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 +; CHECK-GI-NEXT: mov w8, #100 
// =0x64 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: ushl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: dup v2.4h, w8 +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i8> %d, + ret <4 x i8> %s +} + +define <8 x i8> @uv8i8_7(<8 x i8> %d, <8 x i8> %e) { +; CHECK-SD-LABEL: uv8i8_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi v1.8b, #37 +; CHECK-SD-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-SD-NEXT: sub v2.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-SD-NEXT: shrn v2.8b, v2.8h, #1 +; CHECK-SD-NEXT: add v1.8b, v2.8b, v1.8b +; CHECK-SD-NEXT: movi v2.8b, #7 +; CHECK-SD-NEXT: ushr v1.8b, v1.8b, #2 +; CHECK-SD-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i8_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: movi v1.8b, #37 +; CHECK-GI-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: sub v2.8b, v0.8b, v1.8b +; CHECK-GI-NEXT: usra v1.8b, v2.8b, #1 +; CHECK-GI-NEXT: movi v2.8b, #7 +; CHECK-GI-NEXT: ushr v1.8b, v1.8b, #2 +; CHECK-GI-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ret +entry: + %s = urem <8 x i8> %d, + ret <8 x i8> %s +} + +define <8 x i8> @uv8i8_100(<8 x i8> %d, <8 x i8> %e) { +; CHECK-LABEL: uv8i8_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.8b, #41 +; CHECK-NEXT: movi v2.8b, #100 +; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-NEXT: ushr v1.8b, v1.8b, #4 +; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret +entry: + %s = urem <8 x i8> %d, + ret <8 x i8> %s +} + +define <16 x i8> @uv16i8_7(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: uv16i8_7: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.16b, #37 +; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; 
CHECK-NEXT: sub v2.16b, v0.16b, v1.16b +; CHECK-NEXT: usra v1.16b, v2.16b, #1 +; CHECK-NEXT: movi v2.16b, #7 +; CHECK-NEXT: ushr v1.16b, v1.16b, #2 +; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %s = urem <16 x i8> %d, + ret <16 x i8> %s +} + +define <16 x i8> @uv16i8_100(<16 x i8> %d, <16 x i8> %e) { +; CHECK-LABEL: uv16i8_100: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.16b, #41 +; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b +; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b +; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v2.16b, #100 +; CHECK-NEXT: ushr v1.16b, v1.16b, #4 +; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret +entry: + %s = urem <16 x i8> %d, + ret <16 x i8> %s +} + +define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: sv2i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v1.2s, v0.2s, #16 +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: movi v3.2s, #7 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: sshr v0.2s, v1.2s, #16 +; CHECK-SD-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #32 +; CHECK-SD-NEXT: ssra v2.2s, v1.2s, #16 +; CHECK-SD-NEXT: sshr v1.2s, v2.2s, #2 +; CHECK-SD-NEXT: usra v1.2s, v2.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i16> 
%d, + ret <2 x i16> %s +} + +define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: sv2i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i16> %d, + ret <2 x i16> %s +} + +define <3 x i16> @sv3i16_7(<3 x i16> %d, <3 x i16> %e) { +; CHECK-SD-LABEL: sv3i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: smov x9, v0.h[0] +; CHECK-SD-NEXT: mov x8, #-56173 // =0xffffffffffff2493 +; CHECK-SD-NEXT: smov x10, v0.h[1] +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: smov w12, v0.h[0] +; CHECK-SD-NEXT: smov x11, v0.h[2] +; CHECK-SD-NEXT: smov w13, v0.h[1] +; CHECK-SD-NEXT: smull x9, w9, w8 +; CHECK-SD-NEXT: smull x10, w10, w8 +; CHECK-SD-NEXT: smull x8, w11, w8 +; CHECK-SD-NEXT: smov w11, v0.h[2] +; CHECK-SD-NEXT: lsr x9, x9, #32 +; CHECK-SD-NEXT: lsr x10, x10, #32 +; CHECK-SD-NEXT: add w9, w9, w12 +; 
CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: asr w14, w9, #2 +; CHECK-SD-NEXT: add w10, w10, w13 +; CHECK-SD-NEXT: asr w15, w10, #2 +; CHECK-SD-NEXT: add w8, w8, w11 +; CHECK-SD-NEXT: add w9, w14, w9, lsr #31 +; CHECK-SD-NEXT: asr w14, w8, #2 +; CHECK-SD-NEXT: add w10, w15, w10, lsr #31 +; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 +; CHECK-SD-NEXT: add w8, w14, w8, lsr #31 +; CHECK-SD-NEXT: sub w10, w10, w10, lsl #3 +; CHECK-SD-NEXT: add w9, w12, w9 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: add w10, w13, w10 +; CHECK-SD-NEXT: add w8, w11, w8 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: smov w9, v0.h[0] +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: smov w11, v0.h[1] +; CHECK-GI-NEXT: smov w13, v0.h[2] +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: sdiv w12, w11, w8 +; CHECK-GI-NEXT: lsl w14, w10, #3 +; CHECK-GI-NEXT: sub w10, w14, w10 +; CHECK-GI-NEXT: sub w9, w9, w10 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w8, w13, w8 +; CHECK-GI-NEXT: lsl w15, w12, #3 +; CHECK-GI-NEXT: sub w10, w15, w12 +; CHECK-GI-NEXT: sub w10, w11, w10 +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w8, w13, w8 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i16> %d, + ret <3 x i16> %s +} + +define <3 x i16> @sv3i16_100(<3 x i16> %d, <3 x i16> %e) { +; CHECK-SD-LABEL: sv3i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: smov x9, v0.h[0] +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: smov x10, v0.h[1] +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; 
CHECK-SD-NEXT: smov x11, v0.h[2] +; CHECK-SD-NEXT: mov w12, #100 // =0x64 +; CHECK-SD-NEXT: smov w13, v0.h[1] +; CHECK-SD-NEXT: smull x9, w9, w8 +; CHECK-SD-NEXT: smull x10, w10, w8 +; CHECK-SD-NEXT: smull x8, w11, w8 +; CHECK-SD-NEXT: smov w11, v0.h[0] +; CHECK-SD-NEXT: asr x9, x9, #37 +; CHECK-SD-NEXT: asr x10, x10, #37 +; CHECK-SD-NEXT: add w9, w9, w9, lsr #31 +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: add w10, w10, w10, lsr #31 +; CHECK-SD-NEXT: msub w9, w9, w12, w11 +; CHECK-SD-NEXT: smov w11, v0.h[2] +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: msub w10, w10, w12, w13 +; CHECK-SD-NEXT: msub w8, w8, w12, w11 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: smov w9, v0.h[0] +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: smov w11, v0.h[1] +; CHECK-GI-NEXT: smov w13, v0.h[2] +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: sdiv w12, w11, w8 +; CHECK-GI-NEXT: msub w9, w10, w8, w9 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w14, w13, w8 +; CHECK-GI-NEXT: msub w10, w12, w8, w11 +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: msub w8, w14, w8, w13 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i16> %d, + ret <3 x i16> %s +} + +define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: sv4i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movi v2.4h, #7 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #17 +; CHECK-SD-NEXT: xtn v1.4h, v1.4s +; CHECK-SD-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: movi v2.4h, #7 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w8, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i16> %d, + ret <4 x i16> %s +} + +define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: sv4i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #5243 // =0x147b +; CHECK-SD-NEXT: movi v2.4h, #100 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #19 +; CHECK-SD-NEXT: xtn v1.4h, v1.4s +; CHECK-SD-NEXT: usra v1.4h, v1.4h, #15 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: movi v2.4h, #100 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w8, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret 
+entry: + %s = srem <4 x i16> %d, + ret <4 x i16> %s +} + +define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: sv8i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: dup v1.8h, w8 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: movi v2.8h, #7 +; CHECK-SD-NEXT: sshr v1.8h, v1.8h, #1 +; CHECK-SD-NEXT: usra v1.8h, v1.8h, #15 +; CHECK-SD-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: movi v4.4h, #7 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w14, v0.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w15, v0.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: mov w16, v0.s[3] +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v2.s[2], w11 +; CHECK-GI-NEXT: sdiv w12, w12, w8 +; CHECK-GI-NEXT: mov v3.s[2], w15 +; CHECK-GI-NEXT: sdiv w8, w16, w8 +; CHECK-GI-NEXT: mov v2.s[3], w12 +; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i16> %d, + ret <8 x i16> %s +} + +define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: sv8i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: 
mov w8, #5243 // =0x147b +; CHECK-SD-NEXT: dup v1.8h, w8 +; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: movi v2.8h, #100 +; CHECK-SD-NEXT: sshr v1.8h, v1.8h, #3 +; CHECK-SD-NEXT: usra v1.8h, v1.8h, #15 +; CHECK-SD-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv8i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: movi v4.4h, #100 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: mov w10, v1.s[1] +; CHECK-GI-NEXT: mov w14, v0.s[1] +; CHECK-GI-NEXT: mov w11, v1.s[2] +; CHECK-GI-NEXT: mov w15, v0.s[2] +; CHECK-GI-NEXT: mov w12, v1.s[3] +; CHECK-GI-NEXT: mov w16, v0.s[3] +; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w13, w13, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s3, w13 +; CHECK-GI-NEXT: sdiv w14, w14, w8 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v3.s[1], w14 +; CHECK-GI-NEXT: sdiv w15, w15, w8 +; CHECK-GI-NEXT: mov v2.s[2], w11 +; CHECK-GI-NEXT: sdiv w12, w12, w8 +; CHECK-GI-NEXT: mov v3.s[2], w15 +; CHECK-GI-NEXT: sdiv w8, w16, w8 +; CHECK-GI-NEXT: mov v2.s[3], w12 +; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: ret +entry: + %s = srem <8 x i16> %d, + ret <8 x i16> %s +} + +define <2 x i16> @uv2i16_7(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: uv2i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; 
CHECK-SD-NEXT: umull v1.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: movi v2.2s, #7 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: and v2.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: neg v3.4h, v3.4h +; CHECK-GI-NEXT: ushr v2.2s, v1.2s, #16 +; CHECK-GI-NEXT: sub v2.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: uzp1 v2.4h, v2.4h, v0.4h +; CHECK-GI-NEXT: ushl v2.4h, v2.4h, v3.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: usra v2.2s, v1.2s, #16 +; CHECK-GI-NEXT: uzp1 v1.4h, v2.4h, v0.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: dup v2.2s, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i16> %d, + ret <2 x i16> %s +} + +define <2 x i16> @uv2i16_100(<2 x i16> %d, <2 x i16> %e) { +; CHECK-SD-LABEL: uv2i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 +; CHECK-SD-NEXT: movk w8, #655, lsl #16 +; CHECK-SD-NEXT: dup v2.2s, w8 +; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v2.2s +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: 
uzp1 v2.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov w8, #5243 // =0x147b +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v1.4h, v1.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: ushl v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: ushll v2.4s, v3.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: dup v2.2s, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i16> %d, + ret <2 x i16> %s +} + +define <3 x i16> @uv3i16_7(<3 x i16> %d, <3 x i16> %e) { +; CHECK-SD-LABEL: uv3i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: umov w12, v0.h[2] +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x13, w10, w8 +; CHECK-SD-NEXT: umull x8, w12, w8 +; CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: lsr x13, x13, #32 +; CHECK-SD-NEXT: sub w11, w11, w11, lsl #3 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: sub w13, w13, w13, lsl #3 +; CHECK-SD-NEXT: add w9, w9, w11 +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: add w10, w10, w13 +; CHECK-SD-NEXT: add w8, w12, w8 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 
def $q0 +; CHECK-GI-NEXT: umov w9, v0.h[0] +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: umov w10, v0.h[1] +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: umov w11, v0.h[2] +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov w9, #16 // =0x10 +; CHECK-GI-NEXT: mov v2.s[1], w8 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v3.s[1], w9 +; CHECK-GI-NEXT: mov v2.s[2], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v3.s[2], w9 +; CHECK-GI-NEXT: mov w9, #2 // =0x2 +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: neg v2.4s, v3.4s +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: mov v3.h[2], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sub v4.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: neg v3.4h, v3.4h +; CHECK-GI-NEXT: ushl v3.4h, v4.4h, v3.4h +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v4.h[1], w8 +; CHECK-GI-NEXT: add v1.4h, v3.4h, v1.4h +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mov v4.h[2], w8 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v4.4h +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i16> %d, + ret <3 x i16> %s +} + +define <3 x i16> @uv3i16_100(<3 x i16> %d, <3 x i16> %e) { +; CHECK-SD-LABEL: uv3i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: umov w9, v0.h[0] +; CHECK-SD-NEXT: mov w8, #23593 // =0x5c29 +; CHECK-SD-NEXT: umov w10, v0.h[1] +; CHECK-SD-NEXT: movk w8, #655, lsl #16 +; CHECK-SD-NEXT: umov w12, v0.h[2] +; CHECK-SD-NEXT: mov w14, #100 // =0x64 +; CHECK-SD-NEXT: umull x11, w9, w8 +; CHECK-SD-NEXT: umull x13, w10, w8 +; CHECK-SD-NEXT: umull x8, w12, w8 +; 
CHECK-SD-NEXT: lsr x11, x11, #32 +; CHECK-SD-NEXT: msub w9, w11, w14, w9 +; CHECK-SD-NEXT: lsr x11, x13, #32 +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: msub w10, w11, w14, w10 +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: msub w8, w8, w14, w12 +; CHECK-SD-NEXT: mov v0.h[1], w10 +; CHECK-SD-NEXT: mov v0.h[2], w8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov w11, #5243 // =0x147b +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s2, w11 +; CHECK-GI-NEXT: mov v1.h[1], w8 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: mov v1.h[2], w8 +; CHECK-GI-NEXT: mov v2.s[2], w11 +; CHECK-GI-NEXT: neg v1.4h, v1.4h +; CHECK-GI-NEXT: ushl v1.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: umov w8, v1.h[0] +; CHECK-GI-NEXT: umov w9, v1.h[1] +; CHECK-GI-NEXT: umov w10, v1.h[2] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov w8, #16 // =0x10 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[2], w10 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: mov v4.h[1], w8 +; CHECK-GI-NEXT: neg v2.4s, v3.4s +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: mov v4.h[2], w8 +; CHECK-GI-NEXT: mov v2.h[1], w9 +; CHECK-GI-NEXT: xtn v1.4h, v1.4s +; CHECK-GI-NEXT: neg v3.4h, v4.4h +; CHECK-GI-NEXT: mov v2.h[2], w9 +; CHECK-GI-NEXT: ushl v1.4h, v1.4h, v3.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i16> %d, + ret <3 x i16> %s +} + +define <4 x i16> @uv4i16_7(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: uv4i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: dup v1.4h, w8 +; 
CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-SD-NEXT: sub v2.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-SD-NEXT: shrn v2.4h, v2.4s, #1 +; CHECK-SD-NEXT: add v1.4h, v2.4h, v1.4h +; CHECK-SD-NEXT: movi v2.4h, #7 +; CHECK-SD-NEXT: ushr v1.4h, v1.4h, #2 +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI52_0 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI52_0] +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sub v2.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: usra v1.4h, v2.4h, #1 +; CHECK-GI-NEXT: movi v2.4h, #7 +; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #2 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i16> %d, + ret <4 x i16> %s +} + +define <4 x i16> @uv4i16_100(<4 x i16> %d, <4 x i16> %e) { +; CHECK-SD-LABEL: uv4i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #5243 // =0x147b +; CHECK-SD-NEXT: ushr v2.4h, v0.4h, #2 +; CHECK-SD-NEXT: dup v1.4h, w8 +; CHECK-SD-NEXT: umull v1.4s, v2.4h, v1.4h +; CHECK-SD-NEXT: movi v2.4h, #100 +; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #17 +; CHECK-SD-NEXT: xtn v1.4h, v1.4s +; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI53_0 +; CHECK-GI-NEXT: ushr v1.4h, v0.4h, #2 +; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI53_0] +; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: movi v2.4h, #100 +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: ushr v1.4h, v1.4h, #1 +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i16> %d, + ret <4 x i16> %s +} + +define <8 x i16> @uv8i16_7(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: uv8i16_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; 
CHECK-SD-NEXT: dup v1.8h, w8 +; CHECK-SD-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: sub v2.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: usra v1.8h, v2.8h, #1 +; CHECK-SD-NEXT: movi v2.8h, #7 +; CHECK-SD-NEXT: ushr v1.8h, v1.8h, #2 +; CHECK-SD-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i16_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI54_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI54_0] +; CHECK-GI-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sub v2.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: usra v1.8h, v2.8h, #1 +; CHECK-GI-NEXT: movi v2.8h, #7 +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #2 +; CHECK-GI-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ret +entry: + %s = urem <8 x i16> %d, + ret <8 x i16> %s +} + +define <8 x i16> @uv8i16_100(<8 x i16> %d, <8 x i16> %e) { +; CHECK-SD-LABEL: uv8i16_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #5243 // =0x147b +; CHECK-SD-NEXT: ushr v2.8h, v0.8h, #2 +; CHECK-SD-NEXT: dup v1.8h, w8 +; CHECK-SD-NEXT: umull2 v3.4s, v2.8h, v1.8h +; CHECK-SD-NEXT: umull v1.4s, v2.4h, v1.4h +; CHECK-SD-NEXT: movi v2.8h, #100 +; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: ushr v1.8h, v1.8h, #1 +; CHECK-SD-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv8i16_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI55_0 +; CHECK-GI-NEXT: ushr v1.8h, v0.8h, #2 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI55_0] +; CHECK-GI-NEXT: umull2 v3.4s, v1.8h, v2.8h +; CHECK-GI-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: movi v2.8h, #100 +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ushr v1.8h, v1.8h, #1 +; CHECK-GI-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: ret +entry: + %s = urem <8 x i16> %d, + ret <8 x i16> %s +} + +define <2 x i32> 
@sv2i32_7(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: sv2i32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: movi v3.2s, #7 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-SD-NEXT: sshr v2.2s, v1.2s, #2 +; CHECK-SD-NEXT: usra v2.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v2.2s, v3.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: movi v2.2s, #7 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i32> %d, + ret <2 x i32> %s +} + +define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: sv2i32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: movi v2.2s, #100 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w8, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; 
CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i32> %d, + ret <2 x i32> %s +} + +define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) { +; CHECK-SD-LABEL: sv3i32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: mov w9, v0.s[2] +; CHECK-SD-NEXT: movi v3.2s, #7 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: smull x8, w9, w8 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: add w8, w8, w9 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: asr w10, w8, #2 +; CHECK-SD-NEXT: add w8, w10, w8, lsr #31 +; CHECK-SD-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: sshr v2.2s, v1.2s, #2 +; CHECK-SD-NEXT: add w8, w9, w8 +; CHECK-SD-NEXT: usra v2.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v2.2s, v3.2s +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: sdiv w12, w11, w8 +; CHECK-GI-NEXT: lsl w14, w10, #3 +; CHECK-GI-NEXT: sub w10, w14, w10 +; CHECK-GI-NEXT: sub w9, w9, w10 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w8, w13, w8 +; CHECK-GI-NEXT: lsl w15, w12, #3 +; CHECK-GI-NEXT: sub w10, w15, w12 +; CHECK-GI-NEXT: sub w10, w11, w10 +; CHECK-GI-NEXT: mov v0.s[1], w10 +; CHECK-GI-NEXT: lsl w9, w8, #3 +; CHECK-GI-NEXT: sub w8, w9, w8 +; CHECK-GI-NEXT: sub w8, w13, w8 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i32> %d, + ret <3 x i32> %s +} + +define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) { +; CHECK-SD-LABEL: sv3i32_100: +; 
CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w9, v0.s[2] +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: mov w10, #100 // =0x64 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: smull x8, w9, w8 +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: asr x8, x8, #37 +; CHECK-SD-NEXT: add w8, w8, w8, lsr #31 +; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: msub w8, w8, w10, w9 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv3i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov s0, v0.s[2] +; CHECK-GI-NEXT: sdiv w10, w9, w8 +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: fmov w13, s0 +; CHECK-GI-NEXT: sdiv w12, w11, w8 +; CHECK-GI-NEXT: msub w9, w10, w8, w9 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: sdiv w14, w13, w8 +; CHECK-GI-NEXT: msub w10, w12, w8, w11 +; CHECK-GI-NEXT: mov v0.s[1], w10 +; CHECK-GI-NEXT: msub w8, w14, w8, w13 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: ret +entry: + %s = srem <3 x i32> %d, + ret <3 x i32> %s +} + +define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: sv4i32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 +; CHECK-SD-NEXT: movi v3.4s, #7 +; CHECK-SD-NEXT: movk w8, #37449, lsl #16 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #2 +; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i32_7: +; CHECK-GI: // %bb.0: // %entry +; 
CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: movi v2.4s, #7 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w8, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i32> %d, + ret <4 x i32> %s +} + +define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: sv4i32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movi v3.4s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv4i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w11, v0.s[2] +; CHECK-GI-NEXT: mov w12, v0.s[3] +; CHECK-GI-NEXT: movi v2.4s, #100 +; CHECK-GI-NEXT: sdiv w9, w9, w8 +; CHECK-GI-NEXT: sdiv w10, w10, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: sdiv w11, w11, w8 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: sdiv w8, w12, w8 +; CHECK-GI-NEXT: mov v1.s[2], w11 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret +entry: + %s = srem <4 x i32> %d, + ret <4 x i32> %s +} + +define <2 x i32> @uv2i32_7(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: uv2i32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; 
CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1 +; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s +; CHECK-SD-NEXT: movi v2.2s, #7 +; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #2 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI62_0 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI62_0] +; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: sub v2.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: usra v1.2s, v2.2s, #1 +; CHECK-GI-NEXT: movi v2.2s, #7 +; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #2 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i32> %d, + ret <2 x i32> %s +} + +define <2 x i32> @uv2i32_100(<2 x i32> %d, <2 x i32> %e) { +; CHECK-SD-LABEL: uv2i32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI63_0 +; CHECK-GI-NEXT: movi v2.2s, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI63_0] +; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: ushr v1.2s, v1.2s, #5 +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i32> %d, + ret <2 x i32> %s +} + +define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) { +; CHECK-SD-LABEL: uv3i32_7: +; CHECK-SD: // %bb.0: // %entry +; 
CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: mov w9, v0.s[2] +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: umull x8, w9, w8 +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: lsr x8, x8, #32 +; CHECK-SD-NEXT: sub w10, w9, w8 +; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-SD-NEXT: add w8, w8, w10, lsr #1 +; CHECK-SD-NEXT: lsr w8, w8, #2 +; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: add w8, w9, w8 +; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1 +; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s +; CHECK-SD-NEXT: movi v2.2s, #7 +; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #2 +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: adrp x8, .LCPI64_0 +; CHECK-GI-NEXT: mov w9, #18725 // =0x4925 +; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI64_0] +; CHECK-GI-NEXT: mov w8, v0.s[2] +; CHECK-GI-NEXT: movk w9, #9362, lsl #16 +; CHECK-GI-NEXT: mov w10, #1 // =0x1 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: umull x8, w8, w9 +; CHECK-GI-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: lsr x8, x8, #32 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #32 +; CHECK-GI-NEXT: mov d2, v1.d[1] +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov w9, #2 // =0x2 +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: mov v1.s[1], w11 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: mov v3.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov v3.s[2], w9 +; CHECK-GI-NEXT: sub v4.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: ushl v2.4s, v4.4s, v2.4s +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: 
mov v4.s[1], w8 +; CHECK-GI-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-GI-NEXT: neg v2.4s, v3.4s +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: mov v4.s[2], w8 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i32> %d, + ret <3 x i32> %s +} + +define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) { +; CHECK-SD-LABEL: uv3i32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: mov w9, v0.s[2] +; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: mov w10, #100 // =0x64 +; CHECK-SD-NEXT: dup v1.2s, w8 +; CHECK-SD-NEXT: umull x8, w9, w8 +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: lsr x8, x8, #37 +; CHECK-SD-NEXT: msub w8, w8, w10, w9 +; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #37 +; CHECK-SD-NEXT: xtn v1.2s, v1.2d +; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s +; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv3i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] +; CHECK-GI-NEXT: adrp x8, .LCPI65_0 +; CHECK-GI-NEXT: mov w9, v0.s[2] +; CHECK-GI-NEXT: ldr d2, [x8, :lo12:.LCPI65_0] +; CHECK-GI-NEXT: mov w8, #5 // =0x5 +; CHECK-GI-NEXT: mov w10, #34079 // =0x851f +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: movk w10, #20971, lsl #16 +; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] +; CHECK-GI-NEXT: umull x9, w9, w10 +; CHECK-GI-NEXT: mov v3.s[1], w8 +; CHECK-GI-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: lsr x8, x9, #32 +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #32 +; CHECK-GI-NEXT: neg v3.4s, v3.4s +; CHECK-GI-NEXT: mov d2, v1.d[1] +; CHECK-GI-NEXT: fmov x11, d1 +; CHECK-GI-NEXT: fmov s1, w11 +; CHECK-GI-NEXT: fmov x10, d2 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: mov v2.s[1], w10 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v2.s[2], w10 +; 
CHECK-GI-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <3 x i32> %d, + ret <3 x i32> %s +} + +define <4 x i32> @uv4i32_7(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: uv4i32_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 +; CHECK-SD-NEXT: movk w8, #9362, lsl #16 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sub v2.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: usra v1.4s, v2.4s, #1 +; CHECK-SD-NEXT: movi v2.4s, #7 +; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i32_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI66_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI66_0] +; CHECK-GI-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sub v2.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: usra v1.4s, v2.4s, #1 +; CHECK-GI-NEXT: movi v2.4s, #7 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i32> %d, + ret <4 x i32> %s +} + +define <4 x i32> @uv4i32_100(<4 x i32> %d, <4 x i32> %e) { +; CHECK-SD-LABEL: uv4i32_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #34079 // =0x851f +; CHECK-SD-NEXT: movk w8, #20971, lsl #16 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: movi v2.4s, #100 +; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv4i32_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI67_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI67_0] +; CHECK-GI-NEXT: 
umull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: movi v2.4s, #100 +; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: ret +entry: + %s = urem <4 x i32> %d, + ret <4 x i32> %s +} + +define <2 x i64> @sv2i64_7(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: sv2i64_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #18725 // =0x4925 +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: movk x8, #9362, lsl #16 +; CHECK-SD-NEXT: movk x8, #37449, lsl #32 +; CHECK-SD-NEXT: movk x8, #18724, lsl #48 +; CHECK-SD-NEXT: smulh x11, x10, x8 +; CHECK-SD-NEXT: smulh x8, x9, x8 +; CHECK-SD-NEXT: asr x12, x11, #1 +; CHECK-SD-NEXT: add x11, x12, x11, lsr #63 +; CHECK-SD-NEXT: asr x13, x8, #1 +; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3 +; CHECK-SD-NEXT: add x8, x13, x8, lsr #63 +; CHECK-SD-NEXT: add x10, x10, x11 +; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: add x8, x9, x8 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i64_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: sdiv x9, x9, x8 +; CHECK-GI-NEXT: sdiv x8, x10, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: adrp x8, .LCPI68_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI68_0] +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x9, v2.d[1] +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i64> %d, + ret <2 x i64> %s +} + +define <2 x i64> @sv2i64_100(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: sv2i64_100: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #55051 // =0xd70b +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: movk x8, #28835, lsl #16 +; CHECK-SD-NEXT: movk x8, #2621, lsl #32 +; CHECK-SD-NEXT: movk x8, #41943, lsl #48 +; CHECK-SD-NEXT: smulh x11, x10, x8 +; CHECK-SD-NEXT: smulh x8, x9, x8 +; CHECK-SD-NEXT: add x11, x11, x10 +; CHECK-SD-NEXT: asr x12, x11, #6 +; CHECK-SD-NEXT: add x8, x8, x9 +; CHECK-SD-NEXT: add x11, x12, x11, lsr #63 +; CHECK-SD-NEXT: asr x13, x8, #6 +; CHECK-SD-NEXT: mov w12, #100 // =0x64 +; CHECK-SD-NEXT: msub x10, x11, x12, x10 +; CHECK-SD-NEXT: add x8, x13, x8, lsr #63 +; CHECK-SD-NEXT: msub x8, x8, x12, x9 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i64_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: sdiv x9, x9, x8 +; CHECK-GI-NEXT: sdiv x8, x10, x8 +; CHECK-GI-NEXT: fmov d1, x9 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: adrp x8, .LCPI69_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI69_0] +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x9, v2.d[1] +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i64> %d, + ret <2 x i64> %s +} + +define <2 x i64> @uv2i64_7(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: uv2i64_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #9363 // =0x2493 +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: movk x8, #37449, lsl #16 +; CHECK-SD-NEXT: movk x8, #18724, lsl #32 +; CHECK-SD-NEXT: movk x8, #9362, lsl #48 +; CHECK-SD-NEXT: umulh x11, x10, x8 +; CHECK-SD-NEXT: umulh x8, x9, x8 +; CHECK-SD-NEXT: 
sub x12, x10, x11 +; CHECK-SD-NEXT: add x11, x11, x12, lsr #1 +; CHECK-SD-NEXT: sub x12, x9, x8 +; CHECK-SD-NEXT: lsr x11, x11, #2 +; CHECK-SD-NEXT: add x8, x8, x12, lsr #1 +; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3 +; CHECK-SD-NEXT: lsr x8, x8, #2 +; CHECK-SD-NEXT: add x10, x10, x11 +; CHECK-SD-NEXT: sub x8, x8, x8, lsl #3 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: add x8, x9, x8 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i64_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x8, #9363 // =0x2493 +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x9, v0.d[1] +; CHECK-GI-NEXT: movk x8, #37449, lsl #16 +; CHECK-GI-NEXT: movk x8, #18724, lsl #32 +; CHECK-GI-NEXT: movk x8, #9362, lsl #48 +; CHECK-GI-NEXT: umulh x10, x10, x8 +; CHECK-GI-NEXT: umulh x8, x9, x8 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: adrp x8, .LCPI70_0 +; CHECK-GI-NEXT: sub v2.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: usra v1.2d, v2.2d, #1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI70_0] +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x9, v2.d[1] +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #2 +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i64> %d, + ret <2 x i64> %s +} + +define <2 x i64> @uv2i64_100(<2 x i64> %d, <2 x i64> %e) { +; CHECK-SD-LABEL: uv2i64_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov x8, #62915 // =0xf5c3 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: movk x8, #23592, lsl #16 +; CHECK-SD-NEXT: movk x8, #49807, lsl #32 +; CHECK-SD-NEXT: lsr x11, x10, #2 +; CHECK-SD-NEXT: movk x8, #10485, lsl #48 +; CHECK-SD-NEXT: lsr x12, x9, #2 +; CHECK-SD-NEXT: umulh x11, x11, x8 +; CHECK-SD-NEXT: umulh x8, x12, x8 +; CHECK-SD-NEXT: 
mov w12, #100 // =0x64 +; CHECK-SD-NEXT: lsr x11, x11, #2 +; CHECK-SD-NEXT: msub x10, x11, x12, x10 +; CHECK-SD-NEXT: lsr x8, x8, #2 +; CHECK-SD-NEXT: msub x8, x8, x12, x9 +; CHECK-SD-NEXT: fmov d0, x10 +; CHECK-SD-NEXT: mov v0.d[1], x8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i64_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushr v1.2d, v0.2d, #2 +; CHECK-GI-NEXT: mov x8, #62915 // =0xf5c3 +; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: movk x8, #49807, lsl #32 +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: movk x8, #10485, lsl #48 +; CHECK-GI-NEXT: mov x9, v1.d[1] +; CHECK-GI-NEXT: umulh x10, x10, x8 +; CHECK-GI-NEXT: umulh x8, x9, x8 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: adrp x8, .LCPI71_0 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI71_0] +; CHECK-GI-NEXT: fmov x11, d2 +; CHECK-GI-NEXT: mov x9, v2.d[1] +; CHECK-GI-NEXT: ushr v1.2d, v1.2d, #2 +; CHECK-GI-NEXT: fmov x10, d1 +; CHECK-GI-NEXT: mov x8, v1.d[1] +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: mul x8, x8, x9 +; CHECK-GI-NEXT: fmov d1, x10 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i64> %d, + ret <2 x i64> %s +} + +define <2 x i128> @sv2i128_7(<2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: sv2i128_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x21 +; CHECK-SD-NEXT: mov x1, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i128_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov w2, #7 // =0x7 +; CHECK-GI-NEXT: mov x3, xzr +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x21, x0 +; CHECK-GI-NEXT: mov x22, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov w2, #7 // =0x7 +; CHECK-GI-NEXT: mov x3, xzr +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x2, x0 +; CHECK-GI-NEXT: mov x3, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i128> %d, + ret <2 x i128> %s +} + +define <2 x i128> @sv2i128_100(<2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: sv2i128_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __modti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x21 +; CHECK-SD-NEXT: mov x1, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sv2i128_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w30, -48 +; CHECK-GI-NEXT: mov x19, x2 +; CHECK-GI-NEXT: mov x20, x3 +; CHECK-GI-NEXT: mov w2, #100 // =0x64 +; CHECK-GI-NEXT: mov x3, xzr +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x21, x0 +; CHECK-GI-NEXT: mov x22, x1 +; CHECK-GI-NEXT: mov x0, x19 +; CHECK-GI-NEXT: mov x1, x20 +; CHECK-GI-NEXT: mov w2, #100 // =0x64 +; CHECK-GI-NEXT: mov x3, xzr +; CHECK-GI-NEXT: bl __modti3 +; CHECK-GI-NEXT: mov x2, x0 +; CHECK-GI-NEXT: mov x3, x1 +; CHECK-GI-NEXT: mov x0, x21 +; CHECK-GI-NEXT: mov x1, x22 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret +entry: + %s = srem <2 x i128> %d, + ret <2 x i128> %s +} + +define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: uv2i128_7: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x21 +; CHECK-SD-NEXT: mov x1, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i128_7: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x10, #18725 // =0x4925 +; CHECK-GI-NEXT: mov x8, #9362 // =0x2492 +; CHECK-GI-NEXT: sub x4, x0, x0 +; CHECK-GI-NEXT: movk x10, #9362, lsl #16 +; CHECK-GI-NEXT: movk x8, #37449, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr +; CHECK-GI-NEXT: movk x10, #37449, lsl #32 +; CHECK-GI-NEXT: movk x8, #18724, lsl #32 +; CHECK-GI-NEXT: movk x10, #18724, lsl #48 +; CHECK-GI-NEXT: movk x8, #9362, lsl #48 +; CHECK-GI-NEXT: mul x11, x1, x10 +; CHECK-GI-NEXT: mul x12, x0, x8 +; CHECK-GI-NEXT: umulh x13, x0, x10 +; CHECK-GI-NEXT: mul x14, x1, x8 +; CHECK-GI-NEXT: adds x11, x11, x12 +; CHECK-GI-NEXT: umulh x15, x1, x10 +; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: cmn x11, x13 +; CHECK-GI-NEXT: and x11, x12, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: add x14, x14, x4 
+; CHECK-GI-NEXT: and x12, x12, #0x1 +; CHECK-GI-NEXT: and x4, xzr, #0x1 +; CHECK-GI-NEXT: mul x13, x3, x10 +; CHECK-GI-NEXT: add x11, x11, x12 +; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: adds x14, x14, x15 +; CHECK-GI-NEXT: add x12, x12, x4 +; CHECK-GI-NEXT: mul x5, x2, x8 +; CHECK-GI-NEXT: cset w4, hs +; CHECK-GI-NEXT: adds x14, x14, x16 +; CHECK-GI-NEXT: and x16, x4, #0x1 +; CHECK-GI-NEXT: umulh x9, xzr, x10 +; CHECK-GI-NEXT: cset w4, hs +; CHECK-GI-NEXT: adds x11, x14, x11 +; CHECK-GI-NEXT: add x12, x12, x16 +; CHECK-GI-NEXT: and x16, x4, #0x1 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: umulh x17, x1, x8 +; CHECK-GI-NEXT: add x12, x12, x16 +; CHECK-GI-NEXT: adds x13, x13, x5 +; CHECK-GI-NEXT: umulh x15, x2, x10 +; CHECK-GI-NEXT: cset w4, hs +; CHECK-GI-NEXT: and x16, x4, #0x1 +; CHECK-GI-NEXT: mul x6, x3, x8 +; CHECK-GI-NEXT: umulh x10, x3, x10 +; CHECK-GI-NEXT: cmn x13, x15 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: umulh x15, x2, x8 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: and x18, xzr, #0x1 +; CHECK-GI-NEXT: and x14, x14, #0x1 +; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x12, x13, x12 +; CHECK-GI-NEXT: add x14, x16, x14 +; CHECK-GI-NEXT: add x16, x6, x17 +; CHECK-GI-NEXT: and x17, xzr, #0x1 +; CHECK-GI-NEXT: adds x10, x16, x10 +; CHECK-GI-NEXT: add x17, x17, x18 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: adds x10, x10, x15 +; CHECK-GI-NEXT: umulh x15, x2, xzr +; CHECK-GI-NEXT: and x16, x16, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x10, x10, x14 +; CHECK-GI-NEXT: add x16, x17, x16 +; CHECK-GI-NEXT: and x17, x18, #0x1 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: add x13, x16, x17 +; CHECK-GI-NEXT: and x14, x14, #0x1 +; CHECK-GI-NEXT: add x8, x9, x8 +; CHECK-GI-NEXT: subs x9, x0, x11 +; CHECK-GI-NEXT: add x13, x13, x14 +; CHECK-GI-NEXT: add x8, x8, 
x15 +; CHECK-GI-NEXT: sbc x14, x1, x12 +; CHECK-GI-NEXT: add x8, x8, x13 +; CHECK-GI-NEXT: subs x13, x2, x10 +; CHECK-GI-NEXT: lsl x15, x14, #63 +; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: lsr x14, x14, #1 +; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 +; CHECK-GI-NEXT: lsl x15, x16, #63 +; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: adds x9, x9, x11 +; CHECK-GI-NEXT: lsr x11, x16, #1 +; CHECK-GI-NEXT: adc x12, x14, x12 +; CHECK-GI-NEXT: adds x10, x13, x10 +; CHECK-GI-NEXT: lsl x13, x12, #62 +; CHECK-GI-NEXT: lsr x12, x12, #2 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsl x11, x8, #62 +; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: lsr x8, x8, #2 +; CHECK-GI-NEXT: lsl x14, x12, #3 +; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 +; CHECK-GI-NEXT: umulh x11, x9, x13 +; CHECK-GI-NEXT: lsl x15, x9, #3 +; CHECK-GI-NEXT: sub x12, x14, x12 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x14, x10, #3 +; CHECK-GI-NEXT: sub x9, x15, x9 +; CHECK-GI-NEXT: sub x8, x16, x8 +; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: sub x10, x14, x10 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: sbc x1, x1, x11 +; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: add x8, x8, x13 +; CHECK-GI-NEXT: sbc x3, x3, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i128> %d, + ret <2 x i128> %s +} + +define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: uv2i128_100: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w21, -24 +; CHECK-SD-NEXT: .cfi_offset w22, -32 +; CHECK-SD-NEXT: .cfi_offset w30, -48 +; CHECK-SD-NEXT: mov x19, x3 +; CHECK-SD-NEXT: mov x20, x2 +; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x21, x0 +; CHECK-SD-NEXT: mov x22, x1 +; CHECK-SD-NEXT: mov x0, x20 +; CHECK-SD-NEXT: mov x1, x19 +; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: mov x3, xzr +; CHECK-SD-NEXT: bl __umodti3 +; CHECK-SD-NEXT: mov x2, x0 +; CHECK-SD-NEXT: mov x3, x1 +; CHECK-SD-NEXT: mov x0, x21 +; CHECK-SD-NEXT: mov x1, x22 +; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uv2i128_100: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 +; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 +; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: movk x10, #49807, lsl #16 +; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: movk x10, #10485, lsl #32 +; CHECK-GI-NEXT: movk x8, #49807, lsl #32 +; CHECK-GI-NEXT: movk x10, #36700, lsl #48 +; CHECK-GI-NEXT: movk x8, #10485, lsl #48 +; CHECK-GI-NEXT: mul x11, x1, x10 +; CHECK-GI-NEXT: mul x12, x0, x8 +; CHECK-GI-NEXT: umulh x13, x0, x10 +; CHECK-GI-NEXT: mul x14, x1, x8 +; CHECK-GI-NEXT: adds x11, x11, x12 +; CHECK-GI-NEXT: umulh x15, x1, x10 +; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: cmn x11, x13 +; CHECK-GI-NEXT: and x11, x12, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: and x12, x12, #0x1 +; CHECK-GI-NEXT: add 
x14, x14, x18 +; CHECK-GI-NEXT: add x11, x11, x12 +; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x9, xzr, x10 +; CHECK-GI-NEXT: adds x14, x14, x15 +; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: umulh x17, x1, x8 +; CHECK-GI-NEXT: cset w4, hs +; CHECK-GI-NEXT: add x15, x12, x15 +; CHECK-GI-NEXT: adds x12, x14, x16 +; CHECK-GI-NEXT: and x4, x4, #0x1 +; CHECK-GI-NEXT: mul x18, x3, x10 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: adds x12, x12, x11 +; CHECK-GI-NEXT: add x11, x15, x4 +; CHECK-GI-NEXT: and x14, x14, #0x1 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mul x5, x2, x8 +; CHECK-GI-NEXT: add x11, x11, x14 +; CHECK-GI-NEXT: and x14, x15, #0x1 +; CHECK-GI-NEXT: add x17, x9, x17 +; CHECK-GI-NEXT: add x14, x11, x14 +; CHECK-GI-NEXT: mov w11, #100 // =0x64 +; CHECK-GI-NEXT: umulh x13, x0, xzr +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: adds x18, x18, x5 +; CHECK-GI-NEXT: mul x15, x3, x8 +; CHECK-GI-NEXT: add x13, x17, x13 +; CHECK-GI-NEXT: cset w17, hs +; CHECK-GI-NEXT: umulh x10, x3, x10 +; CHECK-GI-NEXT: add x13, x13, x14 +; CHECK-GI-NEXT: and x17, x17, #0x1 +; CHECK-GI-NEXT: cmn x18, x16 +; CHECK-GI-NEXT: sub x18, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: and x14, x14, #0x1 +; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: and x18, xzr, #0x1 +; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: and x17, xzr, #0x1 +; CHECK-GI-NEXT: adds x10, x15, x10 +; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: cset w17, hs +; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: and x17, x17, #0x1 +; CHECK-GI-NEXT: adds x10, x10, x16 +; CHECK-GI-NEXT: lsl x16, x13, #60 +; CHECK-GI-NEXT: add x15, x15, x17 +; CHECK-GI-NEXT: cset w17, hs +; CHECK-GI-NEXT: adds x10, x10, x14 +; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w17, hs +; CHECK-GI-NEXT: add x8, x9, x8 +; CHECK-GI-NEXT: add x14, x15, x14 +; CHECK-GI-NEXT: and x15, x17, #0x1 +; 
CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 +; CHECK-GI-NEXT: add x9, x14, x15 +; CHECK-GI-NEXT: add x8, x8, x18 +; CHECK-GI-NEXT: add x8, x8, x9 +; CHECK-GI-NEXT: lsr x9, x13, #4 +; CHECK-GI-NEXT: umulh x14, x12, x11 +; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: lsr x8, x8, #4 +; CHECK-GI-NEXT: mul x12, x12, x11 +; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 +; CHECK-GI-NEXT: madd x9, x9, x11, x14 +; CHECK-GI-NEXT: umulh x13, x10, x11 +; CHECK-GI-NEXT: subs x0, x0, x12 +; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: sbc x1, x1, x9 +; CHECK-GI-NEXT: madd x8, x8, x11, x13 +; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: sbc x3, x3, x8 +; CHECK-GI-NEXT: ret +entry: + %s = urem <2 x i128> %d, + ret <2 x i128> %s +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 24ec4fa48f778..6ae2f56f6ae6d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -211,91 +211,41 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) { ; CHECK-LABEL: v_urem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705 -; CHECK-NEXT: v_mov_b32_e32 v3, 0x12d8fb -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 -; 
CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i32 %num, 1235195 ret i32 %result } define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { -; GISEL-LABEL: v_urem_v2i32_oddk_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb -; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 -; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_urem_v2i32_oddk_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CGP-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: 
v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v5, v2, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v0, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 -; CGP-NEXT: v_mul_lo_u32 v2, v2, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_urem_v2i32_oddk_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881 +; CHECK-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v4, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v4 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; CHECK-NEXT: v_lshrrev_b32_e32 v6, 1, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i32> %num, ret <2 x i32> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f6a228614a27e..2a1bf4bf068f0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -968,523 +968,106 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-LABEL: v_urem_i64_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, 0x12d8fb -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0xffed2705 -; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; CHECK-NEXT: v_trunc_f32_e32 v4, v4 -; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v7 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v12, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; 
CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v5, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v4, v5 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v1, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v0, v3 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440 +; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v6, v0, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v3 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v3 ; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_mul_lo_u32 v7, v0, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v4 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; 
CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v6 -; CHECK-NEXT: v_subb_u32_e64 v4, vcc, v1, v3, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2 -; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[6:7] -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; CHECK-NEXT: s_mov_b64 s[4:5], vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x12d8fb, v5 -; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v2, s[4:5] -; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, 
v7, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20 +; CHECK-NEXT: v_mul_lo_u32 v5, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v3, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v2, v2, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, 1235195 ret i64 %result } define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { -; GISEL-LABEL: v_urem_v2i64_oddk_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: s_mov_b32 s4, 1 -; GISEL-NEXT: v_mov_b32_e32 v5, 0xffed2705 -; GISEL-NEXT: s_mov_b32 s5, 1 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; GISEL-NEXT: s_cmp_lg_u32 s5, 0 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_trunc_f32_e32 v7, v7 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6 -; GISEL-NEXT: v_mul_hi_u32 v11, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v8 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v9 -; 
GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_mul_lo_u32 v11, v6, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v17, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v17, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v18 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v11 -; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v7, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 
-; GISEL-NEXT: v_mul_lo_u32 v13, s6, v11 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, s7, v6 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v16, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 -; GISEL-NEXT: v_mul_hi_u32 v18, v11, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v5 -; GISEL-NEXT: v_mul_lo_u32 v19, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v11, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v5 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v5 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v19, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v7, v5 -; GISEL-NEXT: v_add_i32_e64 v12, s[6:7], v14, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v18, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, 
vcc, v14, v15 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v1, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v1, v11 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v14, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v15, v0, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_mul_lo_u32 v16, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v17, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v18, v2, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v17, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc 
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v17, v8 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_lo_u32 v12, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v10, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v6, v6, v4 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v12 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v1, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v8 -; GISEL-NEXT: v_subb_u32_e64 v8, vcc, v3, v5, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v2, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[8:9] -; GISEL-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: s_mov_b64 s[4:5], vcc -; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9 -; GISEL-NEXT: v_sub_i32_e64 v12, s[6:7], v0, v4 -; GISEL-NEXT: v_subbrev_u32_e64 
v1, s[6:7], 0, v1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] -; GISEL-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_urem_v2i64_oddk_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CGP-NEXT: v_cvt_f32_u32_e32 v5, 0x12d8fb -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v6, 0 -; CGP-NEXT: v_mov_b32_e32 v7, 0xffed2705 -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: 
v_mul_lo_u32 v10, v6, v9 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v8 -; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v7, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v6, v7 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v1, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v0, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v6 -; CGP-NEXT: v_mul_hi_u32 v14, v0, v6 -; CGP-NEXT: v_mul_hi_u32 v15, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v2, v6 -; CGP-NEXT: v_mul_lo_u32 v17, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v18, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v18 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v16, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v7, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 
v5, v5, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v9 -; CGP-NEXT: v_subb_u32_e64 v6, vcc, v1, v7, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e64 v2, s[6:7], v2, v11 -; CGP-NEXT: v_subb_u32_e64 v8, vcc, v3, v5, s[6:7] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v4 -; CGP-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[8:9] -; CGP-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; CGP-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[4:5] -; CGP-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: s_mov_b64 s[4:5], vcc -; CGP-NEXT: v_subrev_i32_e32 v11, vcc, 0x12d8fb, v9 -; CGP-NEXT: v_sub_i32_e64 v12, s[6:7], v0, v4 -; CGP-NEXT: v_subbrev_u32_e64 v1, s[6:7], 0, v1, s[6:7] -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[6:7] -; CGP-NEXT: v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CGP-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v10 -; CGP-NEXT: 
v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; CGP-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] -; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_urem_v2i64_oddk_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31 +; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440 +; CHECK-NEXT: v_mov_b32_e32 v8, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v5 +; CHECK-NEXT: v_mul_hi_u32 v11, v1, v4 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v5 +; CHECK-NEXT: v_mul_hi_u32 v13, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v14, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v15, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v16, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v17, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_hi_u32 v18, v2, v5 +; CHECK-NEXT: v_mul_hi_u32 v19, v3, v5 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v11 +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v14, v15 +; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v17, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v12, vcc, v4, v18 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, 
vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v11 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v15, v4 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v12, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v19, v7 +; CHECK-NEXT: v_lshr_b64 v[4:5], v[4:5], 20 +; CHECK-NEXT: v_lshr_b64 v[6:7], v[6:7], 20 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v8 +; CHECK-NEXT: v_mul_lo_u32 v5, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v4, v4, v8 +; CHECK-NEXT: v_mul_lo_u32 v10, v6, v8 +; CHECK-NEXT: v_mul_lo_u32 v7, v7, v8 +; CHECK-NEXT: v_mul_hi_u32 v6, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, ret <2 x i64> %result }