Skip to content

[X86] Truncate i64 sub to i32 when upper 33 bits are zeros #145850

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58191,8 +58191,28 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
unsigned int Opcode = N->getOpcode();
SDLoc DL(N);

// Use a 32-bit sub+zext if upper 33 bits known zero.
if (VT == MVT::i64 && Subtarget.is64Bit()) {
APInt HiMask = APInt::getHighBitsSet(64, 33);
if (DAG.MaskedValueIsZero(Op0, HiMask) &&
DAG.MaskedValueIsZero(Op1, HiMask)) {
SDValue LHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op0);
SDValue RHS = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
bool NUW = Op0->getFlags().hasNoUnsignedWrap();
NUW = NUW & DAG.willNotOverflowAdd(false, LHS, RHS);
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(NUW);
// Always true since in the worst case 0 - 2147483647 = -2147483647, still
// fits in i32
Flags.setNoSignedWrap(true);
SDValue Sub = DAG.getNode(Opcode, DL, MVT::i32, LHS, RHS, Flags);
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sub);
}
}

auto IsNonOpaqueConstant = [&](SDValue Op) {
return DAG.isConstantIntBuildVectorOrConstantInt(Op,
/*AllowOpaques*/ false);
Expand Down
78 changes: 78 additions & 0 deletions llvm/test/CodeGen/X86/reduce-i64-sub.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64

; Truncate to a 32-bit subtraction since the upper 48 bits of both operands are zero
define i64 @test1(i16 %a, i16 %b) nounwind {
; X86-LABEL: test1:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edx, %edx
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the xor %edx, %edx be considered redundant here?
Since we’re truncating to a 32-bit subtraction and the high bits are known to be zero, and if we remove the redundant sbb, is there still a reason to explicitly clear edx?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's necessary to stop a dependency on the old value of EDX stalling the SBB instruction - some CPUs recognise the dependency break (search X86ScheduleBtVer2.td for IsDepBreakingFunction).

; X86-NEXT: subl %ecx, %eax
; X86-NEXT: sbbl %edx, %edx
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why didn't the 32-bit target recognize that the SBB was redundant?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interestingly, in PR #144066 there are no redundant adc instructions, which is not the case for the sbb instructions in this PR. Upon digging further, I found that combineADC handles the case where both operands are zero in this code:

  static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);
  auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
  auto *RHSC = dyn_cast<ConstantSDNode>(RHS);

  // Canonicalize constant to RHS.
  if (LHSC && !RHSC)
    return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
                       CarryIn);

  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // dead right now.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(
        ISD::AND, DL, VT,
        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                    DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
        DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

But combineSBB does not handle this case

/// Tries to simplify the SBB node \p N (operands: LHS, RHS, borrow-in).
///
/// Two rewrites are attempted, in order:
///  1. If combineCarryThroughADD() produces a replacement for the borrow-in
///     operand, the SBB is rebuilt on the same LHS/RHS using that value.
///  2. SBB(SUB(X, Y), 0, Borrow) is folded to SBB(X, Y, Borrow), but only
///     while result #1 of \p N (the flag result) has no users.
///
/// \returns the replacement node, or an empty SDValue if nothing applied.
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Minuend = N->getOperand(0);
  SDValue Subtrahend = N->getOperand(1);
  SDValue Borrow = N->getOperand(2);

  SDValue NewBorrow = combineCarryThroughADD(Borrow, DAG);
  if (NewBorrow) {
    // The rebuilt node yields the original value type plus an i32 result.
    SDVTList ResultTys = DAG.getVTList(N->getSimpleValueType(0), MVT::i32);
    return DAG.getNode(X86ISD::SBB, DL, ResultTys, Minuend, Subtrahend,
                       NewBorrow);
  }

  // Everything below is the SBB(SUB(X,Y),0,Borrow) -> SBB(X,Y,Borrow) fold;
  // bail out as soon as one of its preconditions fails.
  if (Minuend.getOpcode() != ISD::SUB)
    return SDValue();
  if (!isNullConstant(Subtrahend))
    return SDValue();
  if (N->hasAnyUseOfValue(1))
    return SDValue();

  return DAG.getNode(X86ISD::SBB, DL, N->getVTList(), Minuend.getOperand(0),
                     Minuend.getOperand(1), Borrow);
}

So should we add the code to handle redundant sbb's in this PR or make it a separate issue?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate issue is fine, cheers!

; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0:
; X64-NEXT: movzwl %si, %ecx
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%zext_a = zext i16 %a to i64
%zext_b = zext i16 %b to i64
%sub = sub i64 %zext_a, %zext_b
ret i64 %sub
}

; Do not truncate to a 32-bit subtraction if bit 32 of an operand is set: the
; `or` with 4294967296 (1 << 32) means the upper 33 bits of the first operand
; are not known to be zero.
define i64 @test2(i16 %a, i16 %b) nounwind {
; X86-LABEL: test2:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: movl $1, %edx
; X86-NEXT: sbbl $0, %edx
; X86-NEXT: retl
;
; X64-LABEL: test2:
; X64: # %bb.0:
; X64-NEXT: movzwl %di, %ecx
; X64-NEXT: movzwl %si, %edx
; X64-NEXT: movabsq $4294967296, %rax # imm = 0x100000000
; X64-NEXT: orq %rcx, %rax
; X64-NEXT: subq %rdx, %rax
; X64-NEXT: retq
%zext_a = zext i16 %a to i64
%zext_b = zext i16 %b to i64
%or_a = or i64 %zext_a, 4294967296
%sub = sub i64 %or_a, %zext_b
ret i64 %sub
}

; Do not truncate to a 32-bit subtraction when an operand is sign-extended:
; the upper bits of %sext_a are copies of the sign bit of %a, so they are not
; known to be zero.
define i64 @test3(i16 %a, i16 %b) nounwind {
; X86-LABEL: test3:
; X86: # %bb.0:
; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: sarl $31, %edx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: sbbl $0, %edx
; X86-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movswq %di, %rax
; X64-NEXT: movzwl %si, %ecx
; X64-NEXT: subq %rcx, %rax
; X64-NEXT: retq
%sext_a = sext i16 %a to i64
%zext_b = zext i16 %b to i64
%sub = sub i64 %sext_a, %zext_b
ret i64 %sub
}

Loading