
Commit 632151f

gaynor-anthropic, dtcxzyw, and nikic authored
InstCombine: improve optimizations for ceiling division with no overflow (#142869)
Fixes #142497. Alive2: https://alive2.llvm.org/ce/z/CeaHaH

The contents of this pull request were substantially written using claude-code. I've reviewed it to the best of my ability (it's been years since I did any compilers work).

---------

Co-authored-by: Yingwei Zheng <[email protected]>
Co-authored-by: Nikita Popov <[email protected]>
1 parent bb70023 commit 632151f
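At the source level, the two forms look like this (a C++ sketch with illustrative names, not code from the commit; both compute ceil(x / 8), and they agree exactly when x + 7 cannot wrap):

#include <cstdint>

// The pattern InstCombine now recognizes: ceil(x / 8) written as a shift
// plus a remainder test. This is what the lshr/and/icmp/zext/add IR in the
// tests below corresponds to.
uint32_t ceil_div8_pattern(uint32_t x) {
  return (x >> 3) + ((x & 7u) != 0);
}

// The folded form the commit produces: add nuw followed by lshr. It is
// wrong once x + 7 wraps, hence the overflow precondition.
uint32_t ceil_div8_folded(uint32_t x) {
  return (x + 7u) >> 3;
}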

File tree

2 files changed, +289 −0 lines


llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp

Lines changed: 28 additions & 0 deletions
@@ -1787,6 +1787,34 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Ashr = foldAddToAshr(I))
     return Ashr;
 
+  // Ceiling division by power-of-2:
+  // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
+  // This is only valid when adding (N-1) to X doesn't overflow.
+  {
+    Value *X;
+    const APInt *ShiftAmt, *Mask;
+
+    // Match: (X >> C) + zext((X & Mask) != 0)
+    //    or: zext((X & Mask) != 0) + (X >> C)
+    if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))),
+                          m_ZExt(m_SpecificICmp(
+                              ICmpInst::ICMP_NE,
+                              m_And(m_Deferred(X), m_LowBitMask(Mask)),
+                              m_ZeroInt())))) &&
+        Mask->popcount() == *ShiftAmt) {
+      // Check that X + Mask doesn't overflow.
+      Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
+      if (willNotOverflowUnsignedAdd(X, MaskC, I)) {
+        // (X + Mask) >> ShiftAmt
+        Value *Add = Builder.CreateNUWAdd(X, MaskC);
+        return BinaryOperator::CreateLShr(
+            Add, ConstantInt::get(X->getType(), *ShiftAmt));
+      }
+    }
+  }
+
   // (~X) + (~Y) --> -2 - (X + Y)
   {
     // To ensure we can save instructions we need to ensure that we consume both
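The Alive2 link in the commit message is the general proof of this fold; the same equivalence can be sanity-checked exhaustively at 8 bits with a standalone sketch (not part of the commit):

#include <cassert>
#include <cstdint>

int main() {
  // For every shift amount s and mask (1 << s) - 1 (popcount(mask) == s),
  // the shift-plus-remainder form equals the add-then-shift form whenever
  // x + mask does not wrap in 8 bits.
  for (unsigned s = 1; s < 8; ++s) {
    unsigned mask = (1u << s) - 1;
    for (unsigned x = 0; x <= 0xFF; ++x) {
      if (x + mask > 0xFF)
        continue; // precondition: x + mask must not overflow 8 bits
      uint8_t lhs = (uint8_t)((x >> s) + ((x & mask) != 0));
      uint8_t rhs = (uint8_t)((uint8_t)(x + mask) >> s); // 8-bit wrapping add
      assert(lhs == rhs);
    }
  }
  return 0;
}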

llvm/test/Transforms/InstCombine/add.ll

Lines changed: 261 additions & 0 deletions
@@ -4273,4 +4273,265 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) {
 }
 
 declare void @llvm.assume(i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N)
+; This is only valid when x + (N-1) doesn't overflow
+
+; Test with known range that prevents overflow
+define i32 @ceil_div_by_8_known_range(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_by_8_known_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Test with the exact IR from the original testcase
+define i32 @ceil_div_from_clz(i32 %v) {
+; CHECK-LABEL: @ceil_div_from_clz(
+; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]]
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
+  %sub = sub nuw nsw i32 32, %ctlz
+  %shr = lshr i32 %sub, 3
+  %and = and i32 %sub, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw nsw i32 %shr, %ext
+  ret i32 %r
+}
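This test mirrors the motivating pattern from #142497: rounding a bit count up to a byte count. A plausible source-level origin, inferred from the test's shape rather than taken from the issue:

#include <cstdint>

unsigned bytes_needed(uint32_t v) {
  // clz is defined for v == 0 here (matching the "i1 false" argument to
  // @llvm.ctlz.i32); __builtin_clz is a GCC/Clang builtin and is undefined
  // for 0, so that case is handled explicitly. bits lies in [0, 32], so
  // bits + 7 provably cannot wrap. After the fold plus constant folding,
  // this becomes (39 - ctlz(v)) >> 3, exactly as the CHECK lines expect.
  unsigned clz = v ? (unsigned)__builtin_clz(v) : 32;
  unsigned bits = 32 - clz;
  return (bits >> 3) + ((bits & 7u) != 0);
}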
+
+; Vector version with known range
+define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_8_vec_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7)
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+; Ceiling division by 16 with known range
+define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15
+; CHECK-NEXT:    [[R:%.*]] = lshr i16 [[TMP1]], 4
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %shr = lshr i16 %x, 4
+  %and = and i16 %x, 15
+  %cmp = icmp ne i16 %and, 0
+  %ext = zext i1 %cmp to i16
+  %r = add i16 %shr, %ext
+  ret i16 %r
+}
+
+; Negative test: no overflow guarantee - should NOT optimize
+define i32 @ceil_div_by_8_no_overflow_info(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_no_overflow_info(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
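A concrete counterexample shows why this case must not fold: with no range information, x = UINT32_MAX makes the added form wrap and the two expressions disagree (a standalone demonstration, not part of the commit):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xFFFFFFFFu;
  uint32_t exact  = (x >> 3) + ((x & 7u) != 0); // 0x1FFFFFFF + 1 = 0x20000000
  uint32_t folded = (x + 7u) >> 3;              // x + 7 wraps to 6; 6 >> 3 == 0
  assert(exact == 0x20000000u && folded == 0u); // folding here would miscompile
  return 0;
}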
+
+; Negative test: nuw on final add doesn't help
+define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw i32 %shr, %ext ; nuw here doesn't prove x+7 won't overflow
+  ret i32 %r
+}
+
+; Negative test: wrong mask
+define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_mask(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 6
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 6 ; Wrong mask: should be 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong shift amount
+define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_shift(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 4 ; Shift by 4, but mask is 7 (should be 15)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong comparison
+define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_cmp(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp eq i32 %and, 0 ; Wrong: should be ne
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
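The wrong-mask and wrong-shift cases fail the matcher's guard that the mask be exactly the low ShiftAmt bits (m_LowBitMask plus the Mask->popcount() == *ShiftAmt comparison). The same relationship as a compile-time sketch, with an illustrative helper name that is not LLVM's:

// For a low-bit mask, popcount(mask) == shift is equivalent to saying
// mask == (1 << shift) - 1.
constexpr bool maskMatchesShift(unsigned mask, unsigned shift) {
  return mask == (1u << shift) - 1;
}
static_assert(maskMatchesShift(7u, 3), "ceil_div by 8: folds");
static_assert(!maskMatchesShift(6u, 3), "wrong mask: rejected");
static_assert(!maskMatchesShift(7u, 4), "wrong shift: rejected");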
+
+; Multi-use test: all intermediate values have uses
+define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    call void @use_i32(i32 [[AND]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  call void @use_i32(i32 %and)
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Commuted test: add operands are swapped
+define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %ext, %shr ; Operands swapped
+  ret i32 %r
+}
+
+; Commuted with multi-use
+define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %ext, %shr ; Operands swapped
+  ret i32 %r
+}
+
+; Multi-use test where only zext has multiple uses - should still optimize
+define i32 @ceil_div_zext_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_zext_multi_use(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use with vector type
+define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_vec_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3)
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  call void @use_vec(<2 x i32> %shr)
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+declare void @use_i32(i32)
+declare void @use_vec(<2 x i32>)
 declare void @fake_func(i32)
