Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7046,3 +7046,73 @@ void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegi
Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
}
}

//
// Multiply 'src' by the 32-bit immediate 'imm' into 'dst', replacing IMUL with
// two LEAs for a fixed set of constants.
//
// Rationale (from the Intel Optimization Manual, section "3.5.1.2 Using LEA"):
// IMUL with register operands has a latency of 3 cycles while a fast LEA has
// 1 cycle, so two dependent fast LEAs can outperform IMUL for constants that
// are not easily handled by shift+add.
//
// Each table row describes the two-instruction sequence:
//   - First LEA:  dst = src + src * scale1               -> (1 + scale1) * src
//   - Second LEA: dst = base + dst * scale2, where
//                 base = (use_src_as_base ? src : dst)
//       use_src_as_base: (1 + (1 + scale1) * scale2) * src
//       otherwise:       (1 + scale1) * (1 + scale2) * src
//
// Parameters:
//   bt  - T_INT selects the 32-bit forms, T_LONG the 64-bit forms.
//   dst - destination register.
//   src - source register; may be the same register as dst.
//   imm - constant multiplier.
//
// IMUL is emitted when the CPU does not report fast two-operand LEA, when
// 'imm' has no table row, or when dst aliases src for a row that needs the
// original src in the second LEA.
//
void C2_MacroAssembler::imullq_imm(BasicType bt, Register dst, Register src, int32_t imm) {
  assert(bt == T_LONG || bt == T_INT, "Unexpected type");

  // Descriptor for one LEA pattern entry.
  struct LeaPattern {
    int32_t imm;                  // Multiplier implemented by this row.
    Address::ScaleFactor scale1;  // Scale of the first LEA.
    bool use_src_as_base;         // Second LEA base: src (true) or dst (false).
    Address::ScaleFactor scale2;  // Scale of the second LEA.
  };

  static const LeaPattern patterns[] = {
    { 11, Address::times_4, true,  Address::times_2 },
    { 13, Address::times_2, true,  Address::times_4 },
    { 19, Address::times_8, true,  Address::times_2 },
    { 21, Address::times_4, true,  Address::times_4 },
    { 25, Address::times_4, false, Address::times_4 },
    { 27, Address::times_2, false, Address::times_8 },
    { 37, Address::times_8, true,  Address::times_4 },
    { 41, Address::times_4, true,  Address::times_8 },
    { 45, Address::times_4, false, Address::times_8 },
    { 73, Address::times_8, true,  Address::times_8 },
    { 81, Address::times_8, false, Address::times_8 },
  };

  if (VM_Version::supports_fast_2op_lea()) {
    for (const LeaPattern& p : patterns) {
      if (p.imm != imm) {
        continue;
      }

      // The first LEA overwrites dst. If dst and src are the same register,
      // the original src value is gone by the time the second LEA would read
      // it as base, which would produce a wrong product. Use IMUL instead.
      if (p.use_src_as_base && dst == src) {
        break;
      }

      // First LEA: dst = src + src * scale1
      lealq(bt, dst, Address(src, src, p.scale1));

      // Second LEA: dst = (src or dst) + dst * scale2
      const Register base = p.use_src_as_base ? src : dst;
      lealq(bt, dst, Address(base, dst, p.scale2));
      return;
    }
  }

  // Fallback: no usable LEA sequence, emit a plain IMUL.
  imullq(bt, dst, src, imm);
}
2 changes: 2 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -584,4 +584,6 @@

void reconstruct_frame_pointer(Register rtmp);

// Multiplies src by the immediate imm into dst for bt == T_INT or T_LONG.
// For a fixed set of constants it emits two LEAs instead of IMUL when
// VM_Version::supports_fast_2op_lea() is true; otherwise it emits IMUL.
void imullq_imm(BasicType bt, Register dst, Register src, int32_t imm);

#endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
18 changes: 18 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9823,3 +9823,21 @@ void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
movzbl(dst, dst);
}
}

// Type-dispatched multiply by immediate: emits imulq for T_LONG and imull
// for T_INT. Any other BasicType trips the assert in debug builds.
void MacroAssembler::imullq(BasicType bt, Register dst, Register src, int32_t imm) {
  if (bt != T_LONG) {
    assert(bt == T_INT, "Unexpected type");
    imull(dst, src, imm);
    return;
  }
  imulq(dst, src, imm);
}

// Type-dispatched load-effective-address: emits leaq for T_LONG and leal
// for T_INT. Any other BasicType trips the assert in debug builds.
void MacroAssembler::lealq(BasicType bt, Register dst, Address src) {
  if (bt != T_LONG) {
    assert(bt == T_INT, "Unexpected type");
    leal(dst, src);
    return;
  }
  leaq(dst, src);
}
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2061,6 +2061,9 @@ class MacroAssembler: public Assembler {
void save_legacy_gprs();
void restore_legacy_gprs();
void setcc(Assembler::Condition comparison, Register dst);

// Width-dispatching helpers: bt == T_LONG selects the 64-bit instruction
// (imulq / leaq), bt == T_INT the 32-bit one (imull / leal).
void imullq(BasicType bt, Register dst, Register src, int32_t imm);
void lealq(BasicType bt, Register dst, Address src);
};

#endif // CPU_X86_MACROASSEMBLER_X86_HPP
22 changes: 12 additions & 10 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -11458,12 +11458,12 @@ instruct mulI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
// Int multiply of a register by an immediate (matches MulI src imm).
// NOTE(review): this text comes from a rendered diff without +/- markers. Where
// two near-identical effect/format/encoding lines follow each other, the first
// appears to be the pre-change line and the second its replacement - confirm
// against the real x86.ad before editing.
// NOTE(review): TEMP_DEF dst presumably keeps dst in a different register than
// src, which the LEA sequence in imullq_imm relies on - confirm.
instruct mulI_rReg_imm(rRegI dst, rRegI src, immI imm, rFlagsReg cr)
%{
match(Set dst (MulI src imm));
effect(KILL cr);
effect(KILL cr, TEMP_DEF dst);

ins_cost(300);
format %{ "imull $dst, $src, $imm\t# int" %}
format %{ "imull_imm $dst, $src, $imm\t# int" %}
ins_encode %{
__ imull($dst$$Register, $src$$Register, $imm$$constant);
__ imullq_imm(T_INT, $dst$$Register, $src$$Register, $imm$$constant);
%}
ins_pipe(ialu_reg_reg_alu0);
%}
Expand Down Expand Up @@ -11500,12 +11500,13 @@ instruct mulI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
// Int multiply of a loaded memory operand by an immediate
// (matches MulI (LoadI src) imm).
// NOTE(review): this text comes from a rendered diff without +/- markers. Where
// two near-identical effect/encoding lines follow each other, the first appears
// to be the pre-change line and the second its replacement - confirm against
// the real x86.ad before editing.
// NOTE(review): the format string below still prints a single "imull" although
// the new encoding is a movl followed by imullq_imm.
instruct mulI_mem_imm(rRegI dst, memory src, immI imm, rFlagsReg cr)
%{
match(Set dst (MulI (LoadI src) imm));
effect(KILL cr);
effect(KILL cr, TEMP dst);

ins_cost(300);
format %{ "imull $dst, $src, $imm\t# int" %}
ins_encode %{
__ imull($dst$$Register, $src$$Address, $imm$$constant);
__ movl($dst$$Register, $src$$Address);
// NOTE(review): dst is passed as both destination and source. imullq_imm's
// second LEA may read src after the first LEA has already written dst, so the
// callee must cope with dst == src - verify.
__ imullq_imm(T_INT, $dst$$Register, $dst$$Register, $imm$$constant);
%}
ins_pipe(ialu_reg_mem_alu0);
%}
Expand Down Expand Up @@ -11552,12 +11553,12 @@ instruct mulL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr)
// Long multiply of a register by a 32-bit immediate (matches MulL src imm).
// NOTE(review): this text comes from a rendered diff without +/- markers. Where
// two near-identical effect/format/encoding lines follow each other, the first
// appears to be the pre-change line and the second its replacement - confirm
// against the real x86.ad before editing.
// NOTE(review): TEMP_DEF dst presumably keeps dst in a different register than
// src, which the LEA sequence in imullq_imm relies on - confirm.
instruct mulL_rReg_imm(rRegL dst, rRegL src, immL32 imm, rFlagsReg cr)
%{
match(Set dst (MulL src imm));
effect(KILL cr);
effect(KILL cr, TEMP_DEF dst);

ins_cost(300);
format %{ "imulq $dst, $src, $imm\t# long" %}
format %{ "imulq_imm $dst, $src, $imm\t# long" %}
ins_encode %{
__ imulq($dst$$Register, $src$$Register, $imm$$constant);
__ imullq_imm(T_LONG, $dst$$Register, $src$$Register, $imm$$constant);
%}
ins_pipe(ialu_reg_reg_alu0);
%}
Expand Down Expand Up @@ -11594,12 +11595,13 @@ instruct mulL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr
// Long multiply of a loaded memory operand by a 32-bit immediate
// (matches MulL (LoadL src) imm).
// NOTE(review): this text comes from a rendered diff without +/- markers. Where
// two near-identical effect/encoding lines follow each other, the first appears
// to be the pre-change line and the second its replacement - confirm against
// the real x86.ad before editing.
// NOTE(review): the format string below still prints a single "imulq" although
// the new encoding is a movq followed by imullq_imm.
instruct mulL_mem_imm(rRegL dst, memory src, immL32 imm, rFlagsReg cr)
%{
match(Set dst (MulL (LoadL src) imm));
effect(KILL cr);
effect(KILL cr, TEMP dst);

ins_cost(300);
format %{ "imulq $dst, $src, $imm\t# long" %}
ins_encode %{
__ imulq($dst$$Register, $src$$Address, $imm$$constant);
__ movq($dst$$Register, $src$$Address);
// NOTE(review): dst is passed as both destination and source. imullq_imm's
// second LEA may read src after the first LEA has already written dst, so the
// callee must cope with dst == src - verify.
__ imullq_imm(T_LONG, $dst$$Register, $dst$$Register, $imm$$constant);
%}
ins_pipe(ialu_reg_mem_alu0);
%}
Expand Down
131 changes: 131 additions & 0 deletions test/hotspot/jtreg/compiler/c2/TestConstantMultiplier.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/*
* @test
* @bug 8373480
* @summary Optimize multiplication by constant multiplier using LEA instructions
* @library /test/lib /
* @compile ../../compiler/lib/ir_framework/TestFramework.java
* @compile ../../compiler/lib/generators/Generators.java
* @compile ../../compiler/lib/verify/Verify.java
* @run driver compiler.c2.TestConstantMultiplier
*/

package compiler.c2;

import java.util.List;
import java.util.Set;
import java.util.stream.IntStream;

import compiler.lib.ir_framework.*;
import compiler.lib.verify.*;
import compiler.lib.ir_framework.Test;

import compiler.lib.compile_framework.*;
import compiler.lib.generators.Generators;

import compiler.lib.template_framework.Template;
import compiler.lib.template_framework.TemplateToken;
import static compiler.lib.template_framework.Template.scope;
import static compiler.lib.template_framework.Template.let;

import compiler.lib.template_framework.library.TestFrameworkClass;

/**
 * Generates, compiles and runs an IR-framework test class that multiplies int
 * and long values by each constant covered by the LEA-based multiply patterns,
 * and checks both the result and the matched x86 mach node.
 */
public class TestConstantMultiplier {

    // Package and simple name of the generated class. The same values are used
    // for registering the source, rendering it and invoking it, so the three
    // can never drift apart.
    private static final String GENERATED_PACKAGE = "c2.compiler";
    private static final String GENERATED_CLASS = "ConstantMultiplierTest";
    private static final String GENERATED_FQN = GENERATED_PACKAGE + "." + GENERATED_CLASS;

    public static void main(String[] args) {
        // Create a new CompileFramework instance.
        CompileFramework comp = new CompileFramework();

        // Add a java source file under the same name it declares and is invoked by.
        comp.addJavaSourceCode(GENERATED_FQN, generate(comp));

        // Compile the source file.
        comp.compile();

        // Run the generated test with default flags.
        comp.invoke(GENERATED_FQN, "main", new Object[] {new String[] {}});

        // We can also pass VM flags for the Test VM.
        comp.invoke(GENERATED_FQN, "main", new Object[] {new String[] {"-Xbatch"}});
    }


    /**
     * Renders the source of the generated test class.
     *
     * @param comp framework instance; supplies the class path handed to the Test VM
     * @return the complete Java source of the generated class as a String
     */
    public static String generate(CompileFramework comp) {
        // Shared, fixed-seed random source so runs are reproducible.
        var testHeader = Template.make(() -> scope(
            """
            public static Random RANDOM = new Random(1023);

            """
        ));

        // One int test and one long test (each with its @Run driver) per multiplier.
        // The long driver draws a full 64-bit operand so the upper half of the
        // 64-bit multiply is exercised as well.
        var testTemplate = Template.make(() -> scope(
            IntStream.of(81, 73, 45, 41, 37, 27, 25, 21, 19, 13, 11).mapToObj(
                multiplier -> scope(
                    let("multiplier", multiplier),
                    """
                    @Test
                    @IR(applyIfPlatform = {"x64", "true"}, counts = {IRNode.X86_MULT_IMM_I, "1"})
                    private static int testMultBy#{multiplier}I(int num) {
                    return num * #{multiplier};
                    }

                    @Run(test = "testMultBy#{multiplier}I")
                    private static void runMultBy#{multiplier}II() {
                    int multiplicand = RANDOM.nextInt();
                    Verify.checkEQ(#{multiplier} * multiplicand, testMultBy#{multiplier}I(multiplicand));
                    }

                    @Test
                    @IR(applyIfPlatform = {"x64", "true"}, counts = {IRNode.X86_MULT_IMM_L, "1"})
                    private static long testMultBy#{multiplier}L(long num) {
                    return num * #{multiplier};
                    }

                    @Run(test = "testMultBy#{multiplier}L")
                    private static void runMultBy#{multiplier}L() {
                    long multiplicand = RANDOM.nextLong();
                    Verify.checkEQ(#{multiplier} * multiplicand, testMultBy#{multiplier}L(multiplicand));
                    }
                    """
                )).toList()
        ));

        var testClass = Template.make(() -> scope(
            testHeader.asToken(),
            testTemplate.asToken()
        ));

        List<TemplateToken> testTemplateTokens = List.of(testClass.asToken());

        return TestFrameworkClass.render(
            // package and class name.
            GENERATED_PACKAGE, GENERATED_CLASS,
            // Set of imports.
            Set.of("java.util.Random","compiler.lib.verify.*"),
            // classpath, so the Test VM has access to the compiled class files.
            comp.getEscapedClassPathOfCompiledClasses(),
            // The list of tests.
            testTemplateTokens);
    }
}
10 changes: 10 additions & 0 deletions test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -2781,6 +2781,16 @@ public class IRNode {
machOnlyNameRegex(VSTOREMASK_TRUECOUNT, "vstoremask_truecount_neon");
}

// IR node placeholder registered against the mach node name "mulI_rReg_imm"
// (presumably the x86.ad rule for int multiply by an immediate - confirm).
public static final String X86_MULT_IMM_I = PREFIX + "X86_MULT_IMM_I" + POSTFIX;
static {
machOnlyNameRegex(X86_MULT_IMM_I, "mulI_rReg_imm");
}

// IR node placeholder registered against the mach node name "mulL_rReg_imm"
// (presumably the x86.ad rule for long multiply by an immediate - confirm).
public static final String X86_MULT_IMM_L = PREFIX + "X86_MULT_IMM_L" + POSTFIX;
static {
machOnlyNameRegex(X86_MULT_IMM_L, "mulL_rReg_imm");
}

public static final String X86_SCONV_D2I = PREFIX + "X86_SCONV_D2I" + POSTFIX;
static {
machOnlyNameRegex(X86_SCONV_D2I, "convD2I_reg_reg");
Expand Down
Loading