diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
index 3de7f473fea0f..49d06f40a1072 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -7046,3 +7046,84 @@ void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegi
     Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
   }
 }
+
+//
+// Efficient LEA-based multiply emulation for immediates not easily handled
+// by shift+add. Uses two fast LEAs based on Intel Optimization Manual,
+// section "3.5.1.2 Using LEA". IMUL latency with register operands is 3 cycles,
+// while fast LEA has 1 cycle. So two fast LEAs often outperform IMUL for
+// specific constants.
+//
+// The pattern table below lists:
+//   - First LEA:  BASE = src, INDEX = src, SCALE = scale1
+//   - Second LEA: BASE = (use_src_as_base ? src : dst), INDEX = dst, SCALE = scale2
+//
+// The INDEX of the second LEA is always the result of the first LEA, so a
+// pattern with use_src_as_base reads src after dst has been written. Such a
+// pattern is only emitted when dst != src. A plain IMUL is emitted otherwise,
+// as well as for immediates without a pattern and when
+// VM_Version::supports_fast_2op_lea() is false.
+//
+void C2_MacroAssembler::imullq_imm(BasicType bt, Register dst, Register src, int32_t imm) {
+  assert(bt == T_LONG || bt == T_INT, "Unexpected type");
+
+  if (!VM_Version::supports_fast_2op_lea()) {
+    imullq(bt, dst, src, imm);
+    return;
+  }
+
+  // Descriptor for one LEA pattern entry.
+  struct LeaPattern {
+    int32_t imm;
+    // First LEA: dst = src + src * scale1
+    Address::ScaleFactor scale1;
+
+    // Second LEA:
+    //   If use_src_as_base == true:
+    //       dst = src + dst * scale2
+    //   else:
+    //       dst = dst + dst * scale2
+    bool use_src_as_base;
+    Address::ScaleFactor scale2;
+  };
+
+  static const LeaPattern patterns[] = {
+    { 11, Address::times_4, true,  Address::times_2 },  // 1 + (1 + 4) * 2
+    { 13, Address::times_2, true,  Address::times_4 },  // 1 + (1 + 2) * 4
+    { 19, Address::times_8, true,  Address::times_2 },  // 1 + (1 + 8) * 2
+    { 21, Address::times_4, true,  Address::times_4 },  // 1 + (1 + 4) * 4
+    { 25, Address::times_4, false, Address::times_4 },  // (1 + 4) * (1 + 4)
+    { 27, Address::times_2, false, Address::times_8 },  // (1 + 2) * (1 + 8)
+    { 37, Address::times_8, true,  Address::times_4 },  // 1 + (1 + 8) * 4
+    { 41, Address::times_4, true,  Address::times_8 },  // 1 + (1 + 4) * 8
+    { 45, Address::times_4, false, Address::times_8 },  // (1 + 4) * (1 + 8)
+    { 73, Address::times_8, true,  Address::times_8 },  // 1 + (1 + 8) * 8
+    { 81, Address::times_8, false, Address::times_8 },  // (1 + 8) * (1 + 8)
+  };
+
+  // Lookup table
+  for (const LeaPattern& p : patterns) {
+    if (p.imm != imm) {
+      continue;
+    }
+
+    // The first LEA overwrites dst. If dst and src are the same register
+    // (the memory-operand multiply rules load into dst and pass it as src),
+    // the multiplicand is gone before a src-based second LEA could read it,
+    // which would yield a wrong product. Use IMUL in that case.
+    if (p.use_src_as_base && dst == src) {
+      break;
+    }
+
+    // First LEA: dst = src + src * scale1
+    lealq(bt, dst, Address(src, src, p.scale1));
+
+    // Second LEA: dst = base + dst * scale2
+    Register base = p.use_src_as_base ? src : dst;
+    lealq(bt, dst, Address(base, dst, p.scale2));
+    return;
+  }
+
+  // Fallback: no usable LEA pattern for this imm, use IMUL
+  imullq(bt, dst, src, imm);
+}
diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
index cd5f0ceb90074..0b517f3e3d70b 100644
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -584,4 +584,6 @@
 
   void reconstruct_frame_pointer(Register rtmp);
 
+  void imullq_imm(BasicType bt, Register dst, Register src, int32_t imm);
+
 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index 44f1a35d443ab..4407e63a0c20b 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -9823,3 +9823,25 @@ void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
     movzbl(dst, dst);
   }
 }
+
+// Multiply-by-immediate dispatcher: forwards to imulq when bt is T_LONG and
+// to imull otherwise (asserting that bt is then T_INT).
+void MacroAssembler::imullq(BasicType bt, Register dst, Register src, int32_t imm) {
+  if (bt == T_LONG) {
+    imulq(dst, src, imm);
+  } else {
+    assert(bt == T_INT, "Unexpected type");
+    imull(dst, src, imm);
+  }
+}
+
+// LEA dispatcher: forwards to leaq when bt is T_LONG and to leal otherwise
+// (asserting that bt is then T_INT).
+void MacroAssembler::lealq(BasicType bt, Register dst, Address src) {
+  if (bt == T_LONG) {
+    leaq(dst, src);
+  } else {
+    assert(bt == T_INT, "Unexpected type");
+    leal(dst, src);
+  }
+}
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index 695eea6ad0301..0be779724d4ab 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -2061,6 +2061,9 @@ class MacroAssembler: public Assembler {
   void save_legacy_gprs();
   void restore_legacy_gprs();
   void setcc(Assembler::Condition comparison, Register dst);
+
+  void imullq(BasicType bt, Register dst, Register src, int32_t imm);
+  void lealq(BasicType bt, Register dst, Address src);
 };
 
 #endif // CPU_X86_MACROASSEMBLER_X86_HPP
diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
index 42d2e815e45df..b724b095b8dd7 100644
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -11458,12 +11458,12 @@ instruct mulI_rReg_ndd(rRegI dst, rRegI src1, rRegI src2, rFlagsReg cr)
 instruct mulI_rReg_imm(rRegI dst, rRegI src, immI imm, rFlagsReg cr)
 %{
   match(Set dst (MulI src imm));
-  effect(KILL cr);
+  effect(KILL cr, TEMP_DEF dst);
 
   ins_cost(300);
-  format %{ "imull $dst, $src, $imm\t# int" %}
+  format %{ "imull_imm $dst, $src, $imm\t# int" %}
   ins_encode %{
-    __ imull($dst$$Register, $src$$Register, $imm$$constant);
+    __ imullq_imm(T_INT, $dst$$Register, $src$$Register, $imm$$constant);
   %}
   ins_pipe(ialu_reg_reg_alu0);
 %}
@@ -11500,12 +11500,13 @@ instruct mulI_rReg_rReg_mem_ndd(rRegI dst, rRegI src1, memory src2, rFlagsReg cr
instruct mulI_mem_imm(rRegI dst, memory src, immI imm, rFlagsReg cr) %{ match(Set dst (MulI (LoadI src) imm)); - effect(KILL cr); + effect(KILL cr, TEMP dst); ins_cost(300); format %{ "imull $dst, $src, $imm\t# int" %} ins_encode %{ - __ imull($dst$$Register, $src$$Address, $imm$$constant); + __ movl($dst$$Register, $src$$Address); + __ imullq_imm(T_INT, $dst$$Register, $dst$$Register, $imm$$constant); %} ins_pipe(ialu_reg_mem_alu0); %} @@ -11552,12 +11553,12 @@ instruct mulL_rReg_ndd(rRegL dst, rRegL src1, rRegL src2, rFlagsReg cr) instruct mulL_rReg_imm(rRegL dst, rRegL src, immL32 imm, rFlagsReg cr) %{ match(Set dst (MulL src imm)); - effect(KILL cr); + effect(KILL cr, TEMP_DEF dst); ins_cost(300); - format %{ "imulq $dst, $src, $imm\t# long" %} + format %{ "imulq_imm $dst, $src, $imm\t# long" %} ins_encode %{ - __ imulq($dst$$Register, $src$$Register, $imm$$constant); + __ imullq_imm(T_LONG, $dst$$Register, $src$$Register, $imm$$constant); %} ins_pipe(ialu_reg_reg_alu0); %} @@ -11594,12 +11595,13 @@ instruct mulL_rReg_rReg_mem_ndd(rRegL dst, rRegL src1, memory src2, rFlagsReg cr instruct mulL_mem_imm(rRegL dst, memory src, immL32 imm, rFlagsReg cr) %{ match(Set dst (MulL (LoadL src) imm)); - effect(KILL cr); + effect(KILL cr, TEMP dst); ins_cost(300); format %{ "imulq $dst, $src, $imm\t# long" %} ins_encode %{ - __ imulq($dst$$Register, $src$$Address, $imm$$constant); + __ movq($dst$$Register, $src$$Address); + __ imullq_imm(T_LONG, $dst$$Register, $dst$$Register, $imm$$constant); %} ins_pipe(ialu_reg_mem_alu0); %} diff --git a/test/hotspot/jtreg/compiler/c2/TestConstantMultiplier.java b/test/hotspot/jtreg/compiler/c2/TestConstantMultiplier.java new file mode 100644 index 0000000000000..db9faad5d5c69 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestConstantMultiplier.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8373480
+ * @summary Optimize multiplication by constant multiplier using LEA instructions
+ * @library /test/lib /
+ * @compile ../../compiler/lib/ir_framework/TestFramework.java
+ * @compile ../../compiler/lib/generators/Generators.java
+ * @compile ../../compiler/lib/verify/Verify.java
+ * @run driver compiler.c2.TestConstantMultiplier
+ */
+
+package compiler.c2;
+
+import java.util.List;
+import java.util.Set;
+import java.util.stream.IntStream;
+
+import compiler.lib.ir_framework.*;
+import compiler.lib.verify.*;
+import compiler.lib.ir_framework.Test;
+
+import compiler.lib.compile_framework.*;
+import compiler.lib.generators.Generators;
+
+import compiler.lib.template_framework.Template;
+import compiler.lib.template_framework.TemplateToken;
+import static compiler.lib.template_framework.Template.scope;
+import static compiler.lib.template_framework.Template.let;
+
+import compiler.lib.template_framework.library.TestFrameworkClass;
+
+public class TestConstantMultiplier {
+    // Renders the generated test class, compiles it and invokes its main twice (without and with "-Xbatch").
+    public static void main(String[] args) {
+        // Create a new CompileFramework instance.
+        CompileFramework comp = new CompileFramework();
+
+        // Add a java source file; the name must match the package/class passed to render() and invoke().
+        comp.addJavaSourceCode("c2.compiler.ConstantMultiplierTest", generate(comp));
+
+        // Compile the source file.
+        comp.compile();
+
+        comp.invoke("c2.compiler.ConstantMultiplierTest", "main", new Object[] {new String[] {}});
+
+        // We can also pass VM flags for the Test VM.
+        comp.invoke("c2.compiler.ConstantMultiplierTest", "main", new Object[] {new String[] {"-Xbatch"}});
+    }
+
+    // Generate a source Java file as String. For every multiplier in the list below it emits an
+    // int and a long @Test method (num * multiplier) plus a @Run method checking a random input.
+    public static String generate(CompileFramework comp) {
+        var testHeader = Template.make(() -> scope(
+            """
+            public static Random RANDOM = new Random(1023);
+
+            """
+        ));
+        var testTemplate = Template.make(() -> scope(
+            IntStream.of(81, 73, 45, 41, 37, 27, 25, 21, 19, 13, 11).mapToObj(
+                multiplier -> scope(
+                    let("multiplier", multiplier),
+                    """
+                    @Test
+                    @IR(applyIfPlatform = {"x64", "true"}, counts = {IRNode.X86_MULT_IMM_I, "1"})
+                    private static int testMultBy#{multiplier}I(int num) {
+                        return num * #{multiplier};
+                    }
+
+                    @Run(test = "testMultBy#{multiplier}I")
+                    private static void runMultBy#{multiplier}I() {
+                        int multiplicand = RANDOM.nextInt();
+                        Verify.checkEQ(#{multiplier} * multiplicand, testMultBy#{multiplier}I(multiplicand));
+                    }
+
+                    @Test
+                    @IR(applyIfPlatform = {"x64", "true"}, counts = {IRNode.X86_MULT_IMM_L, "1"})
+                    private static long testMultBy#{multiplier}L(long num) {
+                        return num * #{multiplier};
+                    }
+
+                    @Run(test = "testMultBy#{multiplier}L")
+                    private static void runMultBy#{multiplier}L() {
+                        long multiplicand = RANDOM.nextLong();
+                        Verify.checkEQ(#{multiplier} * multiplicand, testMultBy#{multiplier}L(multiplicand));
+                    }
+                    """
+            )).toList()
+        ));
+
+        var testClass = Template.make(() -> scope(
+            testHeader.asToken(),
+            testTemplate.asToken()
+        ));
+
+        List<TemplateToken> testTemplateTokens = List.of(testClass.asToken());
+
+        return TestFrameworkClass.render(
+            // package and class name.
+            "c2.compiler", "ConstantMultiplierTest",
+            // Set of imports.
+            Set.of("java.util.Random","compiler.lib.verify.*"),
+            // classpath, so the Test VM has access to the compiled class files.
+            comp.getEscapedClassPathOfCompiledClasses(),
+            // The list of tests.
+            testTemplateTokens);
+    }
+}
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 608027e7ee108..b073c50299205 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -2781,6 +2781,16 @@ public class IRNode {
         machOnlyNameRegex(VSTOREMASK_TRUECOUNT, "vstoremask_truecount_neon");
     }
 
+    public static final String X86_MULT_IMM_I = PREFIX + "X86_MULT_IMM_I" + POSTFIX;
+    static {
+        machOnlyNameRegex(X86_MULT_IMM_I, "mulI_rReg_imm");
+    }
+
+    public static final String X86_MULT_IMM_L = PREFIX + "X86_MULT_IMM_L" + POSTFIX;
+    static {
+        machOnlyNameRegex(X86_MULT_IMM_L, "mulL_rReg_imm");
+    }
+
     public static final String X86_SCONV_D2I = PREFIX + "X86_SCONV_D2I" + POSTFIX;
     static {
         machOnlyNameRegex(X86_SCONV_D2I, "convD2I_reg_reg");
diff --git a/test/micro/org/openjdk/bench/vm/compiler/ConstantMultiplierOptimization.java b/test/micro/org/openjdk/bench/vm/compiler/ConstantMultiplierOptimization.java
new file mode 100644
index 0000000000000..a797e6ecdc4df
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/ConstantMultiplierOptimization.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.lang.invoke.*;
+import java.util.Random;
+
+/** JMH benchmark summing products of a loop index with the constants 37, 25, 27, 19, 13 and 11, once with long and once with int arithmetic. */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MINUTES)
+@State(Scope.Thread)
+@Fork(value = 1, jvmArgs = {"-XX:LoopUnrollLimit=1"}) // NOTE(review): presumably keeps the benchmark loops from being unrolled - confirm
+public class ConstantMultiplierOptimization {
+    // int helpers: each returns its argument multiplied by the constant in its name.
+    public static int mul_by_25_I(int a) {
+        return a * 25;
+    }
+    public static int mul_by_27_I(int a) {
+        return a * 27;
+    }
+    public static int mul_by_37_I(int a) {
+        return a * 37;
+    }
+    public static int mul_by_19_I(int a) {
+        return a * 19;
+    }
+    public static int mul_by_13_I(int a) {
+        return a * 13;
+    }
+    public static int mul_by_11_I(int a) {
+        return a * 11;
+    }
+    // long helpers: same constants as the int helpers above, with long arithmetic.
+    public static long mul_by_25_L(long a) {
+        return a * 25;
+    }
+    public static long mul_by_27_L(long a) {
+        return a * 27;
+    }
+    public static long mul_by_37_L(long a) {
+        return a * 37;
+    }
+    public static long mul_by_19_L(long a) {
+        return a * 19;
+    }
+    public static long mul_by_13_L(long a) {
+        return a * 13;
+    }
+    public static long mul_by_11_L(long a) {
+        return a * 11;
+    }
+    // Adds all six long products for every i in [0, 100000000) and returns the sum.
+    @Benchmark
+    public long testConstMultiplierL() {
+        long res = 0;
+        for (long i = 0 ; i < 100000000; i++) {
+            res += mul_by_37_L(i);
+            res += mul_by_25_L(i);
+            res += mul_by_27_L(i);
+            res += mul_by_19_L(i);
+            res += mul_by_13_L(i);
+            res += mul_by_11_L(i);
+        }
+        return res;
+    }
+    // Same loop with int arithmetic; the products and the int sum wrap around on overflow.
+    @Benchmark
+    public int testConstMultiplierI() {
+        int res = 0;
+        for (int i = 0 ; i < 100000000; i++) {
+            res += mul_by_37_I(i);
+            res += mul_by_25_I(i);
+            res += mul_by_27_I(i);
+            res += mul_by_19_I(i);
+            res += mul_by_13_I(i);
+            res += mul_by_11_I(i);
+        }
+        return res;
+    }
+}