From 8aa5442e48a0ef5c863a9c709828b2e818a58443 Mon Sep 17 00:00:00 2001 From: Artemiy Volkov Date: Tue, 9 Jul 2024 08:10:29 +0200 Subject: [PATCH 1/6] RISC-V: Add Synopsys RMX-100 series pipeline description. This patch introduces the pipeline description for the Synopsys RMX-100 series processor to the RISC-V GCC backend. The RMX-100 has a short, three-stage, in-order execution pipeline with configurable multiply unit options. The option -mmpy-option= is added to select which version of the MPY unit the core has and, with it, the latency of multiply instructions, similar to ARCv2 cores (see gcc/config/arc/arc.opt:60). gcc/ChangeLog: * config/riscv/riscv-cores.def (RISCV_TUNE): Add arc-v-rmx-100-series. * config/riscv/riscv-opts.h (enum riscv_microarchitecture_type): Add arcv_rmx100. (enum arcv_mpy_option_enum): New enum for ARC-V multiply options. * config/riscv/riscv-protos.h (arcv_mpy_1c_bypass_p): New declaration. (arcv_mpy_2c_bypass_p): New declaration. (arcv_mpy_10c_bypass_p): New declaration. * config/riscv/riscv.cc (arcv_mpy_1c_bypass_p): New function. (arcv_mpy_2c_bypass_p): New function. (arcv_mpy_10c_bypass_p): New function. * config/riscv/riscv.md: Add arcv_rmx100. * config/riscv/riscv.opt: New option for RMX-100 multiply unit configuration. * doc/riscv-mtune.texi: Document arc-v-rmx-100-series. * config/riscv/arcv-rmx100.md: New file. 
Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Signed-off-by: Luis Silva --- gcc/config/riscv/arcv-rmx100.md | 104 +++++++++++++++++++++++++++++++ gcc/config/riscv/riscv-cores.def | 1 + gcc/config/riscv/riscv-opts.h | 8 +++ gcc/config/riscv/riscv-protos.h | 4 ++ gcc/config/riscv/riscv.cc | 49 +++++++++++++++ gcc/config/riscv/riscv.md | 4 +- gcc/config/riscv/riscv.opt | 17 +++++ gcc/doc/riscv-mtune.texi | 2 + 8 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 gcc/config/riscv/arcv-rmx100.md diff --git a/gcc/config/riscv/arcv-rmx100.md b/gcc/config/riscv/arcv-rmx100.md new file mode 100644 index 000000000000..bd6423b9a28c --- /dev/null +++ b/gcc/config/riscv/arcv-rmx100.md @@ -0,0 +1,104 @@ +;; DFA scheduling description of the Synopsys RMX-100 cpu +;; for GNU C compiler +;; Copyright (C) 2025 Free Software Foundation, Inc. + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "arcv_rmx100") + +(define_cpu_unit "arcv_rmx100_ALU" "arcv_rmx100") +(define_cpu_unit "arcv_rmx100_FPU" "arcv_rmx100") +(define_cpu_unit "arcv_rmx100_MPY" "arcv_rmx100") +(define_cpu_unit "arcv_rmx100_DIV" "arcv_rmx100") +(define_cpu_unit "arcv_rmx100_DMP" "arcv_rmx100") + +;; Instruction reservation for arithmetic instructions. 
+(define_insn_reservation "arcv_rmx100_alu_arith" 1
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "unknown, const, arith, shift, slt, multi, auipc, nop,
+                        logical, move, atomic, mvpair, bitmanip, clz, ctz, cpop,
+                        zicond, condmove, clmul, min, max, minu, maxu, rotate"))
+  "arcv_rmx100_ALU")
+
+;; Branches, jumps, calls, returns and traps are modelled on the ALU unit
+;; with a latency of one cycle.
+(define_insn_reservation "arcv_rmx100_jmp_insn" 1
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "branch, jump, call, jalr, ret, trap"))
+  "arcv_rmx100_ALU")
+
+;; DIV insn: the DIV unit is reserved for 35 cycles and the default latency
+;; is 35.  The define_bypass entries at the end of this file lower the
+;; latency to 9 when arcv_mpy_1c_bypass_p or arcv_mpy_2c_bypass_p returns
+;; true.
+(define_insn_reservation "arcv_rmx100_div_insn" 35
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "idiv"))
+  "arcv_rmx100_DIV*35")
+
+;; MPY insn: the default latency is 9.  The define_bypass entries at the end
+;; of this file lower it to 1 when arcv_mpy_1c_bypass_p returns true and to
+;; 2 when arcv_mpy_2c_bypass_p returns true.
+(define_insn_reservation "arcv_rmx100_mpy32_insn" 9
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "imul"))
+  "arcv_rmx100_MPY")
+
+;; Integer loads reserve the DMP unit in the first cycle only; the result
+;; has a latency of 3 cycles.
+(define_insn_reservation "arcv_rmx100_load_insn" 3
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "load"))
+  "arcv_rmx100_DMP,nothing*2")
+
+;; Integer and floating-point stores both go through the DMP unit.
+(define_insn_reservation "arcv_rmx100_store_insn" 1
+  (and (eq_attr "tune" "arcv_rmx100")
+       (eq_attr "type" "store,fpstore"))
+  "arcv_rmx100_DMP")
+
+;; FPU scheduling.  FIXME: This is based on the "fast" unit for now, the "slow"
+;; option remains to be implemented later (together with the -mfpu flag).
+ +(define_insn_reservation "arcv_rmx100_fpload_insn" 3 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fpload")) + "arcv_rmx100_DMP,nothing*2") + +(define_insn_reservation "arcv_rmx100_farith_insn" 2 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fadd,fcmp")) + "arcv_rmx100_FPU,nothing") + +(define_insn_reservation "arcv_rmx100_xfer" 1 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fmove,mtc,mfc,fcvt,fcvt_f2i,fcvt_i2f")) + "arcv_rmx100_FPU") + +(define_insn_reservation "arcv_rmx100_fmul_insn" 2 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fmul")) + "arcv_rmx100_FPU,nothing") + +(define_insn_reservation "arcv_rmx100_fmac_insn" 2 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fmadd")) + "arcv_rmx100_FPU,nothing") + +(define_insn_reservation "arcv_rmx100_fdiv_insn" 10 + (and (eq_attr "tune" "arcv_rmx100") + (eq_attr "type" "fdiv,fsqrt")) + "arcv_rmx100_FPU") + + +(define_bypass 1 "arcv_rmx100_mpy32_insn" + "arcv_rmx100_*" "arcv_mpy_1c_bypass_p") +(define_bypass 2 "arcv_rmx100_mpy32_insn" + "arcv_rmx100_*" "arcv_mpy_2c_bypass_p") + +(define_bypass 9 "arcv_rmx100_div_insn" "arcv_rmx100_*" "arcv_mpy_1c_bypass_p") +(define_bypass 9 "arcv_rmx100_div_insn" "arcv_rmx100_*" "arcv_mpy_2c_bypass_p") diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index 7266b5eac113..e12871211015 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -51,6 +51,7 @@ RISCV_TUNE("xt-c920v2", generic, generic_ooo_tune_info) RISCV_TUNE("xiangshan-nanhu", xiangshan, xiangshan_nanhu_tune_info) RISCV_TUNE("xiangshan-kunminghu", xiangshan, generic_ooo_tune_info) RISCV_TUNE("spacemit-x60", spacemit_x60, spacemit_x60_tune_info) +RISCV_TUNE("arc-v-rmx-100-series", arcv_rmx100, arcv_rmx100_tune_info) RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info) RISCV_TUNE("size", generic, optimize_size_tune_info) RISCV_TUNE("mips-p8700", mips_p8700, mips_p8700_tune_info) diff --git 
a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 9b92a965e27f..bcfc7a642bc4 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -65,6 +65,7 @@ enum riscv_microarchitecture_type { andes_23_series, andes_45_series, spacemit_x60, + arcv_rmx100, }; extern enum riscv_microarchitecture_type riscv_microarchitecture; @@ -89,6 +90,13 @@ enum rvv_max_lmul_enum { RVV_DYNAMIC = 9 }; +/* ARC-V multiply option. */ +enum arcv_mpy_option_enum { + ARCV_MPY_OPTION_1C = 1, + ARCV_MPY_OPTION_2C = 2, + ARCV_MPY_OPTION_10C = 8, +}; + enum riscv_multilib_select_kind { /* Select multilib by builtin way. */ select_by_builtin, diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index abf9df77891f..fa5d906d2059 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -839,6 +839,10 @@ extern const char *th_output_move (rtx, rtx); extern bool th_print_operand_address (FILE *, machine_mode, rtx); #endif +extern bool arcv_mpy_1c_bypass_p (rtx_insn *, rtx_insn *); +extern bool arcv_mpy_2c_bypass_p (rtx_insn *, rtx_insn *); +extern bool arcv_mpy_10c_bypass_p (rtx_insn *, rtx_insn *); + extern bool strided_load_broadcast_p (void); extern bool riscv_prefer_agnostic_p (void); extern bool riscv_use_divmod_expander (void); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 96519c96a2b4..ca29db4ffeef 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -834,6 +834,31 @@ static const struct riscv_tune_param andes_45_tune_info = { true, /* prefer-agnostic. */ }; +/* Costs to use when optimizing for Synopsys RMX-100. 
*/ +static const struct riscv_tune_param arcv_rmx100_tune_info = { + {COSTS_N_INSNS (2), COSTS_N_INSNS (2)}, /* fp_add */ + {COSTS_N_INSNS (2), COSTS_N_INSNS (2)}, /* fp_mul */ + {COSTS_N_INSNS (17), COSTS_N_INSNS (17)}, /* fp_div */ + {COSTS_N_INSNS (2), COSTS_N_INSNS (2)}, /* int_mul */ + {COSTS_N_INSNS (17), COSTS_N_INSNS (17)}, /* int_div */ + 1, /* issue_rate */ + 4, /* branch_cost */ + 2, /* memory_cost */ + 4, /* fmv_cost */ + false, /* slow_unaligned_access */ + false, /* vector_unaligned_access */ + false, /* use_divmod_expansion */ + false, /* overlap_op_by_pieces */ + true, /* use_zero_stride_load */ + false, /* speculative_sched_vsetvl */ + RISCV_FUSE_NOTHING, /* fusible_ops */ + NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ + true, /* prefer-agnostic. */ +}; + static bool riscv_avoid_shrink_wrapping_separate (); static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *); static tree riscv_handle_type_attribute (tree *, tree, tree, int, bool *); @@ -10583,6 +10608,30 @@ riscv_store_data_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) return store_data_bypass_p (out_insn, in_insn); } +/* Implement one boolean function for each of the values of the + arcv_mpy_option enum, for the needs of arcv-rmx100.md. */ + +bool +arcv_mpy_1c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_1C; +} + +bool +arcv_mpy_2c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_2C; +} + +bool +arcv_mpy_10c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_10C; +} + /* Implement TARGET_SECONDARY_MEMORY_NEEDED. 
When floating-point registers are wider than integer ones, moves between diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 6f8cd26e5c95..8cc5dbd8efab 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -674,7 +674,8 @@ ;; Keep this in sync with enum riscv_microarchitecture. (define_attr "tune" "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo,mips_p8700, - tt_ascalon_d8,andes_25_series,andes_23_series,andes_45_series,spacemit_x60" + tt_ascalon_d8,andes_25_series,andes_23_series,andes_45_series,spacemit_x60, + arcv_rmx100" (const (symbol_ref "((enum attr_tune) riscv_microarchitecture)"))) ;; Describe a user's asm statement. @@ -4993,3 +4994,4 @@ (include "andes-25-series.md") (include "andes-45-series.md") (include "spacemit-x60.md") +(include "arcv-rmx100.md") diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 452062c65008..5ed35b7fb497 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -412,3 +412,20 @@ Specifies whether the fence.tso instruction should be used. mautovec-segment Target Integer Var(riscv_mautovec_segment) Init(1) Enable (default) or disable generation of vector segment load/store instructions. + +Enum +Name(arcv_mpy_option) Type(enum arcv_mpy_option_enum) +Valid arguments to -mmpy-option=: + +EnumValue +Enum(arcv_mpy_option) String(1c) Value(ARCV_MPY_OPTION_1C) + +EnumValue +Enum(arcv_mpy_option) String(2c) Value(ARCV_MPY_OPTION_2C) + +EnumValue +Enum(arcv_mpy_option) String(10c) Value(ARCV_MPY_OPTION_10C) + +mmpy-option= +Target RejectNegative Joined Enum(arcv_mpy_option) Var(arcv_mpy_option) Init(ARCV_MPY_OPTION_2C) +The type of MPY unit used by the RMX-100 core (to be used in combination with -mtune=arc-v-rmx-100-series) (default: 2c). diff --git a/gcc/doc/riscv-mtune.texi b/gcc/doc/riscv-mtune.texi index 3e61d11462a9..c9c2fa62dd3b 100644 --- a/gcc/doc/riscv-mtune.texi +++ b/gcc/doc/riscv-mtune.texi @@ -52,6 +52,8 @@ particular CPU name. 
Permissible values for this option are: @samp{spacemit-x60}, +@samp{arc-v-rmx-100-series}, + @samp{generic-ooo}, @samp{size}, From 19667e132d7a64ac6c0878ab1ec2d21ba0543ac2 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Fri, 28 Nov 2025 18:56:06 +0100 Subject: [PATCH 2/6] RISC-V: Add Synopsys RHX-100 series pipeline description. This patch introduces the pipeline description for the Synopsys RHX-100 series processor to the RISC-V GCC backend. The RHX-100 features a 10-stage, dual-issue, in-order execution pipeline architecture. It has support for instruction fusion, which will be addressed by subsequent patches. gcc/ChangeLog: * config/riscv/riscv-cores.def (RISCV_TUNE): Add arc-v-rhx-100-series. * config/riscv/riscv-opts.h (enum riscv_microarchitecture_type): Add arcv_rhx100. * config/riscv/riscv.cc (enum riscv_fusion_pairs): Add RISCV_FUSE_ARCV. * config/riscv/riscv.md: Add arcv_rhx100 to tune attribute. * doc/riscv-mtune.texi: Add RHX-100 documentation. * config/riscv/arcv-rhx100.md: New file. Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Signed-off-by: Luis Silva --- gcc/config/riscv/arcv-rhx100.md | 96 ++++++++++++++++++++++++++++++++ gcc/config/riscv/riscv-cores.def | 1 + gcc/config/riscv/riscv-opts.h | 1 + gcc/config/riscv/riscv.cc | 26 +++++++++ gcc/config/riscv/riscv.md | 3 +- gcc/doc/riscv-mtune.texi | 2 + 6 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 gcc/config/riscv/arcv-rhx100.md diff --git a/gcc/config/riscv/arcv-rhx100.md b/gcc/config/riscv/arcv-rhx100.md new file mode 100644 index 000000000000..c0631a17a280 --- /dev/null +++ b/gcc/config/riscv/arcv-rhx100.md @@ -0,0 +1,96 @@ +;; DFA scheduling description of the Synopsys RHX-100 cpu +;; for GNU C compiler +;; Copyright (C) 2025 Free Software Foundation, Inc. + +;; This file is part of GCC. 
+ +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "arcv_rhx100") + +(define_cpu_unit "arcv_rhx100_ALU_A_fuse0_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_A_fuse1_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_B_fuse0_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_ALU_B_fuse1_early" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_MPY32" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DIV" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DMP_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_DMP_fuse1" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_fdivsqrt" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueA_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueA_fuse1" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueB_fuse0" "arcv_rhx100") +(define_cpu_unit "arcv_rhx100_issueB_fuse1" "arcv_rhx100") + +;; Instruction reservation for arithmetic instructions (pipe A, pipe B). 
+;; Any of the four issue/ALU unit pairs (pipe A or B, fuse0 or fuse1) may be
+;; used.  "rotate" is listed here, as in arcv-rmx100.md, so that rotate
+;; insns have a reservation when tuning for this core.
+(define_insn_reservation "arcv_rhx100_alu_early_arith" 1
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "unknown,move,const,arith,shift,slt,multi,auipc,nop,logical,\
+                bitmanip,min,max,minu,maxu,clz,ctz,rotate,atomic,\
+                condmove,mvpair,zicond,cpop,clmul"))
+  "((arcv_rhx100_issueA_fuse0 + arcv_rhx100_ALU_A_fuse0_early) | (arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse1_early)) | ((arcv_rhx100_issueB_fuse0 + arcv_rhx100_ALU_B_fuse0_early) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse1_early))")
+
+;; Control transfers only use the issue units of pipe A.
+(define_insn_reservation "arcv_rhx100_jmp_insn" 1
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "branch,jump,call,jalr,ret,trap"))
+  "arcv_rhx100_issueA_fuse0 | arcv_rhx100_issueA_fuse1")
+
+;; Integer divide: issues through issueA_fuse0 together with the DIV unit,
+;; both reserved in the first cycle only; latency is 12 cycles.
+(define_insn_reservation "arcv_rhx100_div_insn" 12
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "idiv"))
+  "arcv_rhx100_issueA_fuse0 + arcv_rhx100_DIV, nothing*11")
+
+;; Integer multiply: issues through issueA_fuse0 together with the MPY32
+;; unit; latency is 4 cycles unless overridden by a define_bypass.
+(define_insn_reservation "arcv_rhx100_mpy32_insn" 4
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "imul"))
+  "arcv_rhx100_issueA_fuse0 + arcv_rhx100_MPY32, nothing*3")
+
+;; Loads (integer and floating-point) use an issue unit of pipe B together
+;; with the matching DMP unit.
+(define_insn_reservation "arcv_rhx100_load_insn" 3
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "load,fpload"))
+  "(arcv_rhx100_issueB_fuse0 + arcv_rhx100_DMP_fuse0) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_DMP_fuse1)")
+
+;; Stores use the same units as loads.
+(define_insn_reservation "arcv_rhx100_store_insn" 1
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "store,fpstore"))
+  "(arcv_rhx100_issueB_fuse0 + arcv_rhx100_DMP_fuse0) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_DMP_fuse1)")
+
+;; (soft) floating points
+(define_insn_reservation "arcv_rhx100_xfer" 3
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "mfc,mtc,fcvt,fcvt_i2f,fcvt_f2i,fmove,fcmp"))
+  "(arcv_rhx100_ALU_A_fuse0_early | arcv_rhx100_ALU_B_fuse0_early), nothing*2")
+
+(define_insn_reservation "arcv_rhx100_fmul" 5
+  (and (eq_attr "tune" "arcv_rhx100")
+       (eq_attr "type" "fadd,fmul,fmadd"))
+  "(arcv_rhx100_ALU_A_fuse0_early | arcv_rhx100_ALU_B_fuse0_early)")
+
+(define_insn_reservation "arcv_rhx100_fdiv" 20 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "fdiv,fsqrt")) + "arcv_rhx100_fdivsqrt*20") + +;; Bypasses +(define_bypass 1 "arcv_rhx100_alu_early_arith" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") + +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_store_insn" "riscv_store_data_bypass_p") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_alu_early_arith") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_mpy*_insn") +(define_bypass 2 "arcv_rhx100_load_insn" "arcv_rhx100_load_insn") +(define_bypass 1 "arcv_rhx100_load_insn" "arcv_rhx100_div_insn") + +(define_bypass 3 "arcv_rhx100_mpy32_insn" "arcv_rhx100_mpy*_insn") +(define_bypass 3 "arcv_rhx100_mpy32_insn" "arcv_rhx100_div_insn") diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index e12871211015..b7acc33f498b 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -52,6 +52,7 @@ RISCV_TUNE("xiangshan-nanhu", xiangshan, xiangshan_nanhu_tune_info) RISCV_TUNE("xiangshan-kunminghu", xiangshan, generic_ooo_tune_info) RISCV_TUNE("spacemit-x60", spacemit_x60, spacemit_x60_tune_info) RISCV_TUNE("arc-v-rmx-100-series", arcv_rmx100, arcv_rmx100_tune_info) +RISCV_TUNE("arc-v-rhx-100-series", arcv_rhx100, arcv_rhx100_tune_info) RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info) RISCV_TUNE("size", generic, optimize_size_tune_info) RISCV_TUNE("mips-p8700", mips_p8700, mips_p8700_tune_info) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index bcfc7a642bc4..ef36c879adab 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -66,6 +66,7 @@ enum riscv_microarchitecture_type { andes_45_series, spacemit_x60, arcv_rmx100, + arcv_rhx100, }; extern enum riscv_microarchitecture_type riscv_microarchitecture; diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ca29db4ffeef..2e0ad70730d3 100644 --- 
a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -291,6 +291,7 @@ enum riscv_fusion_pairs RISCV_FUSE_BFEXT = (1 << 11), RISCV_FUSE_EXPANDED_LD = (1 << 12), RISCV_FUSE_B_ALUI = (1 << 13), + RISCV_FUSE_ARCV = (1 << 14), }; /* Costs of various operations on the different architectures. */ @@ -859,6 +860,31 @@ static const struct riscv_tune_param arcv_rmx100_tune_info = { true, /* prefer-agnostic. */ }; +/* Costs to use when optimizing for Synopsys RHX-100. */ +static const struct riscv_tune_param arcv_rhx100_tune_info = { + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_add */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (5)}, /* fp_mul */ + {COSTS_N_INSNS (20), COSTS_N_INSNS (20)}, /* fp_div */ + {COSTS_N_INSNS (4), COSTS_N_INSNS (4)}, /* int_mul */ + {COSTS_N_INSNS (27), COSTS_N_INSNS (43)}, /* int_div */ + 4, /* issue_rate */ + 9, /* branch_cost */ + 2, /* memory_cost */ + 8, /* fmv_cost */ + false, /* slow_unaligned_access */ + false, /* vector_unaligned_access */ + false, /* use_divmod_expansion */ + false, /* overlap_op_by_pieces */ + true, /* use_zero_stride_load */ + false, /* speculative_sched_vsetvl */ + RISCV_FUSE_ARCV, /* fusible_ops */ + NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ + true, /* prefer-agnostic. 
*/ +}; + static bool riscv_avoid_shrink_wrapping_separate (); static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, bool *); static tree riscv_handle_type_attribute (tree *, tree, tree, int, bool *); diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 8cc5dbd8efab..b063a40a8107 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -675,7 +675,7 @@ (define_attr "tune" "generic,sifive_7,sifive_p400,sifive_p600,xiangshan,generic_ooo,mips_p8700, tt_ascalon_d8,andes_25_series,andes_23_series,andes_45_series,spacemit_x60, - arcv_rmx100" + arcv_rmx100,arcv_rhx100" (const (symbol_ref "((enum attr_tune) riscv_microarchitecture)"))) ;; Describe a user's asm statement. @@ -4995,3 +4995,4 @@ (include "andes-45-series.md") (include "spacemit-x60.md") (include "arcv-rmx100.md") +(include "arcv-rhx100.md") diff --git a/gcc/doc/riscv-mtune.texi b/gcc/doc/riscv-mtune.texi index c9c2fa62dd3b..8370f63029cc 100644 --- a/gcc/doc/riscv-mtune.texi +++ b/gcc/doc/riscv-mtune.texi @@ -54,6 +54,8 @@ particular CPU name. Permissible values for this option are: @samp{arc-v-rmx-100-series}, +@samp{arc-v-rhx-100-series}, + @samp{generic-ooo}, @samp{size}, From 87b83a7a5547a481c932c4924b200151446117ea Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Fri, 28 Nov 2025 19:11:30 +0100 Subject: [PATCH 3/6] RISC-V: Implement riscv_macro_fusion_pair_p for Synopsys RHX-100 series. This patch implements instruction fusion support for the Synopsys RHX-100 processor by adding the arcv_macro_fusion_pair_p function and supporting infrastructure. The implementation supports fusion of several instruction patterns: multiply-add sequences, shift-based bit extraction, load-immediate with conditional branches, adjacent memory operations, memory operations with arithmetic instructions, memory operations with LUI instructions, and load-immediate with store operations. 
A new arcv.cc file is added to contain ARC-V specific optimizations, and the existing multiply bypass functions are moved from riscv.cc to this new file for better organization. gcc/ChangeLog: * config.gcc: Add arcv.o to extra_objs. * config/riscv/riscv.cc (arcv_mpy_1c_bypass_p): Move to arcv.cc (arcv_mpy_2c_bypass_p): Move to arcv.cc (arcv_mpy_10c_bypass_p): Move to arcv.cc (riscv_macro_fusion_pair_p): New function. * config/riscv/t-riscv: Add arcv.o build rule. * config/riscv/arcv.cc: New file. Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Signed-off-by: Luis Silva --- gcc/config.gcc | 2 +- gcc/config/riscv/arcv.cc | 523 ++++++++++++++++++++++++++++++++ gcc/config/riscv/riscv-protos.h | 3 + gcc/config/riscv/riscv.cc | 27 +- gcc/config/riscv/riscv.h | 4 + gcc/config/riscv/t-riscv | 9 + 6 files changed, 543 insertions(+), 25 deletions(-) create mode 100644 gcc/config/riscv/arcv.cc diff --git a/gcc/config.gcc b/gcc/config.gcc index 836cdff6317b..87c029d0e87e 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -560,7 +560,7 @@ riscv*) extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o" extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o riscv-avlprop.o riscv-vect-permconst.o" extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o sifive-vector-builtins-bases.o andes-vector-builtins-bases.o" - extra_objs="${extra_objs} thead.o riscv-target-attr.o riscv-zicfilp.o riscv-bclr-lowest-set-bit.o riscv-opt-popretz.o" + extra_objs="${extra_objs} thead.o riscv-target-attr.o riscv-zicfilp.o riscv-bclr-lowest-set-bit.o riscv-opt-popretz.o arcv.o" d_target_objs="riscv-d.o" extra_headers="riscv_vector.h riscv_crypto.h riscv_bitmanip.h riscv_th_vector.h sifive_vector.h andes_vector.h" target_gtfiles="$target_gtfiles \$(srcdir)/config/riscv/riscv-vector-builtins.cc" diff --git a/gcc/config/riscv/arcv.cc b/gcc/config/riscv/arcv.cc new file 
mode 100644 index 000000000000..f477113a98de --- /dev/null +++ b/gcc/config/riscv/arcv.cc @@ -0,0 +1,523 @@ +/* Subroutines used for code generation for Synopsys ARC-V processors. + Copyright (C) 2025 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "tm.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "diagnostic-core.h" +#include "stor-layout.h" +#include "alias.h" +#include "fold-const.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "explow.h" +#include "calls.h" +#include "varasm.h" +#include "expr.h" +#include "tm_p.h" +#include "df.h" +#include "reload.h" +#include "sched-int.h" +#include "tm-constrs.h" + +/* Implement one boolean function for each of the values of the + arcv_mpy_option enum, for the needs of arcv-rmx100.md. 
*/ + +bool +arcv_mpy_1c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_1C; +} + +bool +arcv_mpy_2c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_2C; +} + +bool +arcv_mpy_10c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, + rtx_insn *in_insn ATTRIBUTE_UNUSED) +{ + return arcv_mpy_option == ARCV_MPY_OPTION_10C; +} + +/* Return TRUE if the target microarchitecture supports macro-op + fusion for two memory operations of mode MODE (the direction + of transfer is determined by the IS_LOAD parameter). */ + +static bool +arcv_pair_fusion_mode_allowed_p (machine_mode mode, bool is_load) +{ + if (!TARGET_ARCV_RHX100) + return true; + + return ((is_load && (mode == SImode + || mode == HImode + || mode == QImode)) + || (!is_load && mode == SImode)); +} + +/* Return TRUE if two addresses can be fused. */ + +static bool +arcv_fused_addr_p (rtx addr0, rtx addr1, bool is_load) +{ + rtx base0, base1, tmp; + HOST_WIDE_INT off0 = 0, off1 = 0; + + if (GET_CODE (addr0) == SIGN_EXTEND || GET_CODE (addr0) == ZERO_EXTEND) + addr0 = XEXP (addr0, 0); + + if (GET_CODE (addr1) == SIGN_EXTEND || GET_CODE (addr1) == ZERO_EXTEND) + addr1 = XEXP (addr1, 0); + + if (!MEM_P (addr0) || !MEM_P (addr1)) + return false; + + /* Require the accesses to have the same mode. */ + if (GET_MODE (addr0) != GET_MODE (addr1)) + return false; + + /* Check if the mode is allowed. 
*/ + if (!arcv_pair_fusion_mode_allowed_p (GET_MODE (addr0), is_load)) + return false; + + rtx reg0 = XEXP (addr0, 0); + rtx reg1 = XEXP (addr1, 0); + + if (GET_CODE (reg0) == PLUS) + { + base0 = XEXP (reg0, 0); + tmp = XEXP (reg0, 1); + if (!CONST_INT_P (tmp)) + return false; + off0 = INTVAL (tmp); + } + else if (REG_P (reg0)) + base0 = reg0; + else + return false; + + if (GET_CODE (reg1) == PLUS) + { + base1 = XEXP (reg1, 0); + tmp = XEXP (reg1, 1); + if (!CONST_INT_P (tmp)) + return false; + off1 = INTVAL (tmp); + } + else if (REG_P (reg1)) + base1 = reg1; + else + return false; + + /* Check if we have the same base. */ + gcc_assert (REG_P (base0) && REG_P (base1)); + if (REGNO (base0) != REGNO (base1)) + return false; + + /* Fuse adjacent aligned addresses. */ + if ((off0 % GET_MODE_SIZE (GET_MODE (addr0)).to_constant () == 0) + && (abs (off1 - off0) == GET_MODE_SIZE (GET_MODE (addr0)).to_constant ())) + return true; + + return false; +} + +/* Helper function to check if instruction type is arithmetic-like. */ + +static bool +arcv_arith_type_insn_p (rtx_insn *insn) +{ + enum attr_type type = get_attr_type (insn); + + return (type == TYPE_ARITH + || type == TYPE_LOGICAL + || type == TYPE_SHIFT + || type == TYPE_SLT + || type == TYPE_BITMANIP + || type == TYPE_MIN + || type == TYPE_MAX + || type == TYPE_MINU + || type == TYPE_MAXU + || type == TYPE_CLZ + || type == TYPE_CTZ); +} + +/* Helper to check if curr's source operand is valid for fusion. */ + +static bool +arcv_arith_src_p (rtx curr_set) +{ + rtx src = SET_SRC (curr_set); + + /* Immediate operand or register operand. */ + return CONST_INT_P (src) || REG_P (XEXP (src, 0)); +} + +/* Helper to check if curr operation is compatible with load's destination. 
*/
+
+static bool
+arcv_load_arith_pair_p (rtx prev_set, rtx curr_set)
+{
+  /* PREV_SET is the SET of the load and CURR_SET the SET of the arithmetic
+     insn.  Return true if the pair may be fused, false otherwise.
+
+     The caller filters CURR_SET through arcv_arith_src_p, which also
+     accepts a bare CONST_INT source.  A CONST_INT has no rtx operands, so
+     reject it before looking at XEXP (src, 0).  */
+  rtx src = SET_SRC (curr_set);
+  if (CONST_INT_P (src))
+    return false;
+
+  /* Only a plain MEM source is handled; anything else (for example an
+     extension wrapped around the MEM) is not matched, exactly as before.
+     NOTE(review): arcv_fused_addr_p strips SIGN_EXTEND/ZERO_EXTEND before
+     looking at the MEM; confirm whether extending loads should be accepted
+     here as well.  */
+  rtx mem = SET_SRC (prev_set);
+  if (!MEM_P (mem))
+    return false;
+
+  rtx load_addr = XEXP (mem, 0);
+  rtx load_dest = SET_DEST (prev_set);
+  rtx arith_src = XEXP (src, 0);
+  rtx arith_dest = SET_DEST (curr_set);
+
+  /* Address register must be a register.  */
+  if (!REG_P (load_addr))
+    return false;
+
+  /* REGNO is only valid on REG rtxes, so give up on anything else.  */
+  if (!REG_P (load_dest) || !REG_P (arith_src) || !REG_P (arith_dest))
+    return false;
+
+  /* Address register must match first source operand of arithmetic op.  */
+  if (REGNO (load_addr) != REGNO (arith_src))
+    return false;
+
+  /* Address register must not be the load destination (no clobber).  */
+  if (REGNO (load_addr) == REGNO (load_dest))
+    return false;
+
+  /* Load and arithmetic destinations must be different.  */
+  if (REGNO (load_dest) == REGNO (arith_dest))
+    return false;
+
+  /* Check operand constraints for different arithmetic formats.  */
+
+  /* Unary operation: (set (reg:X rd1) (not (reg:X rs1))).  */
+  if (GET_RTX_LENGTH (GET_CODE (src)) == 1)
+    return true;
+
+  /* Immediate operation: (set (reg:X rd2) (op (reg:X rs1) (const_int))).  */
+  rtx op1 = XEXP (src, 1);
+  if (CONST_INT_P (op1))
+    return true;
+
+  /* Binary register operation: the second source must be a register other
+     than the load destination.  */
+  return REG_P (op1) && REGNO (load_dest) != REGNO (op1);
+}
+
+/* Helper to check if curr operation is compatible with store's address.  */
+
+static bool
+arcv_store_arith_pair_p (rtx prev_set, rtx curr_set)
+{
+  rtx store_addr = XEXP (SET_DEST (prev_set), 0);
+  rtx arith_src = XEXP (SET_SRC (curr_set), 0);
+
+  /* Address register must be a register.  */
+  if (!REG_P (store_addr))
+    return false;
+
+  /* Address register must match first source operand of arithmetic op.  */
+  if (REGNO (store_addr) != REGNO (arith_src))
+    return false;
+
+  /* Check operand constraints for different arithmetic formats.  */
+  rtx src = SET_SRC (curr_set);
+
+  /* Unary operation.  */
+  if (GET_RTX_LENGTH (GET_CODE (src)) == 1)
+    return true;
+
+  /* Immediate operation. 
*/
+  if (CONST_INT_P (XEXP (src, 1)))
+    return true;
+
+  /* Binary register operation: store addr == second source is OK.  */
+  if (REGNO (store_addr) == REGNO (XEXP (src, 1)))
+    return true;
+
+  return false;
+}
+
+/* Return true if PREV and CURR constitute an ordered load/store + op/opimm
+   pair, for the purposes of ARCV-specific macro-op fusion.  */
+static bool
+arcv_memop_arith_pair_p (rtx_insn *prev, rtx_insn *curr)
+{
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+
+  gcc_assert (prev_set);
+  gcc_assert (curr_set);
+
+  /* Check if curr is an arithmetic-type instruction.  */
+  if (!arcv_arith_type_insn_p (curr))
+    return false;
+
+  /* Check if curr has valid source operands.  */
+  if (!arcv_arith_src_p (curr_set))
+    return false;
+
+  /* Check for load + arithmetic fusion.  */
+  if (get_attr_type (prev) == TYPE_LOAD)
+    return arcv_load_arith_pair_p (prev_set, curr_set);
+
+  /* Check for store + arithmetic fusion.  */
+  if (get_attr_type (prev) == TYPE_STORE)
+    return arcv_store_arith_pair_p (prev_set, curr_set);
+
+  return false;
+}
+
+
+/* Return true if PREV and CURR constitute an ordered load/store + lui pair, for
+   the purposes of ARCV-specific macro-op fusion.  Both insns must be
+   single sets (asserted below).  */
+
+static bool
+arcv_memop_lui_pair_p (rtx_insn *prev, rtx_insn *curr)
+{
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+
+  gcc_assert (prev_set);
+  gcc_assert (curr_set);
+
+  rtx lui_dest = SET_DEST (curr_set);
+  rtx lui_src = SET_SRC (curr_set);
+
+  /* Check if curr is a LUI instruction:
+     - LUI via HIGH: (set (reg:X rd) (high (const_int)))
+     - LUI via immediate: (set (reg:X rd) (const_int UPPER_IMM_20))
+     The register test has to look at the destination of the SET; CURR
+     itself is an insn and is never a REG, so testing it would make this
+     predicate reject every pair.  */
+  bool is_lui = (REG_P (lui_dest)
+                 && ((get_attr_type (curr) == TYPE_MOVE
+                      && GET_CODE (lui_src) == HIGH)
+                     || (CONST_INT_P (lui_src)
+                         && LUI_OPERAND (INTVAL (lui_src)))));
+
+  if (!is_lui)
+    return false;
+
+  /* Check for load + LUI fusion:
+     Load and LUI destinations must be different to avoid hazard.
+     REGNO is only valid on a REG, so a non-REG load destination is
+     rejected.  */
+  if (get_attr_type (prev) == TYPE_LOAD)
+    {
+      rtx load_dest = SET_DEST (prev_set);
+      return REG_P (load_dest) && REGNO (load_dest) != REGNO (lui_dest);
+    }
+
+  /* Check for store + LUI fusion (always allowed).  */
+  if (get_attr_type (prev) == TYPE_STORE)
+    return true;
+
+  return false;
+}
+
+
+/* Return true if PREV and CURR should be kept together during scheduling.  */
+
+bool
+arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
+{
+  rtx prev_set = single_set (prev);
+  rtx curr_set = single_set (curr);
+
+  /* Fuse multiply-add pair:
+     prev: (set rd_mult (mult rs1 rs2))
+     curr: (set rd_add (plus rd_mult rs3))  */
+  if (prev_set && curr_set
+      && GET_CODE (SET_SRC (prev_set)) == MULT
+      && GET_CODE (SET_SRC (curr_set)) == PLUS)
+    {
+      rtx curr_plus = SET_SRC (curr_set);
+      rtx mult_dest = SET_DEST (prev_set);
+      unsigned int mult_dest_regno = REGNO (mult_dest);
+
+      /* Check if multiply result is used in either operand of the addition.  */
+      if (REG_P (XEXP (curr_plus, 0))
+          && REGNO (XEXP (curr_plus, 0)) == mult_dest_regno)
+        {
+          if (dump_file)
+            fprintf (dump_file, "ARCV_FUSE_MULT_ADD (op0)\n");
+          return true;
+        }
+
+      if (REG_P (XEXP (curr_plus, 1))
+          && REGNO (XEXP (curr_plus, 1)) == mult_dest_regno)
+        {
+          if (dump_file)
+            fprintf (dump_file, "ARCV_FUSE_MULT_ADD (op1)\n");
+          return true;
+        }
+    }
+
+  /* Fuse logical shift left with logical shift right (bit-extract pattern):
+     prev: (set rd (ashift rs imm1))
+     curr: (set rd (lshiftrt rd imm2))  */
+  if (prev_set && curr_set
+      && GET_CODE (SET_SRC (prev_set)) == ASHIFT
+      && GET_CODE (SET_SRC (curr_set)) == LSHIFTRT
+      && REGNO (SET_DEST (prev_set)) == REGNO (SET_DEST (curr_set))
+      && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (SET_SRC (curr_set), 0)))
+    {
+      if (dump_file)
+        fprintf (dump_file, "ARCV_FUSE_SHIFT_BITEXTRACT\n");
+      return true;
+    }
+
+  /* Fuse load-immediate with a dependent conditional branch:
+     prev: (set rd imm)
+     curr: (if_then_else (cond rd ...) ...) 
*/ + if (get_attr_type (prev) == TYPE_MOVE + && get_attr_move_type (prev) == MOVE_TYPE_CONST + && any_condjump_p (curr)) + { + if (!curr_set) + return false; + + rtx comp = XEXP (SET_SRC (curr_set), 0); + rtx prev_dest = SET_DEST (prev_set); + + if ((REG_P (XEXP (comp, 0)) && XEXP (comp, 0) == prev_dest) + || (REG_P (XEXP (comp, 1)) && XEXP (comp, 1) == prev_dest)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_LI_BRANCH\n"); + return true; + } + return false; + } + + /* Do not fuse loads/stores before sched2. */ + if (!reload_completed || sched_fusion) + return false; + + /* prev and curr are simple SET insns i.e. no flag setting or branching. */ + bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); + + /* Don't handle anything with a jump past this point. */ + if (!simple_sets_p) + return false; + + /* Fuse adjacent loads. */ + if (get_attr_type (prev) == TYPE_LOAD + && get_attr_type (curr) == TYPE_LOAD) + { + if (arcv_fused_addr_p (SET_SRC (prev_set), SET_SRC (curr_set), true)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_ADJACENT_LOAD\n"); + return true; + } + } + + /* Fuse adjacent stores. */ + if (get_attr_type (prev) == TYPE_STORE + && get_attr_type (curr) == TYPE_STORE) + { + if (arcv_fused_addr_p (SET_DEST (prev_set), SET_DEST (curr_set), false)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_ADJACENT_STORE\n"); + return true; + } + } + + /* Look ahead 1 insn to prioritize adjacent load/store pairs. + If curr and next form a better fusion opportunity, defer this fusion. */ + rtx_insn *next = next_insn (curr); + if (next) + { + rtx next_set = single_set (next); + + /* Defer if next instruction forms an adjacent load pair with curr. */ + if (next_set + && get_attr_type (curr) == TYPE_LOAD + && get_attr_type (next) == TYPE_LOAD + && arcv_fused_addr_p (SET_SRC (curr_set), SET_SRC (next_set), true)) + return false; + + /* Defer if next instruction forms an adjacent store pair with curr. 
*/ + if (next_set + && get_attr_type (curr) == TYPE_STORE + && get_attr_type (next) == TYPE_STORE + && arcv_fused_addr_p (SET_DEST (curr_set), SET_DEST (next_set), false)) + return false; + } + + /* Fuse a pre- or post-update memory operation: + Examples: load+add, add+load, store+add, add+store. */ + if (arcv_memop_arith_pair_p (prev, curr)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_MEMOP_ARITH (prev, curr)\n"); + return true; + } + if (arcv_memop_arith_pair_p (curr, prev)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_MEMOP_ARITH (curr, prev)\n"); + return true; + } + + /* Fuse a memory operation preceded or followed by a LUI: + Examples: load+lui, lui+load, store+lui, lui+store. */ + if (arcv_memop_lui_pair_p (prev, curr)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_MEMOP_LUI (prev, curr)\n"); + return true; + } + if (arcv_memop_lui_pair_p (curr, prev)) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_MEMOP_LUI (curr, prev)\n"); + return true; + } + + /* Fuse load-immediate with a store of the destination register: + prev: (set rd imm) + curr: (set (mem ...) 
rd) */ + if (get_attr_type (prev) == TYPE_MOVE + && get_attr_move_type (prev) == MOVE_TYPE_CONST + && get_attr_type (curr) == TYPE_STORE) + { + rtx store_src = SET_SRC (curr_set); + rtx load_dest = SET_DEST (prev_set); + + if (REG_P (store_src) && store_src == load_dest) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_LI_STORE\n"); + return true; + } + + if (SUBREG_P (store_src) && SUBREG_REG (store_src) == load_dest) + { + if (dump_file) + fprintf (dump_file, "ARCV_FUSE_LI_STORE (subreg)\n"); + return true; + } + } + + return false; +} diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index fa5d906d2059..ef2c71361e2b 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -839,9 +839,12 @@ extern const char *th_output_move (rtx, rtx); extern bool th_print_operand_address (FILE *, machine_mode, rtx); #endif +/* Routines implemented in arcv.cc. */ extern bool arcv_mpy_1c_bypass_p (rtx_insn *, rtx_insn *); extern bool arcv_mpy_2c_bypass_p (rtx_insn *, rtx_insn *); extern bool arcv_mpy_10c_bypass_p (rtx_insn *, rtx_insn *); +extern bool arcv_macro_fusion_pair_p (rtx_insn *, rtx_insn *); +extern void arcv_sched_fusion_priority (rtx_insn *, int, int *, int *); extern bool strided_load_broadcast_p (void); extern bool riscv_prefer_agnostic_p (void); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 2e0ad70730d3..d0cd89884462 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -10634,30 +10634,6 @@ riscv_store_data_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) return store_data_bypass_p (out_insn, in_insn); } -/* Implement one boolean function for each of the values of the - arcv_mpy_option enum, for the needs of arcv-rmx100.md. 
*/ - -bool -arcv_mpy_1c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, - rtx_insn *in_insn ATTRIBUTE_UNUSED) -{ - return arcv_mpy_option == ARCV_MPY_OPTION_1C; -} - -bool -arcv_mpy_2c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, - rtx_insn *in_insn ATTRIBUTE_UNUSED) -{ - return arcv_mpy_option == ARCV_MPY_OPTION_2C; -} - -bool -arcv_mpy_10c_bypass_p (rtx_insn *out_insn ATTRIBUTE_UNUSED, - rtx_insn *in_insn ATTRIBUTE_UNUSED) -{ - return arcv_mpy_option == ARCV_MPY_OPTION_10C; -} - /* Implement TARGET_SECONDARY_MEMORY_NEEDED. When floating-point registers are wider than integer ones, moves between @@ -11785,6 +11761,9 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + return arcv_macro_fusion_pair_p (prev, curr); + return false; } diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 6a3e5372d3b5..0254cec5f79f 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -972,6 +972,10 @@ extern enum riscv_cc get_riscv_cc (const rtx use); || (riscv_microarchitecture == sifive_p400) \ || (riscv_microarchitecture == sifive_p600)) +/* True if the target is ARC-V RHX100. */ +#define TARGET_ARCV_RHX100 \ + (riscv_microarchitecture == arcv_rhx100) + /* True if the target supports misaligned vector loads and stores. 
*/ #define TARGET_VECTOR_MISALIGN_SUPPORTED \ riscv_vector_unaligned_access_p diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv index 2761e5e20c00..73a6313e22b8 100644 --- a/gcc/config/riscv/t-riscv +++ b/gcc/config/riscv/t-riscv @@ -187,6 +187,15 @@ riscv-zicfilp.o: $(srcdir)/config/riscv/riscv-zicfilp.cc \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/riscv/riscv-zicfilp.cc +arcv.o: $(srcdir)/config/riscv/arcv.cc \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(BACKEND_H) $(TARGET_H) $(RTL_H) \ + $(TREE_H) memmodel.h $(TM_H) $(OPTABS_H) $(REGS_H) $(EMIT_RTL_H) \ + $(RECOG_H) $(DIAGNOSTIC_CORE_H) stor-layout.h $(ALIAS_H) fold-const.h \ + output.h $(INSN_ATTR_H) $(FLAGS_H) explow.h $(CALLS_H) varasm.h \ + $(EXPR_H) tm-constrs.h $(TM_P_H) $(DF_H) reload.h sched-int.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/riscv/arcv.cc + PASSES_EXTRA += $(srcdir)/config/riscv/riscv-passes.def $(common_out_file): $(srcdir)/config/riscv/riscv-cores.def \ From 959550cff7f584f5dcb63ec3f7435cab3d144a33 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Fri, 28 Nov 2025 20:38:02 +0100 Subject: [PATCH 4/6] RISC-V: Implement TARGET_SCHED_FUSION_PRIORITY for Synopsys RHX-100 series. This patch implements the TARGET_SCHED_FUSION_PRIORITY hook for the Synopsys RHX-100 processor to improve instruction scheduling by prioritizing fusible memory operations. The implementation analyzes load and store instructions to extract base registers and offsets, then assigns scheduling priorities based on several factors: access width (wider accesses get higher priority), base register number, and memory offset values. Instructions with adjacent addresses are grouped together to enable better fusion opportunities. gcc/ChangeLog: * config/riscv/arcv.cc (arcv_fusion_load_store): New function. (arcv_sched_fusion_priority): New function. * config/riscv/riscv.cc (riscv_sched_fusion_priority): New function. 
(TARGET_SCHED_FUSION_PRIORITY): Define hook. Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Signed-off-by: Luis Silva --- gcc/config/riscv/arcv.cc | 101 ++++++++++++++++++++++++++++++++++++++ gcc/config/riscv/riscv.cc | 17 +++++++ 2 files changed, 118 insertions(+) diff --git a/gcc/config/riscv/arcv.cc b/gcc/config/riscv/arcv.cc index f477113a98de..2119d431c8d5 100644 --- a/gcc/config/riscv/arcv.cc +++ b/gcc/config/riscv/arcv.cc @@ -521,3 +521,104 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } + +/* If INSN is a load or store of address in the form of [base+offset], + extract the two parts and set to BASE and OFFSET. IS_LOAD is set + to TRUE if it's a load. Return TRUE if INSN is such an instruction, + otherwise return FALSE. */ + +static bool +arcv_fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, + machine_mode *mode, bool *is_load) +{ + rtx x, dest, src; + + gcc_assert (INSN_P (insn)); + x = PATTERN (insn); + if (GET_CODE (x) != SET) + return false; + + src = SET_SRC (x); + dest = SET_DEST (x); + + if ((GET_CODE (src) == SIGN_EXTEND || GET_CODE (src) == ZERO_EXTEND) + && MEM_P (XEXP (src, 0))) + src = XEXP (src, 0); + + if (REG_P (src) && MEM_P (dest)) + { + *is_load = false; + if (extract_base_offset_in_addr (dest, base, offset)) + *mode = GET_MODE (dest); + } + else if (MEM_P (src) && REG_P (dest)) + { + *is_load = true; + if (extract_base_offset_in_addr (src, base, offset)) + *mode = GET_MODE (src); + } + else + return false; + + return (*base != NULL_RTX && *offset != NULL_RTX); +} + +void +arcv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, + int *pri) +{ + rtx base, offset; + machine_mode mode = SImode; + bool is_load; + + gcc_assert (INSN_P (insn)); + + /* Default priority for non-fusible instructions. */ + int default_pri = max_pri - 1; + + /* Check if this is a fusible load/store instruction. 
*/ + if (!arcv_fusion_load_store (insn, &base, &offset, &mode, &is_load) + || !arcv_pair_fusion_mode_allowed_p (mode, is_load)) + { + *pri = default_pri; + *fusion_pri = default_pri; + return; + } + + /* Start with half the default priority to distinguish fusible from + non-fusible instructions. */ + int priority = default_pri / 2; + + /* Scale priority by access width - narrower accesses get lower priority. + HImode: divide by 2, QImode: divide by 4. This encourages wider + accesses to be scheduled together. */ + if (mode == HImode) + priority /= 2; + else if (mode == QImode) + priority /= 4; + + /* Factor in base register: instructions with smaller register numbers + get higher priority. The shift by 20 bits ensures this is the most + significant component of the priority. */ + const int BASE_REG_SHIFT = 20; + const int BASE_REG_MASK = 0xff; + priority -= ((REGNO (base) & BASE_REG_MASK) << BASE_REG_SHIFT); + + /* Calculate fusion priority: group loads/stores with adjacent addresses + into the same scheduling group. We divide the offset by (mode_size * 2) + to group pairs of adjacent accesses, then shift left by 1 to make room + for the load/store bit. */ + int off_val = (int)(INTVAL (offset)); + int addr_group = off_val / (GET_MODE_SIZE (mode).to_constant () * 2); + *fusion_pri = priority - (addr_group << 1) + is_load; + + /* Factor in the actual offset value: instructions with smaller offsets + get higher priority. We use only the lower 20 bits to avoid overflow. 
*/ + const int OFFSET_MASK = 0xfffff; + if (off_val >= 0) + priority -= (off_val & OFFSET_MASK); + else + priority += ((-off_val) & OFFSET_MASK); + + *pri = priority; +} diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index d0cd89884462..f89eeeb3080b 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -11767,6 +11767,21 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } +static void +riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, + int *pri) +{ + if (TARGET_ARCV_RHX100) + { + arcv_sched_fusion_priority (insn, max_pri, fusion_pri, pri); + return; + } + + /* Default priority. */ + *pri = max_pri - 1; + *fusion_pri = max_pri - 1; +} + /* Adjust the cost/latency of instructions for scheduling. For now this is just used to change the latency of vector instructions according to their LMUL. We assume that an insn with LMUL == 8 requires @@ -16423,6 +16438,8 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #define TARGET_SCHED_MACRO_FUSION_P riscv_macro_fusion_p #undef TARGET_SCHED_MACRO_FUSION_PAIR_P #define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p +#undef TARGET_SCHED_FUSION_PRIORITY +#define TARGET_SCHED_FUSION_PRIORITY riscv_sched_fusion_priority #undef TARGET_SCHED_INIT #define TARGET_SCHED_INIT riscv_sched_init From 6db6a9f047cfd149c91a4badf991d6ea1f041b79 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Fri, 28 Nov 2025 20:45:29 +0100 Subject: [PATCH 5/6] RISC-V: Implement scheduling for Synopsys RHX-100 series. This patch implements instruction scheduling support for the dual-issue Synopsys RHX-100 processor by adding scheduler hooks and state tracking for the two execution pipes. The implementation tracks ALU pipe and memory pipe usage to maximize dual-issue opportunities. 
It includes reordering logic to promote fusion of adjacent memory operations and other instruction pairs that can execute simultaneously on the RHX-100's dual-issue architecture. The scheduler prioritizes fused instruction pairs and adjusts costs to improve scheduling decisions. Memory operations are directed to the appropriate pipe while arithmetic operations utilize the ALU pipe, enabling optimal utilization of both execution units. New TARGET_SCHED hooks are implemented including ADJUST_PRIORITY, REORDER2, and enhanced VARIABLE_ISSUE handling specifically for the RHX-100 microarchitecture. gcc/ChangeLog: * config/riscv/riscv-protos.h (arcv_sched_init): New declaration. (arcv_sched_reorder2): New declaration. (arcv_sched_adjust_priority): New declaration. (arcv_sched_adjust_cost): New declaration. (arcv_can_issue_more_p): New declaration. (arcv_sched_variable_issue): New declaration. * config/riscv/arcv.cc (struct arcv_sched_state): New struct. (arcv_sched_init): New function. (arcv_next_fusible_insn): New function. (arcv_sched_reorder2): New function. (arcv_sched_adjust_priority): New function. (arcv_sched_adjust_cost): New function. (arcv_can_issue_more_p): New function. (arcv_sched_variable_issue): New function. * config/riscv/riscv.cc (riscv_fusion_enabled_p): Add forward declaration. (riscv_sched_init): Add call to arcv_sched_init. (riscv_sched_variable_issue): Add ARC-V-specific handling. (riscv_sched_adjust_cost): Add ARC-V-specific cost adjustment and fix parameter names. (riscv_sched_adjust_priority): New function. (riscv_sched_reorder2): New function. (TARGET_SCHED_ADJUST_PRIORITY): Define hook. (TARGET_SCHED_REORDER2): Define hook. * config/riscv/riscv.h (TARGET_ARCV_RHX100): New macro.
Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Co-authored-by: Alex Turjan Signed-off-by: Luis Silva --- gcc/config/riscv/arcv.cc | 287 ++++++++++++++++++++++++++++++++ gcc/config/riscv/riscv-protos.h | 6 + gcc/config/riscv/riscv.cc | 58 ++++++- gcc/config/riscv/riscv.md | 2 +- 4 files changed, 348 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/arcv.cc b/gcc/config/riscv/arcv.cc index 2119d431c8d5..16766a8b4129 100644 --- a/gcc/config/riscv/arcv.cc +++ b/gcc/config/riscv/arcv.cc @@ -49,6 +49,32 @@ along with GCC; see the file COPYING3. If not see #include "sched-int.h" #include "tm-constrs.h" +/* Scheduler state tracking for dual-pipe ARCV architectures. */ + +struct arcv_sched_state { + /* True if the ALU pipe has been scheduled for the current cycle. + The ALU pipe handles arithmetic, logical, and other computational + instructions. */ + int alu_pipe_scheduled_p; + + /* True if pipe B has been scheduled for the current cycle. + Pipe B is the second execution pipe, typically used for memory + operations (loads/stores) but can also handle other instructions. */ + int pipeB_scheduled_p; + + /* The last instruction that was scheduled. Used to detect fusion + opportunities by looking ahead at the next instruction to be + scheduled. */ + rtx_insn *last_scheduled_insn; + + /* Cached value of how many more instructions can be issued in the + current cycle. Updated as instructions are scheduled and pipes + become occupied. */ + short cached_can_issue_more; +}; + +static struct arcv_sched_state sched_state; + /* Implement one boolean function for each of the values of the arcv_mpy_option enum, for the needs of arcv-rmx100.md. */ @@ -522,6 +548,218 @@ arcv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) return false; } +/* Initialize ARCV scheduler state at the beginning of scheduling. */ + +void +arcv_sched_init (void) +{ + sched_state.last_scheduled_insn = 0; +} + +/* Return the next possible fusible insn. 
*/ + +static rtx_insn * +arcv_next_fusible_insn (rtx_insn *insn) +{ + while (insn) + { + insn = NEXT_INSN (insn); + + if (insn == 0) + break; + + if (NOTE_INSN_BASIC_BLOCK_P (insn)) /* Test before the NOTE_P skip. */ + return NULL; + + if (DEBUG_INSN_P (insn) + || NOTE_P (insn)) + continue; + + if (GET_CODE (insn) == CODE_LABEL + || GET_CODE (insn) == BARRIER + || GET_CODE (PATTERN (insn)) == USE) + continue; + + if (JUMP_TABLE_DATA_P (insn)) + return NULL; + + break; + } + + return insn; +} + +/* Try to reorder ready queue to promote ARCV fusion opportunities. + Returns the number of instructions that can be issued this cycle. */ + +int +arcv_sched_reorder2 (rtx_insn **ready, int *n_readyp) +{ + if (sched_fusion) + return sched_state.cached_can_issue_more; + + if (!sched_state.cached_can_issue_more) + return 0; + + /* Fuse double load/store instances missed by sched_fusion. */ + if (!sched_state.pipeB_scheduled_p && sched_state.last_scheduled_insn + && ready && *n_readyp > 0 + && !SCHED_GROUP_P (sched_state.last_scheduled_insn) + && (get_attr_type (sched_state.last_scheduled_insn) == TYPE_LOAD + || get_attr_type (sched_state.last_scheduled_insn) == TYPE_STORE)) + { + for (int i = 1; i <= *n_readyp; i++) + { + rtx_insn* next_insn = arcv_next_fusible_insn (ready[*n_readyp - i]); + /* Try to fuse the last_scheduled_insn with. */ + /* Fuse only with nondebug insn. */ + if (NONDEBUG_INSN_P (ready[*n_readyp - i]) + /* Which have not been already fused. */ + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && (!next_insn || !SCHED_GROUP_P (next_insn)) + && arcv_macro_fusion_pair_p (sched_state.last_scheduled_insn, + ready[*n_readyp - i])) + { + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + SCHED_GROUP_P (ready[*n_readyp - 1]) = 1; + sched_state.pipeB_scheduled_p = 1; + return sched_state.cached_can_issue_more; + } + } + sched_state.pipeB_scheduled_p = 1; + } + + /* Try to fuse a non-memory last_scheduled_insn. 
*/ + if ((!sched_state.alu_pipe_scheduled_p || !sched_state.pipeB_scheduled_p) + && sched_state.last_scheduled_insn && ready && *n_readyp > 0 + && !SCHED_GROUP_P (sched_state.last_scheduled_insn) + && (get_attr_type (sched_state.last_scheduled_insn) != TYPE_LOAD + && get_attr_type (sched_state.last_scheduled_insn) != TYPE_STORE)) + { + for (int i = 1; i <= *n_readyp; i++) + { + rtx_insn* next_insn = arcv_next_fusible_insn (ready[*n_readyp - i]); + if (NONDEBUG_INSN_P (ready[*n_readyp - i]) + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && active_insn_p (ready[*n_readyp - i]) + && (!next_insn || !SCHED_GROUP_P (next_insn)) + && arcv_macro_fusion_pair_p (sched_state.last_scheduled_insn, + ready[*n_readyp - i])) + { + if (GET_CODE (PATTERN (ready[*n_readyp - i])) == USE) + continue; + + if (get_attr_type (ready[*n_readyp - i]) == TYPE_LOAD + || get_attr_type (ready[*n_readyp - i]) == TYPE_STORE) + { + if (sched_state.pipeB_scheduled_p) + continue; + else + sched_state.pipeB_scheduled_p = 1; + } + else if (!sched_state.alu_pipe_scheduled_p) + sched_state.alu_pipe_scheduled_p = 1; + else + sched_state.pipeB_scheduled_p = 1; + + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + SCHED_GROUP_P (ready[*n_readyp - 1]) = 1; + return sched_state.cached_can_issue_more; + } + } + sched_state.alu_pipe_scheduled_p = 1; + } + + /* When pipe B is scheduled, we can have no more memops this cycle. 
*/ + if (sched_state.pipeB_scheduled_p && *n_readyp > 0 + && NONDEBUG_INSN_P (ready[*n_readyp - 1]) + && recog_memoized (ready[*n_readyp - 1]) >= 0 + && !SCHED_GROUP_P (ready[*n_readyp - 1]) + && (get_attr_type (ready[*n_readyp - 1]) == TYPE_LOAD + || get_attr_type (ready[*n_readyp - 1]) == TYPE_STORE)) + { + if (sched_state.alu_pipe_scheduled_p) + return 0; + + for (int i = 2; i <= *n_readyp; i++) + { + rtx_insn* next_insn = arcv_next_fusible_insn (ready[*n_readyp - i]); + if ((NONDEBUG_INSN_P (ready[*n_readyp - i]) + && recog_memoized (ready[*n_readyp - i]) >= 0 + && get_attr_type (ready[*n_readyp - i]) != TYPE_LOAD + && get_attr_type (ready[*n_readyp - i]) != TYPE_STORE + && !SCHED_GROUP_P (ready[*n_readyp - i]) + && (!next_insn || !SCHED_GROUP_P (next_insn))) + || (next_insn && NONDEBUG_INSN_P (next_insn) + && recog_memoized (next_insn) >= 0 + && get_attr_type (next_insn) != TYPE_LOAD + && get_attr_type (next_insn) != TYPE_STORE)) + { + std::swap (ready[*n_readyp - 1], ready[*n_readyp - i]); + sched_state.alu_pipe_scheduled_p = 1; + sched_state.cached_can_issue_more = 1; + return 1; + } + } + return 0; + } + + /* If all else fails, schedule a single instruction. */ + if (ready && *n_readyp > 0 + && NONDEBUG_INSN_P (ready[*n_readyp - 1]) + && recog_memoized (ready[*n_readyp - 1]) >= 0) + { + rtx_insn *insn = ready[*n_readyp - 1]; + enum attr_type insn_type = get_attr_type (insn); + + /* Memory operations go to pipeB if available. */ + if (!sched_state.pipeB_scheduled_p + && (insn_type == TYPE_LOAD || insn_type == TYPE_STORE)) + { + sched_state.pipeB_scheduled_p = 1; + } + /* Non-memory operations go to ALU pipe. 
*/ + else if (insn_type != TYPE_LOAD && insn_type != TYPE_STORE) + { + sched_state.alu_pipe_scheduled_p = 1; + } + } + + return sched_state.cached_can_issue_more; +} + +int +arcv_sched_adjust_priority (rtx_insn *insn, int priority) +{ + if (DEBUG_INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE + || GET_CODE (PATTERN (insn)) == CLOBBER) + return priority; + + /* Bump the priority of fused load-store pairs for easier + scheduling of the memory pipe. The specific increase + value is determined empirically. */ + if (next_insn (insn) && INSN_P (next_insn (insn)) + && SCHED_GROUP_P (next_insn (insn)) + && ((get_attr_type (insn) == TYPE_STORE + && get_attr_type (next_insn (insn)) == TYPE_STORE) + || (get_attr_type (insn) == TYPE_LOAD + && get_attr_type (next_insn (insn)) == TYPE_LOAD))) + return priority + 1; + + return priority; +} + +/* Adjust scheduling cost for ARCV fusion. */ + +int +arcv_sched_adjust_cost (rtx_insn *insn, int dep_type, int cost) +{ + if (dep_type == REG_DEP_ANTI && !SCHED_GROUP_P (insn)) + return cost + 1; + + return cost; +} + /* If INSN is a load or store of address in the form of [base+offset], extract the two parts and set to BASE and OFFSET. IS_LOAD is set to TRUE if it's a load. Return TRUE if INSN is such an instruction, @@ -622,3 +860,52 @@ arcv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, *pri = priority; } + +bool +arcv_can_issue_more_p (int issue_rate, int more) +{ + /* Beginning of cycle - reset variables. 
*/ + if (more == issue_rate) + { + sched_state.alu_pipe_scheduled_p = 0; + sched_state.pipeB_scheduled_p = 0; + } + + if (sched_state.alu_pipe_scheduled_p && sched_state.pipeB_scheduled_p) + { + sched_state.cached_can_issue_more = 0; + return false; + } + + sched_state.cached_can_issue_more = more; + + return true; +} + +int +arcv_sched_variable_issue (rtx_insn *insn, int more) +{ + if (next_insn (insn) && INSN_P (next_insn (insn)) + && SCHED_GROUP_P (next_insn (insn))) + { + if (get_attr_type (insn) == TYPE_LOAD + || get_attr_type (insn) == TYPE_STORE + || get_attr_type (next_insn (insn)) == TYPE_LOAD + || get_attr_type (next_insn (insn)) == TYPE_STORE) + sched_state.pipeB_scheduled_p = 1; + else + sched_state.alu_pipe_scheduled_p = 1; + } + + if (get_attr_type (insn) == TYPE_ALU_FUSED + || get_attr_type (insn) == TYPE_IMUL_FUSED) + { + sched_state.alu_pipe_scheduled_p = 1; + more -= 1; + } + + sched_state.last_scheduled_insn = insn; + sched_state.cached_can_issue_more = more - 1; + + return sched_state.cached_can_issue_more; +} diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index ef2c71361e2b..490218967a46 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -845,6 +845,12 @@ extern bool arcv_mpy_2c_bypass_p (rtx_insn *, rtx_insn *); extern bool arcv_mpy_10c_bypass_p (rtx_insn *, rtx_insn *); extern bool arcv_macro_fusion_pair_p (rtx_insn *, rtx_insn *); extern void arcv_sched_fusion_priority (rtx_insn *, int, int *, int *); +extern void arcv_sched_init (void); +extern int arcv_sched_reorder2 (rtx_insn **, int *); +extern int arcv_sched_adjust_priority (rtx_insn *, int); +extern int arcv_sched_adjust_cost (rtx_insn *, int, int); +extern bool arcv_can_issue_more_p (int, int); +extern int arcv_sched_variable_issue (rtx_insn *, int); extern bool strided_load_broadcast_p (void); extern bool riscv_prefer_agnostic_p (void); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 
f89eeeb3080b..005817a61f4d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -340,6 +340,9 @@ unsigned riscv_stack_boundary; /* Whether in riscv_output_mi_thunk. */ static bool riscv_in_thunk_func = false; +/* Return true if the instruction fusion described by OP is enabled. */ +static bool riscv_fusion_enabled_p (enum riscv_fusion_pairs op); + /* If non-zero, this is an offset to be added to SP to redefine the CFA when restoring the FP register from the stack. Only valid when generating the epilogue. */ @@ -10929,12 +10932,20 @@ static void riscv_sched_init (FILE *, int, int) { clear_vconfig (); + + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + arcv_sched_init (); } /* Implement TARGET_SCHED_VARIABLE_ISSUE. */ static int riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) { + + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + if (!arcv_can_issue_more_p (riscv_issue_rate (), more)) + return 0; + if (DEBUG_INSN_P (insn)) return more; @@ -10980,6 +10991,9 @@ riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more) } } + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + return arcv_sched_variable_issue (insn, more); + return more - 1; } @@ -11790,17 +11804,21 @@ riscv_sched_fusion_priority (rtx_insn *insn, int max_pri, int *fusion_pri, we currently only perform the adjustment when -madjust-lmul-cost is given. */ static int -riscv_sched_adjust_cost (rtx_insn *, int, rtx_insn *insn, int cost, +riscv_sched_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, unsigned int) { + /* Use ARCV-specific cost adjustment for RHX-100. */ + if (TARGET_ARCV_RHX100) + return arcv_sched_adjust_cost (insn, dep_type, cost); + /* Only do adjustments for the generic out-of-order scheduling model. 
*/ if (!TARGET_VECTOR || riscv_microarchitecture != generic_ooo) return cost; - if (recog_memoized (insn) < 0) + if (recog_memoized (dep_insn) < 0) return cost; - enum attr_type type = get_attr_type (insn); + enum attr_type type = get_attr_type (dep_insn); if (type == TYPE_VFREDO || type == TYPE_VFWREDO) { @@ -11818,7 +11836,7 @@ riscv_sched_adjust_cost (rtx_insn *, int, rtx_insn *insn, int cost, return cost; enum riscv_vector::vlmul_type lmul = - (riscv_vector::vlmul_type)get_attr_vlmul (insn); + (riscv_vector::vlmul_type)get_attr_vlmul (dep_insn); double factor = 1; switch (lmul) @@ -11872,6 +11890,32 @@ riscv_sched_can_speculate_insn (rtx_insn *insn) } } +/* Implement TARGET_SCHED_ADJUST_PRIORITY hook. */ + +static int +riscv_sched_adjust_priority (rtx_insn *insn, int priority) +{ + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + return arcv_sched_adjust_priority (insn, priority); + + return priority; +} + +/* Implement TARGET_SCHED_REORDER2 hook. */ + +static int +riscv_sched_reorder2 (FILE *file ATTRIBUTE_UNUSED, + int verbose ATTRIBUTE_UNUSED, + rtx_insn **ready, + int *n_readyp, + int clock ATTRIBUTE_UNUSED) +{ + if (riscv_fusion_enabled_p (RISCV_FUSE_ARCV)) + return arcv_sched_reorder2 (ready, n_readyp); + + return 0; +} + /* Auxiliary function to emit RISC-V ELF attribute. 
*/ static void riscv_emit_attribute () @@ -16456,6 +16500,12 @@ riscv_prefetch_offset_address_p (rtx x, machine_mode mode) #undef TARGET_SCHED_CAN_SPECULATE_INSN #define TARGET_SCHED_CAN_SPECULATE_INSN riscv_sched_can_speculate_insn +#undef TARGET_SCHED_ADJUST_PRIORITY +#define TARGET_SCHED_ADJUST_PRIORITY riscv_sched_adjust_priority + +#undef TARGET_SCHED_REORDER2 +#define TARGET_SCHED_REORDER2 riscv_sched_reorder2 + #undef TARGET_FUNCTION_OK_FOR_SIBCALL #define TARGET_FUNCTION_OK_FOR_SIBCALL riscv_function_ok_for_sibcall diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index b063a40a8107..194526c47573 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -519,7 +519,7 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down, vgather,vcompress,vmov,vector,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vcpop,vrol,vror,vwsll, vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,vaeskf1,vaeskf2,vaesz, - vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c,vfncvtbf16,vfwcvtbf16,vfwmaccbf16,imul_fused,alu_fused, sf_vc,sf_vc_se" (cond [(eq_attr "got" "load") (const_string "load") From f86d7b80ea320e00e02340dc01ddaf9271171359 Mon Sep 17 00:00:00 2001 From: Michiel Derhaeg Date: Fri, 28 Nov 2025 20:55:43 +0100 Subject: [PATCH 6/6] RISC-V: Add instruction patterns for 32-bit multiply-add and bit-extract fusion. This patch adds instruction patterns to support fusion of multiply-add sequences and bit extraction operations for the Synopsys RHX-100 processor. The multiply-add fusion supports both signed and unsigned 16-bit operands expanded to 32-bit multiply-accumulate operations. The implementation generates separate multiply and add instructions that can be fused by the processor hardware. The bit extraction fusion implements zero extraction using shift-left followed by shift-right operations, which can be fused into a single micro-operation.
New instruction types "imul_fused" and "alu_fused" are added to the scheduling model to handle these fused operations. Test cases are included to verify the correct generation of fusible instruction sequences for multiply-add, bit extraction, and load-immediate with conditional branch patterns. gcc/ChangeLog: * config/riscv/arcv-rhx100.md (arcv_rhx100_imul_fused): New reservation. (arcv_rhx100_alu_fused): New reservation. * config/riscv/iterators.md (is_zero_extract): New code attribute. * config/riscv/riscv.cc (riscv_rtx_costs): Add TARGET_ARCV_RHX100 support for SIGN_EXTRACT. * config/riscv/riscv.md: Add imul_fused and alu_fused to type attribute. (maddhisi4): Also enable for TARGET_ARCV_RHX100. (umaddhisi4): New expand. (madd_split): New insn_and_split. (madd_split_extended): New insn_and_split. (*zero_extract_fused): New insn_and_split. gcc/testsuite/ChangeLog: * gcc.target/riscv/arcv-fusion-limm-condbr.c: New test. * gcc.target/riscv/arcv-fusion-madd.c: New test. * gcc.target/riscv/arcv-fusion-xbfu.c: New test. Authored-by: Artemiy Volkov Co-authored-by: Michiel Derhaeg Signed-off-by: Luis Silva --- gcc/config/riscv/arcv-rhx100.md | 10 ++ gcc/config/riscv/iterators.md | 2 + gcc/config/riscv/riscv.cc | 3 +- gcc/config/riscv/riscv.md | 138 +++++++++++++++++- .../riscv/arcv-fusion-limm-condbr.c | 12 ++ .../gcc.target/riscv/arcv-fusion-madd.c | 12 ++ .../gcc.target/riscv/arcv-fusion-xbfu.c | 14 ++ 7 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c create mode 100644 gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c diff --git a/gcc/config/riscv/arcv-rhx100.md b/gcc/config/riscv/arcv-rhx100.md index c0631a17a280..7cbabac29a58 100644 --- a/gcc/config/riscv/arcv-rhx100.md +++ b/gcc/config/riscv/arcv-rhx100.md @@ -42,6 +42,16 @@ condmove,mvpair,zicond,cpop,clmul")) "((arcv_rhx100_issueA_fuse0 + arcv_rhx100_ALU_A_fuse0_early) | (arcv_rhx100_issueA_fuse1 +
arcv_rhx100_ALU_A_fuse1_early)) | ((arcv_rhx100_issueB_fuse0 + arcv_rhx100_ALU_B_fuse0_early) | (arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse1_early))") +(define_insn_reservation "arcv_rhx100_imul_fused" 4 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "imul_fused")) + "(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early + arcv_rhx100_MPY32), nothing*3") + +(define_insn_reservation "arcv_rhx100_alu_fused" 1 + (and (eq_attr "tune" "arcv_rhx100") + (eq_attr "type" "alu_fused")) + "(arcv_rhx100_issueA_fuse0 + arcv_rhx100_issueA_fuse1 + arcv_rhx100_ALU_A_fuse0_early + arcv_rhx100_ALU_A_fuse1_early) | (arcv_rhx100_issueB_fuse0 + arcv_rhx100_issueB_fuse1 + arcv_rhx100_ALU_B_fuse0_early + arcv_rhx100_ALU_B_fuse1_early)") + (define_insn_reservation "arcv_rhx100_jmp_insn" 1 (and (eq_attr "tune" "arcv_rhx100") (eq_attr "type" "branch,jump,call,jalr,ret,trap")) diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md index 35de17f76cd9..2d595e85fcd2 100644 --- a/gcc/config/riscv/iterators.md +++ b/gcc/config/riscv/iterators.md @@ -218,6 +218,8 @@ (zero_extract "srliw")]) (define_code_attr extract_shift [(sign_extract "ashiftrt") (zero_extract "lshiftrt")]) +(define_code_attr is_zero_extract [(sign_extract "false") + (zero_extract "true")]) ;; This code iterator allows the two right shift instructions to be ;; generated from the same template. 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 005817a61f4d..c95b74100418 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -4446,7 +4446,8 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN } gcc_fallthrough (); case SIGN_EXTRACT: - if (TARGET_XTHEADBB && outer_code == SET + if ((TARGET_ARCV_RHX100 || TARGET_XTHEADBB) + && outer_code == SET && CONST_INT_P (XEXP (x, 1)) && CONST_INT_P (XEXP (x, 2))) { diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 194526c47573..6c5e0b6e161d 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -3073,6 +3073,7 @@ ;; * Single-bit extraction (SFB) ;; * Extraction instruction th.ext(u) (XTheadBb) ;; * lshrsi3_extend_2 (see above) +;; * Zero extraction fusion (ARC-V) (define_insn_and_split "*<any_extract:optab><GPR:mode>3" [(set (match_operand:GPR 0 "register_operand" "=r") (any_extract:GPR @@ -3085,6 +3086,8 @@ && (INTVAL (operands[2]) == 1)) && !TARGET_XTHEADBB && !TARGET_XANDESPERF + && !(TARGET_ARCV_RHX100 + && <is_zero_extract>) && !(TARGET_64BIT && (INTVAL (operands[3]) > 0) && (INTVAL (operands[2]) + INTVAL (operands[3]) == 32))" @@ -4525,7 +4528,63 @@ (mult:SI (sign_extend:SI (match_operand:HI 1 "register_operand")) (sign_extend:SI (match_operand:HI 2 "register_operand"))) (match_operand:SI 3 "register_operand")))] - "TARGET_XTHEADMAC" + "TARGET_XTHEADMAC || (TARGET_ARCV_RHX100 + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL))" + { + if (TARGET_ARCV_RHX100) + { + rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); + emit_insn (gen_extendhisi2 (tmp0, operands[1])); + emit_insn (gen_extendhisi2 (tmp1, operands[2])); + + if (TARGET_64BIT) + { + rtx op0 = gen_reg_rtx (DImode); + emit_insn (gen_madd_split_extended (op0, tmp0, tmp1, operands[3])); + op0 = gen_lowpart (SImode, op0); + SUBREG_PROMOTED_VAR_P (op0) = 1; + SUBREG_PROMOTED_SET (op0, SRP_SIGNED); + emit_move_insn (operands[0], op0); + } + else + { + emit_insn (gen_madd_split
(operands[0], tmp0, tmp1, operands[3])); + } + + DONE; + } + } +) + +(define_expand "umaddhisi4" + [(set (match_operand:SI 0 "register_operand") + (plus:SI + (mult:SI (zero_extend:SI (match_operand:HI 1 "register_operand")) + (zero_extend:SI (match_operand:HI 2 "register_operand"))) + (match_operand:SI 3 "register_operand")))] + "TARGET_ARCV_RHX100 + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" + { + rtx tmp0 = gen_reg_rtx (SImode), tmp1 = gen_reg_rtx (SImode); + emit_insn (gen_zero_extendhisi2 (tmp0, operands[1])); + emit_insn (gen_zero_extendhisi2 (tmp1, operands[2])); + + if (TARGET_64BIT) + { + rtx op0 = gen_reg_rtx (DImode); + emit_insn (gen_madd_split_extended (op0, tmp0, tmp1, operands[3])); + op0 = gen_lowpart (SImode, op0); + SUBREG_PROMOTED_VAR_P (op0) = 1; + SUBREG_PROMOTED_SET (op0, SRP_SIGNED); + emit_move_insn (operands[0], op0); + } + else + { + emit_insn (gen_madd_split (operands[0], tmp0, tmp1, operands[3])); + } + + DONE; + } ) (define_expand "msubhisi4" @@ -4537,6 +4596,83 @@ "TARGET_XTHEADMAC" ) +(define_insn_and_split "madd_split" + [(set (match_operand:SI 0 "register_operand" "=&r,r") + (plus:SI + (mult:SI (match_operand:SI 1 "register_operand" "r,r") + (match_operand:SI 2 "register_operand" "r,r")) + (match_operand:SI 3 "register_operand" "r,?0"))) + (clobber (match_scratch:SI 4 "=&r,&r"))] + "TARGET_ARCV_RHX100 + && !TARGET_64BIT && (TARGET_ZMMUL || TARGET_MUL)" + "#" + "&& reload_completed" + [(const_int 0)] + "{ + if (REGNO (operands[0]) == REGNO (operands[3])) + { + emit_insn (gen_mulsi3 (operands[4], operands[1], operands[2])); + emit_insn (gen_addsi3 (operands[0], operands[3], operands[4])); + } + else + { + emit_insn (gen_mulsi3 (operands[0], operands[1], operands[2])); + emit_insn (gen_addsi3 (operands[0], operands[0], operands[3])); + } + DONE; + }" + [(set_attr "type" "imul_fused")] +) + +(define_insn_and_split "madd_split_extended" + [(set (match_operand:DI 0 "register_operand" "=&r,r") + (sign_extend:DI + (plus:SI + (mult:SI 
(match_operand:SI 1 "register_operand" "r,r") + (match_operand:SI 2 "register_operand" "r,r")) + (match_operand:SI 3 "register_operand" "r,?0")))) + (clobber (match_scratch:SI 4 "=&r,&r"))] + "TARGET_ARCV_RHX100 + && (TARGET_ZMMUL || TARGET_MUL)" + "#" + "&& reload_completed" + [(const_int 0)] + "{ + if (REGNO (operands[0]) == REGNO (operands[3])) + { + emit_insn (gen_mulsi3_extended (operands[4], operands[1], operands[2])); + emit_insn (gen_addsi3_extended (operands[4], operands[3], operands[4])); + emit_move_insn (operands[0], operands[4]); + } + else + { + emit_insn (gen_mulsi3_extended (operands[0], operands[1], operands[2])); + emit_insn (gen_addsi3_extended (operands[0], operands[0], operands[3])); + } + DONE; + }" + [(set_attr "type" "imul_fused")] +) + +(define_insn_and_split "*zero_extract_fused" + [(set (match_operand:SI 0 "register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "register_operand" "r") + (match_operand 2 "const_int_operand") + (match_operand 3 "const_int_operand")))] + "TARGET_ARCV_RHX100 && !TARGET_64BIT + && (INTVAL (operands[2]) > 1 || !TARGET_ZBS)" + "#" + "&& reload_completed" + [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2))) + (set (match_dup 0) (lshiftrt:SI (match_dup 0) (match_dup 3)))] + "{ + int amount = INTVAL (operands[2]); + int end = INTVAL (operands[3]) + amount; + operands[2] = GEN_INT (BITS_PER_WORD - end); + operands[3] = GEN_INT (BITS_PER_WORD - amount); + }" + [(set_attr "type" "alu_fused")]) + ;; String compare with length insn. 
;; Argument 0 is the target (result) ;; Argument 1 is the source1 diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c new file mode 100644 index 000000000000..cc2a56a2e086 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-limm-condbr.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=arc-v-rhx-100-series" } */ + +int +f (int x) +{ + begin: + if (x <= 3) + goto begin; +} + +/* { dg-final { scan-assembler "\\sli\\sa5,3\n\\sble\\sa0,a5,.L\[0-9\]+\n" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c new file mode 100644 index 000000000000..eb8665f576c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-madd.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target rv32 } */ +/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" } } */ +/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im -mabi=ilp32" } */ + +int +f (int x, int y, int z, int v, int w) +{ + return x + y * z + v * w; +} + +/* { dg-final { scan-assembler {\smul\s([ast][0-9]+),a1,a2\n\sadd\s\1,\1,a0\n\smul\sa0,a3,a4\n\sadd\sa0,a0,\1\n} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c new file mode 100644 index 000000000000..b471c20ae573 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/arcv-fusion-xbfu.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target rv32 } */ +/* { dg-skip-if "" { *-*-* } { "-g" "-flto" "-O0" "-Oz" "-Os" } } */ +/* { dg-options "-mtune=arc-v-rhx-100-series -march=rv32im_zbs -mabi=ilp32" } */ + +#define bit_extract(x,start,amt) (((x)>>(start)) & (~(0xffffffff << (amt)))) + +int +f (int x) +{ + return bit_extract(x,10,14) + bit_extract(x,1,1); +} + +/* { dg-final { scan-assembler {\sslli\s([ast][0-9]+),a0,8\n\ssrli\s([ast][0-9]+),\1,18\n\sbexti\sa0,a0,1.*\n\sadd\sa0,\2,a0.*\n} 
} } */