diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 048547aa1b..c55f7ef93e 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -47,6 +47,7 @@ source code and documentation. - [examples/monolithic_build_native/config_768.h](examples/monolithic_build_native/config_768.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..34f8cbec66 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..1c75346689 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c 
b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..fa0f7097f5 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 
871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 
1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..96cf7cfc91 --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..d311138275 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,791 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +/* Barrett reduce constatnts */ +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Montgomery reduce constatnts */ +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
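+ *
+ * Worked example (for illustration): with len = 8 (r7 = 16), start = 0
+ * and next = 32, the r[j] offsets become r9/r16/r18/r20 = 0, 32, 64, 96
+ * and the r[j+len] offsets become r10/r17/r19/r21 = 16, 48, 80, 112,
+ * i.e. 16-byte blocks of r[j] interleaved with the matching r[j+8] blocks.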
+ * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ +.macro Load_L44Coeffs + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
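+ For illustration, the scalar equivalent of this Barrett step is
+ t = (20159*a + 2^25) >> 26 followed by r = a - t*q, so each output
+ is congruent to the input modulo q.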
*/ + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 +.endm + +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ +.macro Set_mont_consts + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ +.endm + +.macro Load_next_4zetas + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 +.endm + +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ + +/* + * mlk_intt_ppc(r) + */ +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx vs0, 0, r4 + + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + + /* Setup for Barrett reduce */ + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + + /* + * Montgomery reduce loops with constant 1441 + */ + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + bdnz intt_ppc__Loopf + + addi r3, r3, -512 + +.align 4 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi r5, r5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi r5, r5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li r7, 16 + + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li r7, 32 + + INTT_REDUCE_4X 0, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 16, 64 + + INTT_REDUCE_4X 256, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 272, 64 + +.align 4 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li r7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 384, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 6. len = 64, start = 0, 128 + */ + li r7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 320, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 7. 
len = 128, start = 0 + */ + li r7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 192, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..beee949702 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,559 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
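+ *
+ * Worked example (for illustration): in the first layer, len = 128
+ * (r7 = 256), start = 0 and next = 16, so the r[j] offsets are
+ * r9/r16/r18/r20 = 0, 16, 32, 48 and the r[j+len] offsets are
+ * r10/r17/r19/r21 = 256, 272, 288, 304, i.e. the first 32 coefficients
+ * paired with coefficients 128..159.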
+ * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ +.macro Load_L44Coeffs + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 +.endm + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm v30, v28, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 +.endm + +.macro Write_One + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro Load_next_4zetas + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc(int16_t *r) + */ +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + /* load MLKEM_Q */ + lvx V_NMKQ,0,r4 + + /* Register 14 as pointer to zetas array */ + addi r14, r4, ZETA_NTT_OFFSET + + vxor v3, v3, v3 + vspltish v4, 1 + + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 + +.align 4 + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li r7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + +.align 4 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li r7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi r5, r5, 128 + + bdnz ntt_ppc__Len4 + + /* + * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ + + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi r5, r5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..4d16be6f05 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,194 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 + + /* Shift right 1 bit */ + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 +.endm + +.macro Write_8X + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + 
stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + stxvx 32+v25, r11, r1 + stxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + stxvx 32+v27, r6, r1 + stxvx 32+v28, r7, r1 + stxvx 32+v29, r8, r1 + stxvx 32+v30, r9, r1 + + li r6, NQ_OFFSET + li r7, QINV_OFFSET + li r8, C1353_OFFSET + lxvx 32+V_NMKQ, r6, r4 + lxvx 32+V_QINV, r7, r4 + lxvx 32+V1353, r8, r4 + + vxor v3, v3, v3 + vspltish v4, 1 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + lxvx 32+v25, r11, r1 + lxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + lxvx 32+v27, r6, r1 + lxvx 32+v28, r7, r1 + lxvx 32+v29, r8, r1 + lxvx 32+v30, r9, r1 + mtlr r0 + addi r1, r1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 0000000000..691ce3970c --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,242 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial + * for details of the Barrett reduction + * + * Arguments: *r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v8, 0, r3 + lxvd2x 32+v12, r14, r3 + lxvd2x 32+v16, r15, r3 + lxvd2x 32+v20, r16, r3 + addi r3, r3, 64 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +.macro Write_8X + stxvd2x 32+v21, r4, r3 + stxvd2x 32+v22, r5, r3 + stxvd2x 32+v23, r6, r3 + stxvd2x 32+v24, r7, r3 + stxvd2x 32+v4, r8, r3 + stxvd2x 32+v9, r9, r3 + stxvd2x 32+v13, r10, r3 + stxvd2x 32+v17, r11, r3 +.endm + +/* + * Conditional addition to get unsigned canonical representative + */ +.macro To_unsigned_16 + lxvd2x 32+v12, 0, r3 + lxvd2x 32+v13, r14, r3 + lxvd2x 32+v14, r15, r3 + lxvd2x 32+v15, r16, r3 + addi r3, r3, 64 + vsrh v1, v12, v10 + vsrh v0, v13, v10 + vsrh v3, v14, v10 + vsrh v2, v15, v10 + vadduhm v7, v12, v11 + vadduhm v8, v13, v11 + vadduhm v5, v14, v11 + vadduhm v6, v15, v11 + vcmpequh v1, v1, v9 + vcmpequh v0, v0, v9 + vcmpequh v3, v3, v9 + vcmpequh v2, v2, v9 + xxsel 32+v1, 32+v7,32+v12, 32+v1 + xxsel 32+v0, 32+v8,32+v13, 32+v0 + xxsel 32+v3, 32+v5,32+v14, 32+v3 + xxsel 32+v2, 32+v6,32+v15, 32+v2 + stxvd2x 32+v3, r10, r3 + stxvd2x 32+v2, r11, r3 + stxvd2x 32+v1, r8, r3 + stxvd2x 32+v0, r9, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu r1, -224(r1) + mflr r0 + std r14, 96(r1) + std r15, 104(r1) + std r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + + vxor v7, v7, v7 + + li r6, Q_OFFSET + li r7, C20159_OFFSET + lxvx 32+V_MKQ, r6, r4 + lxvx 32+V20159, r7, r4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw v4, 1 + vsubuwm v5, V_26, v4 + vslw V_25, v4, v5 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + 
+ li r14, 16 + li r15, 32 + li r16, 48 + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + /* + * To unsigned canonical + */ +.align 4 + addi r3, r3, -512 + vxor v9, v9, v9 + vspltish v10, 15 + vmr v11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld r14, 96(r1) + ld r15, 104(r1) + ld r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + mtlr r0 + addi r1, r1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c6..9c7fe672ab 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3f..f46dbfdbf1 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478a..1b01c4d426 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 0000000000..2fa1cdbcf6 --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - 
MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance-critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md).
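+ * For the liboqs integration, matching glue headers are provided in
+ * integration/liboqs/fips202_glue.h and integration/liboqs/fips202x4_glue.h
+ * (both listed in the META.yml sources above); see the commented-out
+ * defines below.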
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. 
+ * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + *native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index 48b117404b..bc5107d7d6 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -463,6 +463,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 9100915359..1d8a0d073c 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -452,6 +452,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 4291d629b1..89fd0de56d 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,6 +18,10 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #if defined(MLK_SYS_RISCV64_RVV) #include "riscv64/meta.h" #endif diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
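Backend selection is purely compile-time: as the mlkem/src/native/meta.h hunk above shows, this backend's meta.h is included when MLK_SYS_PPC64LE is defined, and the patch adds no runtime CPU detection. As an illustrative aside (not part of the patch), an integrator on Linux who wants to be defensive could gate use of the backend on POWER8-level (ISA 2.07) vector support roughly as follows; getauxval/AT_HWCAP2 are Linux-specific, and the helper name is hypothetical:

/* Illustrative sketch, not part of the patch: runtime check for POWER8
 * (ISA 2.07) vector support on Linux before opting into this backend. */
#include <stdbool.h>
#include <sys/auxv.h> /* getauxval, AT_HWCAP2 */

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000UL /* value from <asm/cputable.h> */
#endif

static bool can_use_ppc64le_backend(void)
{
  /* POWER8 and newer report ISA 2.07 support in AT_HWCAP2. */
  return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) != 0;
}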
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..54b3ddd9c6 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..dbcee3e3ee --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..fa0f7097f5 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 
1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], 
z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..6c59a63b0b --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef 
MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..946ae12e01 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,789 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.machine "any" +.text + +/* Barrett reduce constatnts */ +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Montgomery reduce constatnts */ +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
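+ *
+ * Each invocation handles four vectors of 8 coefficients: r[j] at byte
+ * offsets start, start+next, start+2*next, start+3*next, and the
+ * corresponding r[j+len] vectors at len*2 bytes beyond each of them.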
+ * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
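+ * The inverse-NTT zeta table in consts.c (at ZETA_INTT_OFFSET within
+ * mlk_ppc_qdata) already stores each zeta duplicated per lane, so
+ * Load_next_4zetas can read it sequentially; for this 4-4 layout,
+ * Perm_4zetas then swaps doublewords so each zeta lines up with its
+ * coefficients.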
+ */ +.macro Load_L44Coeffs + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
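+ Overall, t = (20159*a + 2^25) >> 26 = round(a/MLKEM_Q), since
+ 20159 = round(2^26/MLKEM_Q); the vsubuhm above negated t, so the
+ multiply-add below computes a + (-t)*MLKEM_Q = a - t*MLKEM_Q, the
+ centered representative of a modulo MLKEM_Q.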
*/ + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 +.endm + +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ +.macro Set_mont_consts + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ +.endm + +.macro Load_next_4zetas + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
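+ * (vmrgew/vmrgow merge the even and odd word elements of two vectors,
+ * undoing the interleave set up by Load_L24Coeffs.)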
+ */ +.macro PermWriteL24 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 +.endm + +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ + +/* + * mlk_intt_ppc(r) + */ +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx vs0, 0, r4 + + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + + /* Setup for Barrett reduce */ + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + + /* + * Montgomery reduce loops with constant 1441 + */ + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + bdnz intt_ppc__Loopf + + addi r3, r3, -512 + +.align 4 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi r5, r5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi r5, r5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li r7, 16 + + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li r7, 32 + + INTT_REDUCE_4X 0, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 16, 64 + + INTT_REDUCE_4X 256, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 272, 64 + +.align 4 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li r7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 384, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 6. len = 64, start = 0, 128 + */ + li r7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 320, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 7. 
len = 128, start = 0 + */ + li r7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 192, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..3c06f0a319 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,557 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
+ * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
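+ * The forward-NTT zeta table in consts.c stores the Len=4 and Len=2
+ * zetas pre-duplicated and reordered for exactly these layouts (see
+ * the "For Len=4" and "For ntt Len=2" comments in mlk_ppc_qdata).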
+ */ +.macro Load_L44Coeffs + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 +.endm + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm v30, v28, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 +.endm + +.macro Write_One + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro Load_next_4zetas + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc(int16_t *r) + */ +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + /* load MLKEM_Q */ + lvx V_NMKQ,0,r4 + + /* Register 14 as pointer to zetas array */ + addi r14, r4, ZETA_NTT_OFFSET + + vxor v3, v3, v3 + vspltish v4, 1 + + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 + +.align 4 + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li r7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + +.align 4 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li r7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi r5, r5, 128 + + bdnz ntt_ppc__Len4 + + /* + * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ + + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi r5, r5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..5c0703755c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,192 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 + + /* Shift right 1 bit */ + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 +.endm + +.macro Write_8X + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, 
+ stxvx 32+v22, r8, r1
+ stxvx 32+v23, r9, r1
+ stxvx 32+v24, r10, r1
+ stxvx 32+v25, r11, r1
+ stxvx 32+v26, r12, r1
+ li r6, 240
+ li r7, 256
+ li r8, 272
+ li r9, 288
+ stxvx 32+v27, r6, r1
+ stxvx 32+v28, r7, r1
+ stxvx 32+v29, r8, r1
+ stxvx 32+v30, r9, r1
+
+ li r6, NQ_OFFSET
+ li r7, QINV_OFFSET
+ li r8, C1353_OFFSET
+ lxvx 32+V_NMKQ, r6, r4
+ lxvx 32+V_QINV, r7, r4
+ lxvx 32+V1353, r8, r4
+
+ vxor v3, v3, v3
+ vspltish v4, 1
+
+ li r4, -128
+ li r5, -112
+ li r6, -96
+ li r7, -80
+ li r8, -64
+ li r9, -48
+ li r10, -32
+ li r11, -16
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ li r11, 208
+ li r12, 224
+ lxvx 32+v20, r6, r1
+ lxvx 32+v21, r7, r1
+ lxvx 32+v22, r8, r1
+ lxvx 32+v23, r9, r1
+ lxvx 32+v24, r10, r1
+ lxvx 32+v25, r11, r1
+ lxvx 32+v26, r12, r1
+ li r6, 240
+ li r7, 256
+ li r8, 272
+ li r9, 288
+ lxvx 32+v27, r6, r1
+ lxvx 32+v28, r7, r1
+ lxvx 32+v29, r8, r1
+ lxvx 32+v30, r9, r1
+ mtlr r0
+ addi r1, r1, 320
+ blr
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef V1353
+#undef V_QINV
+#undef V_NMKQ
+
+#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \
+ !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S
new file mode 100644
index 0000000000..a6deedffc3
--- /dev/null
+++ b/mlkem/src/native/ppc64le/src/reduce.S
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * Copyright 2025- IBM Corp.
+ *
+ * ===================================================================================
+ * Written by Danny Tsen
+ */
+
+/*
+ * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial;
+ * see the BREDUCE_4X macro below for details of the Barrett reduction
+ *
+ * Arguments: *r: pointer to input/output polynomial
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "consts.h"
+
+/* Barrett reduction constants */
+#define V20159 0
+#define V_25 1
+#define V_26 2
+#define V_MKQ 3
+
+.machine "any"
+.text
+
+/*
+ * barrett_reduce
+ * t = (a * 20159 + 2^25) >> 26
+ * r = a - t * MLKEM_Q
+ *
+ * -----------------------------------
+ * BREDUCE_4X(_v0, _v1, _v2, _v3)
+ */
+.macro BREDUCE_4X _v0 _v1 _v2 _v3
+ lxvd2x 32+v8, 0, r3
+ lxvd2x 32+v12, r14, r3
+ lxvd2x 32+v16, r15, r3
+ lxvd2x 32+v20, r16, r3
+ addi r3, r3, 64
+ vmulosh v6, v8, V20159
+ vmulesh v5, v8, V20159
+ vmulosh v11, v12, V20159
+ vmulesh v10, v12, V20159
+ vmulosh v15, v16, V20159
+ vmulesh v14, v16, V20159
+ vmulosh v19, v20, V20159
+ vmulesh v18, v20, V20159
+ xxmrglw 32+v4, 32+v5, 32+v6
+ xxmrghw 32+v5, 32+v5, 32+v6
+ xxmrglw 32+v9, 32+v10, 32+v11
+ xxmrghw 32+v10, 32+v10, 32+v11
+ xxmrglw 32+v13, 32+v14, 32+v15
+ xxmrghw 32+v14, 32+v14, 32+v15
+ xxmrglw 32+v17, 32+v18, 32+v19
+ xxmrghw 32+v18, 32+v18, 32+v19
+ vadduwm v4, v4, V_25
+ vadduwm v5, v5, V_25
+ vadduwm v9, v9, V_25
+ vadduwm v10, v10, V_25
+ vadduwm v13, v13, V_25
+ vadduwm v14, v14, V_25
+ vadduwm v17, v17, V_25
+ vadduwm v18, v18, V_25
+ vsraw v4, v4, V_26
+ vsraw v5, v5, V_26
+ vsraw v9, v9, V_26
+ vsraw v10, v10, V_26
+ vsraw v13, v13, V_26
+ vsraw v14, v14, V_26
+ vsraw v17, v17, V_26
+ vsraw v18, v18, V_26
+ vpkuwum v4, v5, v4
+ vsubuhm v4, v7, v4
+ vpkuwum v9, v10, v9
+ vsubuhm v9, v7, v9
+ vpkuwum v13, v14, v13
+ vsubuhm v13, v7, v13
+ vpkuwum v17, v18, v17
+ vsubuhm v17, v7, v17
+ vmladduhm \_v0, v4, V_MKQ, v8
+ vmladduhm \_v1, v9, V_MKQ, v12
+ vmladduhm \_v2, v13, V_MKQ, v16
+ vmladduhm \_v3, v17, V_MKQ, v20
+.endm
+
+.macro Write_8X
+ stxvd2x 32+v21, r4, r3
+ stxvd2x 32+v22, r5, r3
+ stxvd2x 32+v23, r6, r3
+ stxvd2x 32+v24, r7, r3
+ stxvd2x 32+v4, r8, r3
+ stxvd2x 32+v9, r9, r3
+ stxvd2x 32+v13, r10, r3
+ stxvd2x 32+v17, r11, r3
+.endm
+
+/*
+ * Conditional addition to get unsigned canonical representative
+ */
+.macro To_unsigned_16
+ lxvd2x 32+v12, 0, r3
+ lxvd2x 32+v13, r14, r3
+ lxvd2x 32+v14, r15, r3
+ lxvd2x 32+v15, r16, r3
+ addi r3, r3, 64
+ vsrh v1, v12, v10
+ vsrh v0, v13, v10
+ vsrh v3, v14, v10
+ vsrh v2, v15, v10
+ vadduhm v7, v12, v11
+ vadduhm v8, v13, v11
+ vadduhm v5, v14, v11
+ vadduhm v6, v15, v11
+ vcmpequh v1, v1, v9
+ vcmpequh v0, v0, v9
+ vcmpequh v3, v3, v9
+ vcmpequh v2, v2, v9
+ xxsel 32+v1, 32+v7, 32+v12, 32+v1
+ xxsel 32+v0, 32+v8, 32+v13, 32+v0
+ xxsel 32+v3, 32+v5, 32+v14, 32+v3
+ xxsel 32+v2, 32+v6, 32+v15, 32+v2
+ stxvd2x 32+v3, r10, r3
+ stxvd2x 32+v2, r11, r3
+ stxvd2x 32+v1, r8, r3
+ stxvd2x 32+v0, r9, r3
+.endm
+
+.align 4
+.globl MLK_ASM_NAMESPACE(reduce_ppc)
+MLK_ASM_FN_SYMBOL(reduce_ppc)
+ stdu r1, -224(r1)
+ mflr r0
+ std r14, 96(r1)
+ std r15, 104(r1)
+ std r16, 112(r1)
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ stxvx 32+v20, r6, r1
+ stxvx 32+v21, r7, r1
+ stxvx 32+v22, r8, r1
+ stxvx 32+v23, r9, r1
+ stxvx 32+v24, r10, r1
+
+ vxor v7, v7, v7
+
+ li r6, Q_OFFSET
+ li r7, C20159_OFFSET
+ lxvx 32+V_MKQ, r6, r4
+ lxvx 32+V20159, r7, r4
+
+ vspltisw V_26, 13
+ vadduwm V_26, V_26, V_26
+ vspltisw v4, 1
+ vsubuwm v5, V_26, v4
+ vslw V_25, v4, v5
+
+ li r4, -128
+ li r5, -112
+ li r6, -96
+ li r7, -80
+ li r8, -64
+ li r9, -48
+ li r10, -32
+ li r11, -16
+
+ li r14, 16
+ li r15, 32
+ li r16, 48
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ /*
+ * To unsigned canonical
+ */
+.align 4
+ addi r3, r3, -512
+ vxor v9, v9, v9
+ vspltish v10, 15
+ vmr v11, V_MKQ
+
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+
+ ld r14, 96(r1)
+ ld r15, 104(r1)
+ ld r16, 112(r1)
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ lxvx 32+v20, r6, r1
+ lxvx 32+v21, r7, r1
+ lxvx 32+v22, r8, r1
+ lxvx 32+v23, r9, r1
+ lxvx 32+v24, r10, r1
+ mtlr r0
+ addi r1, r1, 224
+ blr
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef V20159
+#undef V_25
+#undef V_26
+#undef V_MKQ
+
+#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \
+ !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/test/mk/components.mk b/test/mk/components.mk
index 05dc714149..b7abccfbbb 100644
--- a/test/mk/components.mk
+++ b/test/mk/components.mk
@@ -7,6 +7,7 @@ endif
 SOURCES += $(wildcard mlkem/src/*.c)
 ifeq ($(OPT),1)
+ SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS])
 SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS])
 CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
 endif
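
The scalar model below mirrors the per-coefficient arithmetic that the vector macros above implement: MREDUCE_4X in poly_tomont.S (Montgomery reduction after multiplication by 1353) and BREDUCE_4X plus To_unsigned_16 in reduce.S (Barrett reduction followed by a conditional addition of q). It is a minimal sketch for review, not code taken from the repository; the function names and the small self-test are illustrative, while the constants (q = 3329, QINV = -3327, 20159, 1353) are the ones replicated in mlk_ppc_qdata. It assumes the usual arithmetic right shift of negative values.

#include <stdint.h>
#include <stdio.h>

#define Q      3329   /* MLKEM_Q */
#define QINV   -3327  /* q^-1 mod 2^16, as a signed 16-bit value */
#define F1353  1353   /* 2^32 mod q, used by poly_tomont */
#define V20159 20159  /* round(2^26 / q), used by Barrett reduction */

/* Montgomery reduction: for |a| < 2^15 * q, return a value congruent to
 * a * 2^-16 mod q. This is the lane-wise effect of MREDUCE_4X. */
static int16_t montgomery_reduce(int32_t a)
{
  int16_t t = (int16_t)a * QINV;               /* low 16 bits of a * q^-1 */
  return (int16_t)((a - (int32_t)t * Q) >> 16);
}

/* poly_tomont: multiply by 2^32 mod q, then Montgomery-reduce, so each
 * coefficient ends up congruent to a * 2^16 mod q. */
static int16_t tomont(int16_t a)
{
  return montgomery_reduce((int32_t)a * F1353);
}

/* Barrett reduction as in BREDUCE_4X: t = round(a * 20159 / 2^26),
 * then r = a - t*q, a centered representative of a mod q. */
static int16_t barrett_reduce(int16_t a)
{
  int16_t t = (int16_t)(((int32_t)a * V20159 + (1 << 25)) >> 26);
  return (int16_t)(a - t * Q);
}

/* Conditional addition of q, as in To_unsigned_16: map a negative
 * representative to its unsigned canonical value in [0, q). */
static int16_t to_unsigned(int16_t a)
{
  return (int16_t)(a + ((a >> 15) & Q));
}

int main(void)
{
  /* Exhaustive spot-check of the reduction/canonicalization path. */
  for (int32_t a = -32768; a < 32768; a++)
  {
    int16_t r = barrett_reduce((int16_t)a);
    int16_t u = to_unsigned(r);
    if ((u - a) % Q != 0 || u < 0 || u >= Q)
    {
      printf("mismatch at %d\n", (int)a);
      return 1;
    }
  }
  printf("ok: tomont(1) = %d (congruent to 2^16 mod q)\n", tomont(1));
  return 0;
}

Each 128-bit VSX register holds eight int16_t coefficients, so the assembly performs the same computation on eight lanes at a time and loads the replicated constants from mlk_ppc_qdata instead of materializing them with immediates.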