diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 048547aa1b..c55f7ef93e 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -47,6 +47,7 @@ source code and documentation. - [examples/monolithic_build_native/config_768.h](examples/monolithic_build_native/config_768.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..34f8cbec66 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..1c75346689 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c 
b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..fa0f7097f5 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 
871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 
1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..96cf7cfc91 --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..d311138275 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,791 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +/* Barrett reduce constatnts */ +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Montgomery reduce constatnts */ +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
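+ *
+ * Worked example (for illustration): with len = 8 (r7 = 16), start = 0
+ * and next = 32, the r[j] offsets become r9/r16/r18/r20 = 0, 32, 64, 96
+ * and the r[j+len] offsets become r10/r17/r19/r21 = 16, 48, 80, 112,
+ * i.e. 16-byte blocks of r[j] interleaved with the matching r[j+8] blocks.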
+ * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ +.macro Load_L44Coeffs + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
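+ For illustration, the scalar equivalent of this Barrett step is
+ t = (20159*a + 2^25) >> 26 followed by r = a - t*q, so each output
+ is congruent to the input modulo q.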
*/ + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 +.endm + +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ +.macro Set_mont_consts + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ +.endm + +.macro Load_next_4zetas + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 +.endm + +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ + +/* + * mlk_intt_ppc(r) + */ +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx vs0, 0, r4 + + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + + /* Setup for Barrett reduce */ + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + + /* + * Montgomery reduce loops with constant 1441 + */ + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + bdnz intt_ppc__Loopf + + addi r3, r3, -512 + +.align 4 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi r5, r5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi r5, r5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li r7, 16 + + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li r7, 32 + + INTT_REDUCE_4X 0, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 16, 64 + + INTT_REDUCE_4X 256, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 272, 64 + +.align 4 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li r7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 384, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 6. len = 64, start = 0, 128 + */ + li r7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 320, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 7. 
len = 128, start = 0 + */ + li r7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 192, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..beee949702 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,559 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
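+ *
+ * Worked example (for illustration): in the first layer, len = 128
+ * (r7 = 256), start = 0 and next = 16, so the r[j] offsets are
+ * r9/r16/r18/r20 = 0, 16, 32, 48 and the r[j+len] offsets are
+ * r10/r17/r19/r21 = 256, 272, 288, 304, i.e. the first 32 coefficients
+ * paired with coefficients 128..159.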
+ * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ +.macro Load_L44Coeffs + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 +.endm + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm v30, v28, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 +.endm + +.macro Write_One + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro Load_next_4zetas + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc(int16_t *r) + */ +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + /* load MLKEM_Q */ + lvx V_NMKQ,0,r4 + + /* Register 14 as pointer to zetas array */ + addi r14, r4, ZETA_NTT_OFFSET + + vxor v3, v3, v3 + vspltish v4, 1 + + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 + +.align 4 + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li r7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + +.align 4 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li r7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi r5, r5, 128 + + bdnz ntt_ppc__Len4 + + /* + * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ + + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi r5, r5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..4d16be6f05 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,194 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 + + /* Shift right 1 bit */ + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 +.endm + +.macro Write_8X + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + 
stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + stxvx 32+v25, r11, r1 + stxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + stxvx 32+v27, r6, r1 + stxvx 32+v28, r7, r1 + stxvx 32+v29, r8, r1 + stxvx 32+v30, r9, r1 + + li r6, NQ_OFFSET + li r7, QINV_OFFSET + li r8, C1353_OFFSET + lxvx 32+V_NMKQ, r6, r4 + lxvx 32+V_QINV, r7, r4 + lxvx 32+V1353, r8, r4 + + vxor v3, v3, v3 + vspltish v4, 1 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 + Write_8X + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + lxvx 32+v25, r11, r1 + lxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + lxvx 32+v27, r6, r1 + lxvx 32+v28, r7, r1 + lxvx 32+v29, r8, r1 + lxvx 32+v30, r9, r1 + mtlr r0 + addi r1, r1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 0000000000..691ce3970c --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,242 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial + * for details of the Barrett reduction + * + * Arguments: *r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v8, 0, r3 + lxvd2x 32+v12, r14, r3 + lxvd2x 32+v16, r15, r3 + lxvd2x 32+v20, r16, r3 + addi r3, r3, 64 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +.macro Write_8X + stxvd2x 32+v21, r4, r3 + stxvd2x 32+v22, r5, r3 + stxvd2x 32+v23, r6, r3 + stxvd2x 32+v24, r7, r3 + stxvd2x 32+v4, r8, r3 + stxvd2x 32+v9, r9, r3 + stxvd2x 32+v13, r10, r3 + stxvd2x 32+v17, r11, r3 +.endm + +/* + * Conditional addition to get unsigned canonical representative + */ +.macro To_unsigned_16 + lxvd2x 32+v12, 0, r3 + lxvd2x 32+v13, r14, r3 + lxvd2x 32+v14, r15, r3 + lxvd2x 32+v15, r16, r3 + addi r3, r3, 64 + vsrh v1, v12, v10 + vsrh v0, v13, v10 + vsrh v3, v14, v10 + vsrh v2, v15, v10 + vadduhm v7, v12, v11 + vadduhm v8, v13, v11 + vadduhm v5, v14, v11 + vadduhm v6, v15, v11 + vcmpequh v1, v1, v9 + vcmpequh v0, v0, v9 + vcmpequh v3, v3, v9 + vcmpequh v2, v2, v9 + xxsel 32+v1, 32+v7,32+v12, 32+v1 + xxsel 32+v0, 32+v8,32+v13, 32+v0 + xxsel 32+v3, 32+v5,32+v14, 32+v3 + xxsel 32+v2, 32+v6,32+v15, 32+v2 + stxvd2x 32+v3, r10, r3 + stxvd2x 32+v2, r11, r3 + stxvd2x 32+v1, r8, r3 + stxvd2x 32+v0, r9, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu r1, -224(r1) + mflr r0 + std r14, 96(r1) + std r15, 104(r1) + std r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + + vxor v7, v7, v7 + + li r6, Q_OFFSET + li r7, C20159_OFFSET + lxvx 32+V_MKQ, r6, r4 + lxvx 32+V20159, r7, r4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw v4, 1 + vsubuwm v5, V_26, v4 + vslw V_25, v4, v5 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + 
+ li r14, 16 + li r15, 32 + li r16, 48 + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 + Write_8X + + /* + * To unsigned canonical + */ +.align 4 + addi r3, r3, -512 + vxor v9, v9, v9 + vspltish v10, 15 + vmr v11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld r14, 96(r1) + ld r15, 104(r1) + ld r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + mtlr r0 + addi r1, r1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c6..9c7fe672ab 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3f..f46dbfdbf1 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478a..1b01c4d426 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 0000000000..2fa1cdbcf6 --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - 
MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance-critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md).
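+ * For the liboqs integration, matching glue headers are provided in
+ * integration/liboqs/fips202_glue.h and integration/liboqs/fips202x4_glue.h
+ * (both listed in the META.yml sources above); see the commented-out
+ * defines below.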
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. 
+ * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + *native backends will be used. + * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index 48b117404b..bc5107d7d6 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -463,6 +463,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 9100915359..1d8a0d073c 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -452,6 +452,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 4291d629b1..89fd0de56d 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,6 +18,10 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #if defined(MLK_SYS_RISCV64_RVV) #include "riscv64/meta.h" #endif diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. 
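Backend selection is purely compile-time: as the mlkem/src/native/meta.h hunk above shows, this backend's meta.h is included when MLK_SYS_PPC64LE is defined, and the patch adds no runtime CPU detection. As an illustrative aside (not part of the patch), an integrator on Linux who wants to be defensive could gate use of the backend on POWER8-level (ISA 2.07) vector support roughly as follows; getauxval/AT_HWCAP2 are Linux-specific, and the helper name is hypothetical:

/* Illustrative sketch, not part of the patch: runtime check for POWER8
 * (ISA 2.07) vector support on Linux before opting into this backend. */
#include <stdbool.h>
#include <sys/auxv.h> /* getauxval, AT_HWCAP2 */

#ifndef PPC_FEATURE2_ARCH_2_07
#define PPC_FEATURE2_ARCH_2_07 0x80000000UL /* value from <asm/cputable.h> */
#endif

static bool can_use_ppc64le_backend(void)
{
  /* POWER8 and newer report ISA 2.07 support in AT_HWCAP2. */
  return (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) != 0;
}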
+ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..54b3ddd9c6 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..dbcee3e3ee --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..fa0f7097f5 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 
1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], 
z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..6c59a63b0b --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef 
MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..946ae12e01 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,789 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.machine "any" +.text + +/* Barrett reduce constatnts */ +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Montgomery reduce constatnts */ +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
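+ *
+ * Each invocation handles four vectors of 8 coefficients: r[j] at byte
+ * offsets start, start+next, start+2*next, start+3*next, and the
+ * corresponding r[j+len] vectors at len*2 bytes beyond each of them.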
+ * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
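+ * The inverse-NTT zeta table in consts.c (at ZETA_INTT_OFFSET within
+ * mlk_ppc_qdata) already stores each zeta duplicated per lane, so
+ * Load_next_4zetas can read it sequentially; for this 4-4 layout,
+ * Perm_4zetas then swaps doublewords so each zeta lines up with its
+ * coefficients.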
+ */ +.macro Load_L44Coeffs + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
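+ Overall, t = (20159*a + 2^25) >> 26 = round(a/MLKEM_Q), since
+ 20159 = round(2^26/MLKEM_Q); the vsubuhm above negated t, so the
+ multiply-add below computes a + (-t)*MLKEM_Q = a - t*MLKEM_Q, the
+ centered representative of a modulo MLKEM_Q.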
*/ + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 +.endm + +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 +.endm + +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ +.macro Set_mont_consts + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ +.endm + +.macro Load_next_4zetas + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 +.endm + +.macro Reload_4coeffs + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
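+ * (vmrgew/vmrgow merge the even and odd word elements of two vectors,
+ * undoing the interleave set up by Load_L24Coeffs.)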
+ */ +.macro PermWriteL24 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + PermWriteL44 +.endm + +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 +.endm + +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ + +/* + * mlk_intt_ppc(r) + */ +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx vs0, 0, r4 + + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + + /* Setup for Barrett reduce */ + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + + /* + * Montgomery reduce loops with constant 1441 + */ + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + bdnz intt_ppc__Loopf + + addi r3, r3, -512 + +.align 4 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 +intt_ppc__Loop2: + INTT_REDUCE_L24 + addi r5, r5, 128 + bdnz intt_ppc__Loop2 + +.align 4 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 +intt_ppc__Loop4: + INTT_REDUCE_L44 + addi r5, r5, 128 + bdnz intt_ppc__Loop4 + +.align 4 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li r7, 16 + + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li r7, 32 + + INTT_REDUCE_4X 0, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 16, 64 + + INTT_REDUCE_4X 256, 64 + + addi r14, r14, -64 + INTT_REDUCE_4X 272, 64 + +.align 4 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li r7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 384, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 6. len = 64, start = 0, 128 + */ + li r7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 256, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 320, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + +.align 4 + /* + * 7. 
len = 128, start = 0 + */ + li r7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 64, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 128, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + Load_4Coeffs 192, 16 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + Set_mont_consts + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..3c06f0a319 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,557 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +.macro SAVE_REGS + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 +.endm + +.macro RESTORE_REGS + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. 
+ * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
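+ * The forward-NTT zeta table in consts.c stores the Len=4 and Len=2
+ * zetas pre-duplicated and reordered for exactly these layouts (see
+ * the "For Len=4" and "For ntt Len=2" comments in mlk_ppc_qdata).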
+ */ +.macro Load_L44Coeffs + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 +.endm + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm v30, v28, \_vz3, v3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 + + /* Shift right 1 bit */ + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ +.endm + +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 +.endm + +.macro Write_One + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 +.endm + +.macro Load_next_4zetas + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 +.endm + +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc(int16_t *r) + */ +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + /* load MLKEM_Q */ + lvx V_NMKQ,0,r4 + + /* Register 14 as pointer to zetas array */ + addi r14, r4, ZETA_NTT_OFFSET + + vxor v3, v3, v3 + vspltish v4, 1 + + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 + +.align 4 + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li r7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + +.align 4 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li r7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 + +.align 4 +ntt_ppc__Len4: + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi r5, r5, 128 + + bdnz ntt_ppc__Len4 + + /* + * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ + + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 + +.align 4 +ntt_ppc__Len2: + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi r5, r5, 128 + + bdnz ntt_ppc__Len2 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..5c0703755c --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,192 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 + + /* Shift right 1 bit */ + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 +.endm + +.macro Write_8X + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, 
+ stxvx 32+v22, r8, r1
+ stxvx 32+v23, r9, r1
+ stxvx 32+v24, r10, r1
+ stxvx 32+v25, r11, r1
+ stxvx 32+v26, r12, r1
+ li r6, 240
+ li r7, 256
+ li r8, 272
+ li r9, 288
+ stxvx 32+v27, r6, r1
+ stxvx 32+v28, r7, r1
+ stxvx 32+v29, r8, r1
+ stxvx 32+v30, r9, r1
+
+ li r6, NQ_OFFSET
+ li r7, QINV_OFFSET
+ li r8, C1353_OFFSET
+ lxvx 32+V_NMKQ, r6, r4
+ lxvx 32+V_QINV, r7, r4
+ lxvx 32+V1353, r8, r4
+
+ vxor v3, v3, v3
+ vspltish v4, 1
+
+ li r4, -128
+ li r5, -112
+ li r6, -96
+ li r7, -80
+ li r8, -64
+ li r9, -48
+ li r10, -32
+ li r11, -16
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ MREDUCE_4X v27, v28, v29, v30
+ MREDUCE_4X v13, v18, v23, v7
+ Write_8X
+
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ li r11, 208
+ li r12, 224
+ lxvx 32+v20, r6, r1
+ lxvx 32+v21, r7, r1
+ lxvx 32+v22, r8, r1
+ lxvx 32+v23, r9, r1
+ lxvx 32+v24, r10, r1
+ lxvx 32+v25, r11, r1
+ lxvx 32+v26, r12, r1
+ li r6, 240
+ li r7, 256
+ li r8, 272
+ li r9, 288
+ lxvx 32+v27, r6, r1
+ lxvx 32+v28, r7, r1
+ lxvx 32+v29, r8, r1
+ lxvx 32+v30, r9, r1
+ mtlr r0
+ addi r1, r1, 320
+ blr
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef V1353
+#undef V_QINV
+#undef V_NMKQ
+
+#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \
+ !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S
new file mode 100644
index 0000000000..a6deedffc3
--- /dev/null
+++ b/mlkem/src/native/ppc64le/src/reduce.S
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * Copyright 2025- IBM Corp.
+ *
+ * ===================================================================================
+ * Written by Danny Tsen
+ */
+
+/*
+ * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial;
+ * see the BREDUCE_4X macro below for details of the Barrett reduction
+ *
+ * Arguments: *r: pointer to input/output polynomial
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include "consts.h"
+
+/* Barrett reduction constants */
+#define V20159 0
+#define V_25 1
+#define V_26 2
+#define V_MKQ 3
+
+.machine "any"
+.text
+
+/*
+ * barrett_reduce
+ * t = (a * 20159 + 2^25) >> 26
+ * r = a - t * MLKEM_Q
+ *
+ * -----------------------------------
+ * BREDUCE_4X(_v0, _v1, _v2, _v3)
+ */
+.macro BREDUCE_4X _v0 _v1 _v2 _v3
+ lxvd2x 32+v8, 0, r3
+ lxvd2x 32+v12, r14, r3
+ lxvd2x 32+v16, r15, r3
+ lxvd2x 32+v20, r16, r3
+ addi r3, r3, 64
+ vmulosh v6, v8, V20159
+ vmulesh v5, v8, V20159
+ vmulosh v11, v12, V20159
+ vmulesh v10, v12, V20159
+ vmulosh v15, v16, V20159
+ vmulesh v14, v16, V20159
+ vmulosh v19, v20, V20159
+ vmulesh v18, v20, V20159
+ xxmrglw 32+v4, 32+v5, 32+v6
+ xxmrghw 32+v5, 32+v5, 32+v6
+ xxmrglw 32+v9, 32+v10, 32+v11
+ xxmrghw 32+v10, 32+v10, 32+v11
+ xxmrglw 32+v13, 32+v14, 32+v15
+ xxmrghw 32+v14, 32+v14, 32+v15
+ xxmrglw 32+v17, 32+v18, 32+v19
+ xxmrghw 32+v18, 32+v18, 32+v19
+ vadduwm v4, v4, V_25
+ vadduwm v5, v5, V_25
+ vadduwm v9, v9, V_25
+ vadduwm v10, v10, V_25
+ vadduwm v13, v13, V_25
+ vadduwm v14, v14, V_25
+ vadduwm v17, v17, V_25
+ vadduwm v18, v18, V_25
+ vsraw v4, v4, V_26
+ vsraw v5, v5, V_26
+ vsraw v9, v9, V_26
+ vsraw v10, v10, V_26
+ vsraw v13, v13, V_26
+ vsraw v14, v14, V_26
+ vsraw v17, v17, V_26
+ vsraw v18, v18, V_26
+ vpkuwum v4, v5, v4
+ vsubuhm v4, v7, v4
+ vpkuwum v9, v10, v9
+ vsubuhm v9, v7, v9
+ vpkuwum v13, v14, v13
+ vsubuhm v13, v7, v13
+ vpkuwum v17, v18, v17
+ vsubuhm v17, v7, v17
+ vmladduhm \_v0, v4, V_MKQ, v8
+ vmladduhm \_v1, v9, V_MKQ, v12
+ vmladduhm \_v2, v13, V_MKQ, v16
+ vmladduhm \_v3, v17, V_MKQ, v20
+.endm
+
+.macro Write_8X
+ stxvd2x 32+v21, r4, r3
+ stxvd2x 32+v22, r5, r3
+ stxvd2x 32+v23, r6, r3
+ stxvd2x 32+v24, r7, r3
+ stxvd2x 32+v4, r8, r3
+ stxvd2x 32+v9, r9, r3
+ stxvd2x 32+v13, r10, r3
+ stxvd2x 32+v17, r11, r3
+.endm
+
+/*
+ * Conditional addition to get unsigned canonical representative
+ */
+.macro To_unsigned_16
+ lxvd2x 32+v12, 0, r3
+ lxvd2x 32+v13, r14, r3
+ lxvd2x 32+v14, r15, r3
+ lxvd2x 32+v15, r16, r3
+ addi r3, r3, 64
+ vsrh v1, v12, v10
+ vsrh v0, v13, v10
+ vsrh v3, v14, v10
+ vsrh v2, v15, v10
+ vadduhm v7, v12, v11
+ vadduhm v8, v13, v11
+ vadduhm v5, v14, v11
+ vadduhm v6, v15, v11
+ vcmpequh v1, v1, v9
+ vcmpequh v0, v0, v9
+ vcmpequh v3, v3, v9
+ vcmpequh v2, v2, v9
+ xxsel 32+v1, 32+v7, 32+v12, 32+v1
+ xxsel 32+v0, 32+v8, 32+v13, 32+v0
+ xxsel 32+v3, 32+v5, 32+v14, 32+v3
+ xxsel 32+v2, 32+v6, 32+v15, 32+v2
+ stxvd2x 32+v3, r10, r3
+ stxvd2x 32+v2, r11, r3
+ stxvd2x 32+v1, r8, r3
+ stxvd2x 32+v0, r9, r3
+.endm
+
+.align 4
+.globl MLK_ASM_NAMESPACE(reduce_ppc)
+MLK_ASM_FN_SYMBOL(reduce_ppc)
+ stdu r1, -224(r1)
+ mflr r0
+ std r14, 96(r1)
+ std r15, 104(r1)
+ std r16, 112(r1)
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ stxvx 32+v20, r6, r1
+ stxvx 32+v21, r7, r1
+ stxvx 32+v22, r8, r1
+ stxvx 32+v23, r9, r1
+ stxvx 32+v24, r10, r1
+
+ vxor v7, v7, v7
+
+ li r6, Q_OFFSET
+ li r7, C20159_OFFSET
+ lxvx 32+V_MKQ, r6, r4
+ lxvx 32+V20159, r7, r4
+
+ vspltisw V_26, 13
+ vadduwm V_26, V_26, V_26
+ vspltisw v4, 1
+ vsubuwm v5, V_26, v4
+ vslw V_25, v4, v5
+
+ li r4, -128
+ li r5, -112
+ li r6, -96
+ li r7, -80
+ li r8, -64
+ li r9, -48
+ li r10, -32
+ li r11, -16
+
+ li r14, 16
+ li r15, 32
+ li r16, 48
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ BREDUCE_4X v21, v22, v23, v24
+ BREDUCE_4X v4, v9, v13, v17
+ Write_8X
+
+ /*
+ * To unsigned canonical
+ */
+.align 4
+ addi r3, r3, -512
+ vxor v9, v9, v9
+ vspltish v10, 15
+ vmr v11, V_MKQ
+
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+ To_unsigned_16
+
+ ld r14, 96(r1)
+ ld r15, 104(r1)
+ ld r16, 112(r1)
+ li r6, 128
+ li r7, 144
+ li r8, 160
+ li r9, 176
+ li r10, 192
+ lxvx 32+v20, r6, r1
+ lxvx 32+v21, r7, r1
+ lxvx 32+v22, r8, r1
+ lxvx 32+v23, r9, r1
+ lxvx 32+v24, r10, r1
+ mtlr r0
+ addi r1, r1, 224
+ blr
+
+/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */
+#undef V20159
+#undef V_25
+#undef V_26
+#undef V_MKQ
+
+#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \
+ !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/test/mk/components.mk b/test/mk/components.mk
index 05dc714149..b7abccfbbb 100644
--- a/test/mk/components.mk
+++ b/test/mk/components.mk
@@ -7,6 +7,7 @@ endif
 SOURCES += $(wildcard mlkem/src/*.c)
 ifeq ($(OPT),1)
+ SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS])
 SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS])
 CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
 endif
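
The scalar model below mirrors the per-coefficient arithmetic that the vector macros above implement: MREDUCE_4X in poly_tomont.S (Montgomery reduction after multiplication by 1353) and BREDUCE_4X plus To_unsigned_16 in reduce.S (Barrett reduction followed by a conditional addition of q). It is a minimal sketch for review, not code taken from the repository; the function names and the small self-test are illustrative, while the constants (q = 3329, QINV = -3327, 20159, 1353) are the ones replicated in mlk_ppc_qdata. It assumes the usual arithmetic right shift of negative values.

#include <stdint.h>
#include <stdio.h>

#define Q      3329   /* MLKEM_Q */
#define QINV   -3327  /* q^-1 mod 2^16, as a signed 16-bit value */
#define F1353  1353   /* 2^32 mod q, used by poly_tomont */
#define V20159 20159  /* round(2^26 / q), used by Barrett reduction */

/* Montgomery reduction: for |a| < 2^15 * q, return a value congruent to
 * a * 2^-16 mod q. This is the lane-wise effect of MREDUCE_4X. */
static int16_t montgomery_reduce(int32_t a)
{
  int16_t t = (int16_t)a * QINV;               /* low 16 bits of a * q^-1 */
  return (int16_t)((a - (int32_t)t * Q) >> 16);
}

/* poly_tomont: multiply by 2^32 mod q, then Montgomery-reduce, so each
 * coefficient ends up congruent to a * 2^16 mod q. */
static int16_t tomont(int16_t a)
{
  return montgomery_reduce((int32_t)a * F1353);
}

/* Barrett reduction as in BREDUCE_4X: t = round(a * 20159 / 2^26),
 * then r = a - t*q, a centered representative of a mod q. */
static int16_t barrett_reduce(int16_t a)
{
  int16_t t = (int16_t)(((int32_t)a * V20159 + (1 << 25)) >> 26);
  return (int16_t)(a - t * Q);
}

/* Conditional addition of q, as in To_unsigned_16: map a negative
 * representative to its unsigned canonical value in [0, q). */
static int16_t to_unsigned(int16_t a)
{
  return (int16_t)(a + ((a >> 15) & Q));
}

int main(void)
{
  /* Exhaustive spot-check of the reduction/canonicalization path. */
  for (int32_t a = -32768; a < 32768; a++)
  {
    int16_t r = barrett_reduce((int16_t)a);
    int16_t u = to_unsigned(r);
    if ((u - a) % Q != 0 || u < 0 || u >= Q)
    {
      printf("mismatch at %d\n", (int)a);
      return 1;
    }
  }
  printf("ok: tomont(1) = %d (congruent to 2^16 mod q)\n", tomont(1));
  return 0;
}

Each 128-bit VSX register holds eight int16_t coefficients, so the assembly performs the same computation on eight lanes at a time and loads the replicated constants from mlk_ppc_qdata instead of materializing them with immediates.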