diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
new file mode 100644
index 00000000..584fcf9c
--- /dev/null
+++ b/src/string/aarch64/memcpy.S
@@ -0,0 +1,348 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
+#define tmp2	x16
+#define SMALL_BUFFER_SIZE	48
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point. It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies. The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration. The destination pointer is 16-byte aligned to minimize
+   unaligned accesses. The loop tail is handled by always copying 64 bytes
+   from the end.
+*/
+
+.global memcpy
+.type memcpy,%function
+memcpy:
+	.cfi_startproc
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	.Lcopy_long
+	cmp	count, 32
+	b.hi	.Lcopy32_128
+
+	/* Small copies: 0..32 bytes. */
+	cmp	count, 16
+	b.lo	.Lcopy16
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes. */
+.Lcopy16:
+	tbz	count, 3, .Lcopy8
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes. */
+.Lcopy8:
+	tbz	count, 2, .Lcopy4
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence. */
+.Lcopy4:
+	cbz	count, .Lcopy0
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+.Lcopy32_128:
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	.Lcopy128
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes. */
+.Lcopy128:
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	.Lcopy96
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes. */
+.Lcopy_long:
+	mov	tmp2, #SMALL_BUFFER_SIZE
+	cmp	count, tmp2, LSL#10
+	b.gt	.Lcopy_long_nt
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end
+
+.Lloop64:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	.Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards:
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start
+
+.Lloop64_backwards:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+	.p2align 4
+	/* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions. */
+.Lcopy_long_nt:
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards_nt
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldnp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldnp	A_l, A_h, [src, 16]
+	stnp	D_l, D_h, [dstin]
+	ldnp	B_l, B_h, [src, 32]
+	ldnp	C_l, C_h, [src, 48]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end_nt
+
+.Lloop64_nt:
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [src, 16]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [src, 32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [src, 48]
+	stnp	D_l, D_h, [dst, 64]
+	add	dst, dst, #64
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 64
+	b.hi	.Lloop64_nt
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end_nt:
+	ldnp	E_l, E_h, [srcend, -64]
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [srcend, -48]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [srcend, -32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [srcend, -16]
+	stnp	D_l, D_h, [dst, 64]
+	stnp	E_l, E_h, [dstend, -64]
+	stnp	A_l, A_h, [dstend, -48]
+	stnp	B_l, B_h, [dstend, -32]
+	stnp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards_nt:
+	ldnp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldnp	A_l, A_h, [srcend, -16]
+	stnp	D_l, D_h, [dstend, -16]
+	ldnp	B_l, B_h, [srcend, -32]
+	ldnp	C_l, C_h, [srcend, -48]
+	ldnp	D_l, D_h, [srcend, -64]
+	add	srcend, srcend, #-64
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start_nt
+
+.Lloop64_backwards_nt:
+	stnp	A_l, A_h, [dstend, -16]
+	ldnp	A_l, A_h, [srcend, -16]
+	stnp	B_l, B_h, [dstend, -32]
+	ldnp	B_l, B_h, [srcend, -32]
+	stnp	C_l, C_h, [dstend, -48]
+	ldnp	C_l, C_h, [srcend, -48]
+	stnp	D_l, D_h, [dstend, -64]
+	add	dstend, dstend, #-64
+	ldnp	D_l, D_h, [srcend, -64]
+	add	srcend, srcend, #-64
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards_nt
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start_nt:
+	ldnp	G_l, G_h, [src, 48]
+	stnp	A_l, A_h, [dstend, -16]
+	ldnp	A_l, A_h, [src, 32]
+	stnp	B_l, B_h, [dstend, -32]
+	ldnp	B_l, B_h, [src, 16]
+	stnp	C_l, C_h, [dstend, -48]
+	ldnp	C_l, C_h, [src]
+	stnp	D_l, D_h, [dstend, -64]
+	stnp	G_l, G_h, [dstin, 48]
+	stnp	A_l, A_h, [dstin, 32]
+	stnp	B_l, B_h, [dstin, 16]
+	stnp	C_l, C_h, [dstin]
+	ret
+	.cfi_endproc
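[Reviewer note, not part of the patch] The routine above dispatches on size at 32 bytes, 128 bytes, and SMALL_BUFFER_SIZE << 10 = 48 KiB (where it switches to the non-temporal ldnp/stnp path), and its header comment states that overlapping buffers are handled from the single entry point. A small C harness along the following lines can exercise each class. This is only a sketch: the size list, buffer sizes, and file names are my own choices, and it assumes memcpy.S is assembled and linked into the test binary (e.g. cc -O2 test_memcpy.c memcpy.S).

/* test_memcpy.c (hypothetical): checks the size classes and overlap
 * handling described in the header comment of memcpy.S. */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Sizes straddling the 32-byte, 128-byte and 48 KiB thresholds. */
	static const size_t sizes[] = {
		0, 1, 3, 7, 8, 15, 16, 31, 32, 33, 64, 96, 127, 128, 129,
		4096, 48 * 1024, 48 * 1024 + 1, 256 * 1024,
	};
	const size_t max = 256 * 1024, pad = 64;
	unsigned char *src = malloc(max + pad);
	unsigned char *dst = malloc(max + pad);
	unsigned char *ref = malloc(max + pad);
	/* Call through a volatile pointer so the compiler emits real calls
	 * to the linked memcpy instead of expanding its own builtin. */
	void *(*volatile copy)(void *, const void *, size_t) = memcpy;
	size_t i, j, n;

	assert(src && dst && ref);
	for (i = 0; i < max + pad; i++)
		src[i] = (unsigned char)(i * 131 + 7);

	for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		n = sizes[i];

		/* Non-overlapping copy must match a byte-wise reference. */
		memset(dst, 0xaa, max + pad);
		for (j = 0; j < n; j++)
			ref[j] = src[j];
		copy(dst, src, n);
		assert(memcmp(dst, ref, n) == 0);

		/* Overlapping copy (dst = src + 16): the entry point is
		 * documented to behave like memmove here. */
		if (n >= 16) {
			for (j = 0; j < n + 16; j++)
				dst[j] = (unsigned char)j;
			memmove(ref, dst, n + 16);
			copy(dst + 16, dst, n);
			assert(memcmp(dst + 16, ref, n) == 0);
		}
	}
	free(src);
	free(dst);
	free(ref);
	return 0;
}

Sizes just above 48 * 1024 force the non-temporal path, while the dst = src + 16 case checks the memmove-style behaviour promised by the header comment.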
diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
new file mode 100644
index 00000000..0aa0f614
--- /dev/null
+++ b/src/string/aarch64/memset.S
@@ -0,0 +1,220 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+   Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp4		x10
+#define SMALL_BUFFER_SIZE	96
+
+.global memset
+.type memset,%function
+memset:
+	.cfi_startproc
+	mov	dst, dstin		/* Preserve return value. */
+	ands	A_lw, val, #255
+	b.eq	.Lzero_mem		/* Use the DC ZVA instruction if val == 0. */
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail_maybe_tiny:
+	cmp	count, #15
+	b.le	.Ltail15tiny
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	add	dst, dst, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst, #-48]
+1:
+	stp	A_l, A_l, [dst, #-32]
+2:
+	stp	A_l, A_l, [dst, #-16]
+.Ltail15:
+	and	count, count, #15
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+	ret
+.Ltail15tiny:
+	/* Set up to 15 bytes. Does not assume earlier memory
+	   being set. */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	ret
+	/* Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line, this ensures the entire loop is in one line. */
+	.p2align 6
+.Lnot_short:
+	mov	tmp4, #SMALL_BUFFER_SIZE
+	cmp	count, tmp4, LSL#10
+	/* Use non-temporal instructions if count > SMALL_BUFFER_SIZE KiB. */
+	b.gt	.Lnot_short_nt
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	2f
+	/* Bring DST to 128-bit (16-byte) alignment. We know that there's
+	 * more than that to set, so we simply store 16 bytes and advance by
+	 * the amount required to reach alignment. */
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now. */
+	cmp	count, #63
+	b.le	.Ltail63
+2:
+	sub	dst, dst, #16		/* Pre-bias. */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+	ret
+.Lnot_short_nt:
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	2f
+	/* Bring DST to 128-bit (16-byte) alignment. We know that there's
+	 * more than that to set, so we simply store 16 bytes and advance by
+	 * the amount required to reach alignment. */
+	sub	count, count, tmp2
+	stnp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now. */
+	cmp	count, #63
+	b.le	.Ltail63
+2:
+	sub	dst, dst, #16		/* Pre-bias. */
+	sub	count, count, #64
+1:
+	stnp	A_l, A_l, [dst, #16]
+	stnp	A_l, A_l, [dst, #32]
+	stnp	A_l, A_l, [dst, #48]
+	stnp	A_l, A_l, [dst, #64]
+	add	dst, dst, #64
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+	ret
+.Lzero_mem:
+	mov	A_l, #0
+	cmp	count, #63
+	b.le	.Ltail_maybe_tiny
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	1f
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	cmp	count, #63
+	b.le	.Ltail63
+1:
+	/* For zeroing small amounts of memory, it's not worth setting up
+	 * the line-clear code. */
+	cmp	count, #128
+	b.lt	.Lnot_short
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
+	lsl	zva_len, tmp3w, zva_len
+.Lzero_by_line:
+	/* Compute how far we need to go to become suitably aligned. We're
+	 * already at quad-word alignment. */
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment. */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	1f			/* Already aligned. */
+	/* Not aligned, check that there's enough to copy after alignment. */
+	sub	tmp1, count, tmp2
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/* We know that there's at least 64 bytes to zero and that it's safe
+	 * to overrun by 64 bytes. */
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards. */
+	add	dst, dst, tmp2
+1:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
+	.cfi_endproc
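[Reviewer note, not part of the patch] The .Lzero_mem path above uses DC ZVA only when DCZID_EL0 bit 4 (DZP) is clear, and derives the zeroing block size as 4 << (DCZID_EL0 & 15) bytes, which is what the mov #4 / lsl pair computes; counts above SMALL_BUFFER_SIZE << 10 = 96 KiB take the stnp path instead. The sketch below mirrors that DCZID_EL0 decoding so the block size can be inspected on target hardware; the helper name and the surrounding program are illustrative only.

/* show_dczid.c (hypothetical): decode DCZID_EL0 the same way memset.S does. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t dc_zva_block_size(void)
{
	uint64_t dczid;

	/* DCZID_EL0 is readable from EL0. */
	__asm__("mrs %0, dczid_el0" : "=r"(dczid));
	if (dczid & (1u << 4))		/* DZP set: DC ZVA is prohibited. */
		return 0;
	/* BS field (bits 3:0) gives the block size as 2^BS words,
	 * i.e. 4 << BS bytes, matching the assembly's mov #4 / lsl. */
	return (size_t)4 << (dczid & 15);
}

int main(void)
{
	size_t bs = dc_zva_block_size();

	if (bs)
		printf("DC ZVA zeroes %zu bytes per instruction\n", bs);
	else
		printf("DC ZVA unavailable; memset falls back to stp/stnp\n");
	return 0;
}

Typical implementations report a 64-byte block, but the assembly reads the register at run time rather than assuming a fixed size.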