diff --git a/src/string/aarch64/memcpy.S b/src/string/aarch64/memcpy.S
new file mode 100644
index 00000000..584fcf9c
--- /dev/null
+++ b/src/string/aarch64/memcpy.S
@@ -0,0 +1,348 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2022, Arm Limited.
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
+#define tmp2	x16
+#define SMALL_BUFFER_SIZE	48
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point. It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies. The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration. The destination pointer is 16-byte aligned to minimize
+   unaligned accesses. The loop tail is handled by always copying 64 bytes
+   from the end.
+*/
+
+.global memcpy
+.type memcpy,%function
+memcpy:
+	.cfi_startproc
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	.Lcopy_long
+	cmp	count, 32
+	b.hi	.Lcopy32_128
+
+	/* Small copies: 0..32 bytes. */
+	cmp	count, 16
+	b.lo	.Lcopy16
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes. */
+.Lcopy16:
+	tbz	count, 3, .Lcopy8
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes. */
+.Lcopy8:
+	tbz	count, 2, .Lcopy4
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence. */
+.Lcopy4:
+	cbz	count, .Lcopy0
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+.Lcopy0:
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+.Lcopy32_128:
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	.Lcopy128
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes. */
+.Lcopy128:
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	.Lcopy96
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+.Lcopy96:
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes. */
+.Lcopy_long:
+	mov	tmp2, #SMALL_BUFFER_SIZE
+	cmp	count, tmp2, LSL#10
+	b.gt	.Lcopy_long_nt
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end
+
+.Lloop64:
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	.Lloop64
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards:
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start
+
+.Lloop64_backwards:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
+	.p2align 4
+	/* Copy more than 48 KB using ldnp+stnp (non-temporal) instructions. */
+.Lcopy_long_nt:
+	/* Use backwards copy if there is an overlap. */
+	sub	tmp1, dstin, src
+	cbz	tmp1, .Lcopy0
+	cmp	tmp1, count
+	b.lo	.Lcopy_long_backwards_nt
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+	ldnp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large. */
+	ldnp	A_l, A_h, [src, 16]
+	stnp	D_l, D_h, [dstin]
+	ldnp	B_l, B_h, [src, 32]
+	ldnp	C_l, C_h, [src, 48]
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 128 + 16	/* Test and readjust count. */
+	b.ls	.Lcopy64_from_end_nt
+
+.Lloop64_nt:
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [src, 16]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [src, 32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [src, 48]
+	stnp	D_l, D_h, [dst, 64]
+	add	dst, dst, #64
+	ldnp	D_l, D_h, [src, 64]
+	add	src, src, #64
+	subs	count, count, 64
+	b.hi	.Lloop64_nt
+
+	/* Write the last iteration and copy 64 bytes from the end. */
+.Lcopy64_from_end_nt:
+	ldnp	E_l, E_h, [srcend, -64]
+	stnp	A_l, A_h, [dst, 16]
+	ldnp	A_l, A_h, [srcend, -48]
+	stnp	B_l, B_h, [dst, 32]
+	ldnp	B_l, B_h, [srcend, -32]
+	stnp	C_l, C_h, [dst, 48]
+	ldnp	C_l, C_h, [srcend, -16]
+	stnp	D_l, D_h, [dst, 64]
+	stnp	E_l, E_h, [dstend, -64]
+	stnp	A_l, A_h, [dstend, -48]
+	stnp	B_l, B_h, [dstend, -32]
+	stnp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+.Lcopy_long_backwards_nt:
+	ldnp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldnp	A_l, A_h, [srcend, -16]
+	stnp	D_l, D_h, [dstend, -16]
+	ldnp	B_l, B_h, [srcend, -32]
+	ldnp	C_l, C_h, [srcend, -48]
+	ldnp	D_l, D_h, [srcend, -64]
+	add	srcend, srcend, #-64
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	.Lcopy64_from_start_nt
+
+.Lloop64_backwards_nt:
+	stnp	A_l, A_h, [dstend, -16]
+	ldnp	A_l, A_h, [srcend, -16]
+	stnp	B_l, B_h, [dstend, -32]
+	ldnp	B_l, B_h, [srcend, -32]
+	stnp	C_l, C_h, [dstend, -48]
+	ldnp	C_l, C_h, [srcend, -48]
+	stnp	D_l, D_h, [dstend, -64]
+	add	dstend, dstend, #-64
+	ldnp	D_l, D_h, [srcend, -64]
+	add	srcend, srcend, #-64
+	subs	count, count, 64
+	b.hi	.Lloop64_backwards_nt
+
+	/* Write the last iteration and copy 64 bytes from the start. */
+.Lcopy64_from_start_nt:
+	ldnp	G_l, G_h, [src, 48]
+	stnp	A_l, A_h, [dstend, -16]
+	ldnp	A_l, A_h, [src, 32]
+	stnp	B_l, B_h, [dstend, -32]
+	ldnp	B_l, B_h, [src, 16]
+	stnp	C_l, C_h, [dstend, -48]
+	ldnp	C_l, C_h, [src]
+	stnp	D_l, D_h, [dstend, -64]
+	stnp	G_l, G_h, [dstin, 48]
+	stnp	A_l, A_h, [dstin, 32]
+	stnp	B_l, B_h, [dstin, 16]
+	stnp	C_l, C_h, [dstin]
+	ret
+	.cfi_endproc
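[Reviewer note, not part of the patch] The routine above dispatches on size at 32 bytes, 128 bytes, and SMALL_BUFFER_SIZE << 10 = 48 KiB (where it switches to the non-temporal ldnp/stnp path), and its header comment states that overlapping buffers are handled from the single entry point. A small C harness along the following lines can exercise each class. This is only a sketch: the size list, buffer sizes, and file names are my own choices, and it assumes memcpy.S is assembled and linked into the test binary (e.g. cc -O2 test_memcpy.c memcpy.S).

/* test_memcpy.c (hypothetical): checks the size classes and overlap
 * handling described in the header comment of memcpy.S. */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Sizes straddling the 32-byte, 128-byte and 48 KiB thresholds. */
	static const size_t sizes[] = {
		0, 1, 3, 7, 8, 15, 16, 31, 32, 33, 64, 96, 127, 128, 129,
		4096, 48 * 1024, 48 * 1024 + 1, 256 * 1024,
	};
	const size_t max = 256 * 1024, pad = 64;
	unsigned char *src = malloc(max + pad);
	unsigned char *dst = malloc(max + pad);
	unsigned char *ref = malloc(max + pad);
	/* Call through a volatile pointer so the compiler emits real calls
	 * to the linked memcpy instead of expanding its own builtin. */
	void *(*volatile copy)(void *, const void *, size_t) = memcpy;
	size_t i, j, n;

	assert(src && dst && ref);
	for (i = 0; i < max + pad; i++)
		src[i] = (unsigned char)(i * 131 + 7);

	for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		n = sizes[i];

		/* Non-overlapping copy must match a byte-wise reference. */
		memset(dst, 0xaa, max + pad);
		for (j = 0; j < n; j++)
			ref[j] = src[j];
		copy(dst, src, n);
		assert(memcmp(dst, ref, n) == 0);

		/* Overlapping copy (dst = src + 16): the entry point is
		 * documented to behave like memmove here. */
		if (n >= 16) {
			for (j = 0; j < n + 16; j++)
				dst[j] = (unsigned char)j;
			memmove(ref, dst, n + 16);
			copy(dst + 16, dst, n);
			assert(memcmp(dst + 16, ref, n) == 0);
		}
	}
	free(src);
	free(dst);
	free(ref);
	return 0;
}

Sizes just above 48 * 1024 force the non-temporal path, while the dst = src + 16 case checks the memmove-style behaviour promised by the header comment.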
diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
new file mode 100644
index 00000000..0aa0f614
--- /dev/null
+++ b/src/string/aarch64/memset.S
@@ -0,0 +1,220 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+   Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp4		x10
+#define SMALL_BUFFER_SIZE	96
+
+.global memset
+.type memset,%function
+memset:
+	.cfi_startproc
+	mov	dst, dstin		/* Preserve return value. */
+	ands	A_lw, val, #255
+	b.eq	.Lzero_mem		/* Use the DC ZVA instruction if val == 0. */
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail_maybe_tiny:
+	cmp	count, #15
+	b.le	.Ltail15tiny
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	add	dst, dst, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst, #-48]
+1:
+	stp	A_l, A_l, [dst, #-32]
+2:
+	stp	A_l, A_l, [dst, #-16]
+.Ltail15:
+	and	count, count, #15
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+	ret
+.Ltail15tiny:
+	/* Set up to 15 bytes. Does not assume earlier memory
+	   being set. */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	ret
+	/* Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line, this ensures the entire loop is in one line. */
+	.p2align 6
+.Lnot_short:
+	mov	tmp4, #SMALL_BUFFER_SIZE
+	cmp	count, tmp4, LSL#10
+	/* Use non-temporal instructions if count > SMALL_BUFFER_SIZE KiB. */
+	b.gt	.Lnot_short_nt
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	2f
+	/* Bring DST to 128-bit (16-byte) alignment. We know that there's
+	 * more than that to set, so we simply store 16 bytes and advance by
+	 * the amount required to reach alignment. */
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now. */
+	cmp	count, #63
+	b.le	.Ltail63
+2:
+	sub	dst, dst, #16		/* Pre-bias. */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+	ret
+.Lnot_short_nt:
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	2f
+	/* Bring DST to 128-bit (16-byte) alignment. We know that there's
+	 * more than that to set, so we simply store 16 bytes and advance by
+	 * the amount required to reach alignment. */
+	sub	count, count, tmp2
+	stnp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now. */
+	cmp	count, #63
+	b.le	.Ltail63
+2:
+	sub	dst, dst, #16		/* Pre-bias. */
+	sub	count, count, #64
+1:
+	stnp	A_l, A_l, [dst, #16]
+	stnp	A_l, A_l, [dst, #32]
+	stnp	A_l, A_l, [dst, #48]
+	stnp	A_l, A_l, [dst, #64]
+	add	dst, dst, #64
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+	ret
+.Lzero_mem:
+	mov	A_l, #0
+	cmp	count, #63
+	b.le	.Ltail_maybe_tiny
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	1f
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	cmp	count, #63
+	b.le	.Ltail63
+1:
+	/* For zeroing small amounts of memory, it's not worth setting up
+	 * the line-clear code. */
+	cmp	count, #128
+	b.lt	.Lnot_short
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
+	lsl	zva_len, tmp3w, zva_len
+.Lzero_by_line:
+	/* Compute how far we need to go to become suitably aligned. We're
+	 * already at quad-word alignment. */
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment. */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	1f			/* Already aligned. */
+	/* Not aligned, check that there's enough to copy after alignment. */
+	sub	tmp1, count, tmp2
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/* We know that there's at least 64 bytes to zero and that it's safe
+	 * to overrun by 64 bytes. */
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards. */
+	add	dst, dst, tmp2
+1:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
+	.cfi_endproc
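[Reviewer note, not part of the patch] The .Lzero_mem path above uses DC ZVA only when DCZID_EL0 bit 4 (DZP) is clear, and derives the zeroing block size as 4 << (DCZID_EL0 & 15) bytes, which is what the mov #4 / lsl pair computes; counts above SMALL_BUFFER_SIZE << 10 = 96 KiB take the stnp path instead. The sketch below mirrors that DCZID_EL0 decoding so the block size can be inspected on target hardware; the helper name and the surrounding program are illustrative only.

/* show_dczid.c (hypothetical): decode DCZID_EL0 the same way memset.S does. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t dc_zva_block_size(void)
{
	uint64_t dczid;

	/* DCZID_EL0 is readable from EL0. */
	__asm__("mrs %0, dczid_el0" : "=r"(dczid));
	if (dczid & (1u << 4))		/* DZP set: DC ZVA is prohibited. */
		return 0;
	/* BS field (bits 3:0) gives the block size as 2^BS words,
	 * i.e. 4 << BS bytes, matching the assembly's mov #4 / lsl. */
	return (size_t)4 << (dczid & 15);
}

int main(void)
{
	size_t bs = dc_zva_block_size();

	if (bs)
		printf("DC ZVA zeroes %zu bytes per instruction\n", bs);
	else
		printf("DC ZVA unavailable; memset falls back to stp/stnp\n");
	return 0;
}

Typical implementations report a 64-byte block, but the assembly reads the register at run time rather than assuming a fixed size.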