/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-A, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_lw      w10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       x16
#define F_h       x17
#define G_l       count
#define G_h       dst
#define H_l       src
#define H_h       srcend
#define tmp1      x14
#define tmp2      x16
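
/* Copies larger than SMALL_BUFFER_SIZE KiB (48 KiB) take the non-temporal
   ldnp/stnp path below.  */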
#define SMALL_BUFFER_SIZE    48

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  Copies larger than SMALL_BUFFER_SIZE KiB (48 KiB) use
   non-temporal loads and stores (ldnp/stnp) to reduce cache pollution.
*/
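
/* For reference, the size dispatch roughly corresponds to the C-level
   sketch below.  This is an illustrative sketch only: the real code
   inlines every case and overlaps its loads and stores rather than
   branching this cleanly.

     void *__memcpy_aarch64_nt (void *dstin, const void *src, size_t count)
     {
         if (count <= 32) {
             // small copy: 0..32 bytes
         } else if (count <= 128) {
             // medium copy: 33..128 bytes
         } else if (count <= SMALL_BUFFER_SIZE * 1024) {
             // large copy: ldp/stp loop, forwards, or backwards on overlap
         } else {
             // very large copy: non-temporal ldnp/stnp loop
         }
         return dstin;
     }
*/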

ALIAS_SYMBOL (__memmove_aarch64_nt, __memcpy_aarch64_nt)
ENTRY (__memcpy_aarch64_nt)

    add    srcend, src, count
    add    dstend, dstin, count
    cmp    count, 128
    b.hi    L(copy_long)
    cmp    count, 32
    b.hi    L(copy32_128)

    /* Small copies: 0..32 bytes.  */
    cmp    count, 16
    b.lo    L(copy16)
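    /* 16..32 bytes: copy one 16-byte chunk from the start and one from the
       end; the two stores overlap when count < 32.  */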
    ldp    A_l, A_h, [src]
    ldp    D_l, D_h, [srcend, -16]
    stp    A_l, A_h, [dstin]
    stp    D_l, D_h, [dstend, -16]
    ret

    /* Copy 8-15 bytes.  */
L(copy16):
    tbz    count, 3, L(copy8)
    ldr    A_l, [src]
    ldr    A_h, [srcend, -8]
    str    A_l, [dstin]
    str    A_h, [dstend, -8]
    ret

    .p2align 3
    /* Copy 4-7 bytes.  */
L(copy8):
    tbz    count, 2, L(copy4)
    ldr    A_lw, [src]
    ldr    B_lw, [srcend, -4]
    str    A_lw, [dstin]
    str    B_lw, [dstend, -4]
    ret

    /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
    cbz    count, L(copy0)
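    /* Copy the first byte, the byte at count/2 and the last byte: for a
       count of 1, 2 or 3 these three (possibly coincident) stores cover
       every byte.  */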
    lsr    tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    C_lw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    C_lw, [dstend, -1]
L(copy0):
    ret

    .p2align 4
    /* Medium copies: 33..128 bytes.  */
L(copy32_128):
    ldp    A_l, A_h, [src]
    ldp    B_l, B_h, [src, 16]
    ldp    C_l, C_h, [srcend, -32]
    ldp    D_l, D_h, [srcend, -16]
    cmp    count, 64
    b.hi    L(copy128)
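    /* 33..64 bytes: two 16-byte chunks from the start and two from the
       end; the stores overlap in the middle when count < 64.  */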
    stp    A_l, A_h, [dstin]
    stp    B_l, B_h, [dstin, 16]
    stp    C_l, C_h, [dstend, -32]
    stp    D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy 65..128 bytes.  */
L(copy128):
    ldp    E_l, E_h, [src, 32]
    ldp    F_l, F_h, [src, 48]
    cmp    count, 96
    b.ls    L(copy96)
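    /* 97..128 bytes: also copy the 32 bytes ending 32 bytes before the end,
       so that the 64 bytes stored from the start and the 64 bytes stored
       from the end cover the whole buffer.  */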
    ldp    G_l, G_h, [srcend, -64]
    ldp    H_l, H_h, [srcend, -48]
    stp    G_l, G_h, [dstend, -64]
    stp    H_l, H_h, [dstend, -48]
L(copy96):
    stp    A_l, A_h, [dstin]
    stp    B_l, B_h, [dstin, 16]
    stp    E_l, E_h, [dstin, 32]
    stp    F_l, F_h, [dstin, 48]
    stp    C_l, C_h, [dstend, -32]
    stp    D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    /* Copy more than 128 bytes.  */
L(copy_long):
    mov    tmp2, #SMALL_BUFFER_SIZE
    cmp    count, tmp2, LSL #10
    b.gt    L(copy_long_nt)
    /* Use backwards copy if there is an overlap.  */
    sub    tmp1, dstin, src
    cbz    tmp1, L(copy0)
    cmp    tmp1, count
    b.lo    L(copy_long_backwards)

    /* Copy 16 bytes and then align dst to 16-byte alignment.  */

    ldp    D_l, D_h, [src]
    and    tmp1, dstin, 15
    bic    dst, dstin, 15
    sub    src, src, tmp1
    add    count, count, tmp1    /* Count is now 16 too large.  */
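    /* dst is now the 16-byte aligned address at or below dstin and src has
       been pulled back by the same amount, so loads and stores stay in
       lock-step; the bytes below dst + 16 are covered by the store of D
       at dstin below.  */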
    ldp    A_l, A_h, [src, 16]
    stp    D_l, D_h, [dstin]
    ldp    B_l, B_h, [src, 32]
    ldp    C_l, C_h, [src, 48]
    ldp    D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16    /* Test and readjust count.  */
    b.ls    L(copy64_from_end)

L(loop64):
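    /* Software-pipelined loop: each iteration stores the 64 bytes loaded
       by the previous iteration (or the prologue above) while loading the
       next 64 bytes.  */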
    stp    A_l, A_h, [dst, 16]
    ldp    A_l, A_h, [src, 16]
    stp    B_l, B_h, [dst, 32]
    ldp    B_l, B_h, [src, 32]
    stp    C_l, C_h, [dst, 48]
    ldp    C_l, C_h, [src, 48]
    stp    D_l, D_h, [dst, 64]!
    ldp    D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    L(loop64)

    /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
    ldp    E_l, E_h, [srcend, -64]
    stp    A_l, A_h, [dst, 16]
    ldp    A_l, A_h, [srcend, -48]
    stp    B_l, B_h, [dst, 32]
    ldp    B_l, B_h, [srcend, -32]
    stp    C_l, C_h, [dst, 48]
    ldp    C_l, C_h, [srcend, -16]
    stp    D_l, D_h, [dst, 64]
    stp    E_l, E_h, [dstend, -64]
    stp    A_l, A_h, [dstend, -48]
    stp    B_l, B_h, [dstend, -32]
    stp    C_l, C_h, [dstend, -16]
    ret

    .p2align 4

    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
    ldp    D_l, D_h, [srcend, -16]
    and    tmp1, dstend, 15
    sub    srcend, srcend, tmp1
    sub    count, count, tmp1
    ldp    A_l, A_h, [srcend, -16]
    stp    D_l, D_h, [dstend, -16]
    ldp    B_l, B_h, [srcend, -32]
    ldp    C_l, C_h, [srcend, -48]
    ldp    D_l, D_h, [srcend, -64]!
    sub    dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start)

L(loop64_backwards):
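    /* Mirror of L(loop64), moving backwards: store the previously loaded
       64 bytes while loading the next (lower) 64 bytes.  */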
    stp    A_l, A_h, [dstend, -16]
    ldp    A_l, A_h, [srcend, -16]
    stp    B_l, B_h, [dstend, -32]
    ldp    B_l, B_h, [srcend, -32]
    stp    C_l, C_h, [dstend, -48]
    ldp    C_l, C_h, [srcend, -48]
    stp    D_l, D_h, [dstend, -64]!
    ldp    D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    L(loop64_backwards)

    /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
    ldp    G_l, G_h, [src, 48]
    stp    A_l, A_h, [dstend, -16]
    ldp    A_l, A_h, [src, 32]
    stp    B_l, B_h, [dstend, -32]
    ldp    B_l, B_h, [src, 16]
    stp    C_l, C_h, [dstend, -48]
    ldp    C_l, C_h, [src]
    stp    D_l, D_h, [dstend, -64]
    stp    G_l, G_h, [dstin, 48]
    stp    A_l, A_h, [dstin, 32]
    stp    B_l, B_h, [dstin, 16]
    stp    C_l, C_h, [dstin]
    ret

    .p2align 4
    /* Copy more than 48 KiB using ldnp+stnp (non-temporal) instructions.  */
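    /* ldnp/stnp only give the memory system a non-temporal hint and have no
       writeback addressing modes, so the pointers are advanced with explicit
       adds; otherwise this mirrors the ldp/stp path above.  */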
L(copy_long_nt):
    /* Use backwards copy if there is an overlap.  */
    sub    tmp1, dstin, src
    cbz    tmp1, L(copy0)
    cmp    tmp1, count
    b.lo    L(copy_long_backwards_nt)

    /* Copy 16 bytes and then align dst to 16-byte alignment.  */

    ldnp    D_l, D_h, [src]
    and    tmp1, dstin, 15
    bic    dst, dstin, 15
    sub    src, src, tmp1
    add    count, count, tmp1    /* Count is now 16 too large.  */
    ldnp    A_l, A_h, [src, 16]
    stnp    D_l, D_h, [dstin]
    ldnp    B_l, B_h, [src, 32]
    ldnp    C_l, C_h, [src, 48]
    ldnp    D_l, D_h, [src, 64]
    add     src, src, #64
    subs    count, count, 128 + 16    /* Test and readjust count.  */
    b.ls    L(copy64_from_end_nt)

L(loop64_nt):
    stnp    A_l, A_h, [dst, 16]
    ldnp    A_l, A_h, [src, 16]
    stnp    B_l, B_h, [dst, 32]
    ldnp    B_l, B_h, [src, 32]
    stnp    C_l, C_h, [dst, 48]
    ldnp    C_l, C_h, [src, 48]
    stnp    D_l, D_h, [dst, 64]
    add    dst, dst, #64
    ldnp    D_l, D_h, [src, 64]
    add    src, src, #64
    subs    count, count, 64
    b.hi    L(loop64_nt)

    /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end_nt):
    ldnp    E_l, E_h, [srcend, -64]
    stnp    A_l, A_h, [dst, 16]
    ldnp    A_l, A_h, [srcend, -48]
    stnp    B_l, B_h, [dst, 32]
    ldnp    B_l, B_h, [srcend, -32]
    stnp    C_l, C_h, [dst, 48]
    ldnp    C_l, C_h, [srcend, -16]
    stnp    D_l, D_h, [dst, 64]
    stnp    E_l, E_h, [dstend, -64]
    stnp    A_l, A_h, [dstend, -48]
    stnp    B_l, B_h, [dstend, -32]
    stnp    C_l, C_h, [dstend, -16]
    ret

    .p2align 4

    /* Large backwards copy for overlapping copies.
       Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards_nt):
    ldnp    D_l, D_h, [srcend, -16]
    and    tmp1, dstend, 15
    sub    srcend, srcend, tmp1
    sub    count, count, tmp1
    ldnp    A_l, A_h, [srcend, -16]
    stnp    D_l, D_h, [dstend, -16]
    ldnp    B_l, B_h, [srcend, -32]
    ldnp    C_l, C_h, [srcend, -48]
    ldnp    D_l, D_h, [srcend, -64]
    add     srcend, srcend, #-64
    sub    dstend, dstend, tmp1
    subs    count, count, 128
    b.ls    L(copy64_from_start_nt)

L(loop64_backwards_nt):
    stnp    A_l, A_h, [dstend, -16]
    ldnp    A_l, A_h, [srcend, -16]
    stnp    B_l, B_h, [dstend, -32]
    ldnp    B_l, B_h, [srcend, -32]
    stnp    C_l, C_h, [dstend, -48]
    ldnp    C_l, C_h, [srcend, -48]
    stnp    D_l, D_h, [dstend, -64]
    add     dstend, dstend, #-64
    ldnp    D_l, D_h, [srcend, -64]
    add     srcend, srcend, #-64
    subs    count, count, 64
    b.hi    L(loop64_backwards_nt)

    /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start_nt):
    ldnp    G_l, G_h, [src, 48]
    stnp    A_l, A_h, [dstend, -16]
    ldnp    A_l, A_h, [src, 32]
    stnp    B_l, B_h, [dstend, -32]
    ldnp    B_l, B_h, [src, 16]
    stnp    C_l, C_h, [dstend, -48]
    ldnp    C_l, C_h, [src]
    stnp    D_l, D_h, [dstend, -64]
    stnp    G_l, G_h, [dstin, 48]
    stnp    A_l, A_h, [dstin, 32]
    stnp    B_l, B_h, [dstin, 16]
    stnp    C_l, C_h, [dstin]
    ret

END (__memcpy_aarch64_nt)