/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)
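
/* The C sketch below is illustrative documentation only and is not part of
   the routine above.  It is a minimal, hedged rendering of the small-copy
   strategy used by the 0..32 byte paths ending at L(copy0): load one chunk
   from the start and one from the end of the buffer so that a single
   fixed-size pair of accesses covers every size in the range, letting the
   two chunks overlap in the middle, and handle 1..3 bytes branchlessly as
   in L(copy4).  The function name small_copy_sketch is hypothetical and
   does not exist in this codebase; the fixed-size memcpy calls stand in
   for the unaligned LDP/STP and LDR/STR pairs used above.  Unlike the
   assembly, which loads all data before storing any of it, this sketch
   stores the first chunk before loading the second, so it illustrates only
   the non-overlapping (memcpy) case, not the memmove behaviour.

   #include <stddef.h>
   #include <string.h>

   static void small_copy_sketch (unsigned char *dst,
                                  const unsigned char *src, size_t count)
   {
     if (count >= 16)
       {
         // 16..32 bytes: two 16-byte chunks that may overlap in the middle.
         memcpy (dst, src, 16);
         memcpy (dst + count - 16, src + count - 16, 16);
       }
     else if (count >= 8)
       {
         // 8..15 bytes: two overlapping 8-byte chunks, as in L(copy16).
         memcpy (dst, src, 8);
         memcpy (dst + count - 8, src + count - 8, 8);
       }
     else if (count >= 4)
       {
         // 4..7 bytes: two overlapping 4-byte chunks, as in L(copy8).
         memcpy (dst, src, 4);
         memcpy (dst + count - 4, src + count - 4, 4);
       }
     else if (count != 0)
       {
         // 1..3 bytes: branchless, mirrors L(copy4).  For count 1 all three
         // bytes coincide; for count 2 the middle byte equals the last one.
         size_t half = count >> 1;
         unsigned char a = src[0];
         unsigned char b = src[half];
         unsigned char c = src[count - 1];
         dst[0] = a;
         dst[half] = b;
         dst[count - 1] = c;
       }
   }
*/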