/*
 * memcpy/memmove using SIMD registers
 *
 * Copyright (c) 2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation of memcpy correctly handles overlaps, therefore
   __memmove_aarch64_simd aliases to __memcpy_aarch64_simd. By moving the
   src and dst buffer overlap check from the start of memmove code to the
   beginning of large copy code, the overhead of combining memcpy
   and memmove implementations is negligible.

   Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..128 bytes which are fully unrolled, and large
   copies (moves).

   Large forward moves align the source and use an unrolled loop
   processing 64 bytes per iteration.

   Large backward moves align srcend and use an unrolled loop processing
   64 bytes per iteration.
*/

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16_simd)
	cmp	count, 128
	b.hi	L(move_long_simd)

	/* Medium copies: 17..128 bytes.  */
	ldr	A_q, [src]
	ldr	D_q, [srcend, -16]
	cmp	count, 32
	b.hi	L(copy33_128_simd)
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16_simd):
	/* 8-15 bytes.  */
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 4
1:
	/* 4-7 bytes.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	.p2align 4
	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 33..128 bytes.  */
L(copy33_128_simd):
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy65_128_simd)
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy65_128_simd):
	ldr	E_q, [src, 32]
	ldr	F_q, [src, 48]
	ldr	G_q, [srcend, -64]
	ldr	H_q, [srcend, -48]
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
	str	E_q, [dstin, 32]
	str	F_q, [dstin, 48]
	str	G_q, [dstend, -64]
	str	H_q, [dstend, -48]
	ret

	.p2align 4
	/* Move more than 128 bytes.  */
L(move_long_simd):
	sub	tmp1, dstin, src	/* Overlap check.  */
	cbz	tmp1, L(copy0_simd)
	cmp	tmp1, count
	b.lo	L(move_long_backwards_simd)

	/* Align src to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	A_q, [src, 16]
	str	D_q, [dstin]
	ldr	B_q, [src, 32]
	ldr	C_q, [src, 48]
	ldr	D_q, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end_simd)

L(loop64_simd):
	str	A_q, [dst, 16]
	ldr	A_q, [src, 16]
	str	B_q, [dst, 32]
	ldr	B_q, [src, 32]
	str	C_q, [dst, 48]
	ldr	C_q, [src, 48]
	str	D_q, [dst, 64]!
	ldr	D_q, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64_simd)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(copy64_from_end_simd):
	ldr	E_q, [srcend, -64]
	str	A_q, [dst, 16]
	ldr	A_q, [srcend, -48]
	str	B_q, [dst, 32]
	ldr	B_q, [srcend, -32]
	str	C_q, [dst, 48]
	ldr	C_q, [srcend, -16]
	str	D_q, [dst, 64]
	str	E_q, [dstend, -64]
	str	A_q, [dstend, -48]
	str	B_q, [dstend, -32]
	str	C_q, [dstend, -16]

L(copy0_simd):
	ret

	.p2align 4

	/* Move more than 128 bytes where src and dst buffers overlap
	   and dst > src.

	   Align srcend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

L(move_long_backwards_simd):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldr	A_q, [srcend, -16]
	str	D_q, [dstend, -16]
	ldr	B_q, [srcend, -32]
	ldr	C_q, [srcend, -48]
	ldr	D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start_simd)

L(loop64_backwards_simd):
	str	A_q, [dstend, -16]
	ldr	A_q, [srcend, -16]
	str	B_q, [dstend, -32]
	ldr	B_q, [srcend, -32]
	str	C_q, [dstend, -48]
	ldr	C_q, [srcend, -48]
	str	D_q, [dstend, -64]!
	ldr	D_q, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards_simd)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
L(copy64_from_start_simd):
	ldr	G_q, [src, 48]
	str	A_q, [dstend, -16]
	ldr	A_q, [src, 32]
	str	B_q, [dstend, -32]
	ldr	B_q, [src, 16]
	str	C_q, [dstend, -48]
	ldr	C_q, [src]
	str	D_q, [dstend, -64]
	str	G_q, [dstin, 48]
	str	A_q, [dstin, 32]
	str	B_q, [dstin, 16]
	str	C_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
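
/* Note on the dispatch above: the overlap test in L(move_long_simd) is a
   single unsigned compare of (dstin - src) against count.  The following
   C-style sketch is illustrative only and is not part of the routine; it
   just restates the branch structure implemented above, assuming 64-bit
   unsigned pointer arithmetic:

	if (count <= 16)                          // L(copy16_simd)
		small copy;
	else if (count <= 128)                    // fully unrolled
		medium copy;
	else if ((uint64_t)(dstin - src) == 0)    // L(copy0_simd)
		return;                           // src == dst, nothing to do
	else if ((uint64_t)(dstin - src) < count) // L(move_long_backwards_simd)
		backward 64-byte loop;            // dst > src and buffers overlap
	else
		forward 64-byte loop;             // disjoint, or dst < src

   When dst < src the unsigned difference wraps to a value far larger than
   count, so the forward loop is taken, which is the safe direction for
   that kind of overlap.  */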