//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
//

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
// F_l/F_h and tmp1/tmp2 deliberately alias registers that are dead on
// the code paths where they are used.
#define F_l     srcend
#define F_h     dst
#define tmp1    x9
#define tmp2    x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes, which are fully unrolled, and large
// copies of more than 96 bytes, which align the destination and use an
// unrolled loop processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tail-calls memcpy for these cases as
// well as for non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)      // Bit 6 of count - 1 set: 65..96 bytes.
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f             // Bit 5 clear: 17..32 bytes, two pairs suffice.
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret
    .p2align 4
1:
    tbz     count, 2, 1f            // Bit 2 clear: 0..3 bytes.
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:  ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that stores don't cross cache
    // line boundaries. There are at least 96 bytes to copy, so copy 16
    // bytes unaligned and then align. The loop copies 64 bytes per
    // iteration, with its loads running one iteration ahead of its
    // stores.

    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret
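
// Illustrative aid only, kept inside comments so nothing extra is
// assembled: a minimal C sketch of the overlap-tolerant technique used
// by L(copy16) above, assuming 8 <= count <= 16. The name
// memcpy_small16 is hypothetical, and the memcpy() calls merely spell
// out the unaligned 8-byte loads and stores the assembly issues
// directly.
//
//   #include <stdint.h>
//   #include <string.h>
//
//   static void memcpy_small16 (void *dst, const void *src, size_t count)
//   {
//     uint64_t a, b;
//     memcpy (&a, src, 8);                            // first 8 bytes
//     memcpy (&b, (const char *) src + count - 8, 8); // last 8 bytes
//     // Both loads finish before either store, so the two stored
//     // ranges may overlap each other (count < 16) or the source.
//     memcpy (dst, &a, 8);
//     memcpy ((char *) dst + count - 8, &b, 8);
//   }
//
// The medium and copy96 paths scale the same idea up: load everything
// needed from the start and end of the buffer first, then store.
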
//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is large forward copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration, walking backwards from
// the end of the buffers.
//

ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi      // If count > 96, compare dst - src with count; else set C.
    b.hs    __memcpy                // memcpy is safe if count <= 96 or dst - src >= count.

    cbz     tmp2, 3f                // Nothing to do if dst == src.
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that stores don't cross cache
    // line boundaries. There are at least 96 bytes to copy, so copy 16
    // bytes unaligned and then align. The loop copies 64 bytes per
    // iteration, with its loads running one iteration ahead of its
    // stores.

    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop                             // Keep the loop entry below aligned.
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:  ret
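
// Illustrative aid only, kept inside comments so nothing extra is
// assembled: a minimal C sketch of the cmp/ccmp/b.hs dispatch at the
// entry of InternalMemCopyMem. The name can_use_memcpy is hypothetical.
//
//   #include <stdint.h>
//
//   static int can_use_memcpy (uintptr_t dst, uintptr_t src,
//                              uint64_t count)
//   {
//     uint64_t diff = (uint64_t)(dst - src); // wraps to a huge value
//                                            // when dst < src
//     // Small and medium copies read all data before writing, and a
//     // copy advancing through memory is only destructive when dst
//     // lands strictly inside (src, src + count).
//     return count <= 96 || diff >= count;
//   }
//
// Only a large copy with dst inside (src, src + count) falls through to
// the loop above, which copies from the end of the buffers toward the
// start.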