/*
 * Copyright (c) 2023 Institute of Parallel And Distributed Systems (IPADS), Shanghai Jiao Tong University (SJTU)
 * Licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 * http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include <common/asm.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     src
#define E_h     count
#define F_l     srcend
#define F_h     dst
#define tmp1    x9

#define L(l) .L ## l

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, so memmove can tailcall memcpy for these cases as
   well as for non-overlapping copies.
*/

BEGIN_FUNC(memcpy)
        prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 16
        b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)

        /* Medium copies: 17..96 bytes. */
        sub     tmp1, count, 1
        ldp     A_l, A_h, [src]
        tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
        tbz     tmp1, 5, 1f
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Small copies: 0..16 bytes. */
L(copy16):
        cmp     count, 8
        b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
        .p2align 4
1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
        ldr     A_hw, [srcend, -4]
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret

        /* Copy 0..3 bytes. Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
        cbz     count, 2f
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    A_hw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    A_hw, [dstend, -1]
2:      ret
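        /* For reference, a rough C equivalent of the branchless 0..3 byte
           copy above (dst, src and mid are illustrative names, not the
           registers used here):

               if (count != 0) {
                       size_t mid = count >> 1;         // 0 if count==1, else 1
                       dst[0]         = src[0];
                       dst[mid]       = src[mid];       // rewrites byte 0 when count==1
                       dst[count - 1] = src[count - 1]; // rewrites byte 1 when count==2
               }

           Every byte in [0, count) is written at least once, without any
           further branching on the exact value of count. */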
        .p2align 4
        /* Copy 64..96 bytes. Copy 64 bytes from the start and
           32 bytes from the end. */
L(copy96):
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [src, 32]
        ldp     D_l, D_h, [src, 48]
        ldp     E_l, E_h, [srcend, -32]
        ldp     F_l, F_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin, 32]
        stp     D_l, D_h, [dstin, 48]
        stp     E_l, E_h, [dstend, -32]
        stp     F_l, F_h, [dstend, -16]
        ret

        /* Align DST to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores. There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align. The loop
           copies 64 bytes per iteration and prefetches one iteration ahead. */

        .p2align 4
L(copy_long):
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large. */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count. */
        b.ls    2f
1:
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    1b

        /* Write the last full set of 64 bytes. The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left. */
2:
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret
END_FUNC(memcpy)
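
/* For reference, the size-class dispatch above corresponds roughly to the
   following C outline (the helper names are illustrative only; the real
   code branches between labels rather than calling helpers).  The routine
   keeps the usual memcpy contract: x0 = dest, x1 = src, x2 = count, and
   x0 is never clobbered, so dest is returned unchanged.

       void *memcpy(void *dest, const void *src, size_t count)
       {
               if (count <= 16)
                       copy_small(dest, src, count);   // L(copy16), reads before writes
               else if (count <= 96)
                       copy_medium(dest, src, count);  // fully unrolled, reads before writes
               else
                       copy_large(dest, src, count);   // L(copy_long), aligns dest, 64 bytes/iter
               return dest;
       }
*/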