/*
 * Copyright (c) 2023 Institute of Parallel And Distributed Systems (IPADS), Shanghai Jiao Tong University (SJTU)
 * Licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include <common/asm.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     src
#define E_h     count
#define F_l     dst
#define F_h     srcend
#define tmp1    x9

#define L(l) .L ## l

        .macro ldp1 reg1, reg2, ptr, val
88:
        ldtr \reg1, [\ptr, \val];
89:
        ldtr \reg2, [\ptr, \val + 8];

        ASM_EXTABLE_64(88b, 97f);
        ASM_EXTABLE_64(89b, 97f);
        .endm

        .macro ldr1 reg, ptr, val
88:
        ldtr \reg, [\ptr, \val];

        ASM_EXTABLE_64(88b, 97f);
        .endm

/* ldtrb would match the other macros, but its offset operand can only be an
   immediate, while this macro is also invoked with a register offset (tmp1),
   so plain ldrb is used instead. */
        .macro ldrb1 reg, ptr, val
88:
        ldrb \reg, [\ptr, \val];

        ASM_EXTABLE_64(88b, 97f);
        .endm
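
/* Fault handling: every read from user memory goes through the ldp1/ldr1/ldrb1
 * macros above.  Each load is registered in the exception table via
 * ASM_EXTABLE_64 and paired with the fixup label 97 at the end of this
 * function, so a faulting access is redirected there and the routine returns
 * -1; on success it returns 0.  Stores only target the kernel destination
 * buffer and therefore need no fixup entries.
 *
 * Arguments: x0 = kernel destination (dstin), x1 = user source (src),
 * x2 = byte count (count).
 */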

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled. Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

BEGIN_FUNC(__copy_from_user)
        prfm PLDL1KEEP, [src]
        add srcend, src, count
        add dstend, dstin, count
        cmp count, 16
        b.ls L(copy16)
        cmp count, 96
        b.hi L(copy_long)

        /* Medium copies: 17..96 bytes. */
        sub tmp1, count, 1
        ldp1 A_l, A_h, src, 0
        tbnz tmp1, 6, L(copy96)
        ldp1 D_l, D_h, srcend, -16
        tbz tmp1, 5, 1f
        ldp1 B_l, B_h, src, 16
        ldp1 C_l, C_h, srcend, -32
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstend, -32]
1:
        stp A_l, A_h, [dstin]
        stp D_l, D_h, [dstend, -16]
        mov x0, #0
        ret

        .p2align 4
        /* Small copies: 0..16 bytes. */
L(copy16):
        cmp count, 8
        b.lo 1f
        ldr1 A_l, src, 0
        ldr1 A_h, srcend, -8
        str A_l, [dstin]
        str A_h, [dstend, -8]
        mov x0, #0
        ret
        .p2align 4
1:
        tbz count, 2, 1f
        ldr1 A_lw, src, 0
        ldr1 A_hw, srcend, -4
        str A_lw, [dstin]
        str A_hw, [dstend, -4]
        mov x0, #0
        ret

        /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
        cbz count, 2f
        lsr tmp1, count, 1
        ldrb1 A_lw, src, 0
        ldrb1 A_hw, srcend, -1
        ldrb1 B_lw, src, tmp1
        strb A_lw, [dstin]
        strb B_lw, [dstin, tmp1]
        strb A_hw, [dstend, -1]
2:
        mov x0, #0
        ret

        .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end. */
L(copy96):
        ldp1 B_l, B_h, src, 16
        ldp1 C_l, C_h, src, 32
        ldp1 D_l, D_h, src, 48
        ldp1 E_l, E_h, srcend, -32
        ldp1 F_l, F_h, srcend, -16
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstin, 32]
        stp D_l, D_h, [dstin, 48]
        stp E_l, E_h, [dstend, -32]
        stp F_l, F_h, [dstend, -16]
        mov x0, #0
        ret

        /* Align DST to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead. */

        .p2align 4
L(copy_long):
        and tmp1, dstin, 15
        bic dst, dstin, 15
        ldp1 D_l, D_h, src, 0
        sub src, src, tmp1
        add count, count, tmp1          /* Count is now 16 too large. */
        ldp1 A_l, A_h, src, 16
        stp D_l, D_h, [dstin]
        ldp1 B_l, B_h, src, 32
        ldp1 C_l, C_h, src, 48
        add src, src, 64
        ldp1 D_l, D_h, src, 0
        subs count, count, 128 + 16     /* Test and readjust count. */
        b.ls 2f
1:
        stp A_l, A_h, [dst, 16]
        ldp1 A_l, A_h, src, 16
        stp B_l, B_h, [dst, 32]
        ldp1 B_l, B_h, src, 32
        stp C_l, C_h, [dst, 48]
        ldp1 C_l, C_h, src, 48
        stp D_l, D_h, [dst, 64]!
        add src, src, 64
        ldp1 D_l, D_h, src, 0
        subs count, count, 64
        b.hi 1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left. */
2:
        ldp1 E_l, E_h, srcend, -64
        stp A_l, A_h, [dst, 16]
        ldp1 A_l, A_h, srcend, -48
        stp B_l, B_h, [dst, 32]
        ldp1 B_l, B_h, srcend, -32
        stp C_l, C_h, [dst, 48]
        ldp1 C_l, C_h, srcend, -16
        stp D_l, D_h, [dst, 64]
        stp E_l, E_h, [dstend, -64]
        stp A_l, A_h, [dstend, -48]
        stp B_l, B_h, [dstend, -32]
        stp C_l, C_h, [dstend, -16]
        mov x0, #0
        ret

97:
        mov x0, #-1
        ret
END_FUNC(__copy_from_user)
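
/* Caller-side sketch.  The C prototype and error handling below are
 * illustrative assumptions, not taken from the kernel headers; only the
 * register/return contract (x0/x1/x2 in, 0 or -1 out) comes from the code
 * above.
 *
 *     int __copy_from_user(void *kernel_dst, const void *user_src, size_t len);
 *
 *     if (__copy_from_user(kbuf, ubuf, len) != 0)
 *             return -1;   // a load from user memory faulted
 */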