/*
 * Copyright (c) 2023 Institute of Parallel And Distributed Systems (IPADS), Shanghai Jiao Tong University (SJTU)
 * Licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include <common/asm.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define A_hw    w7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     src
#define E_h     count
#define F_l     srcend
#define F_h     dst
#define tmp1    x9

#define L(l) .L ## l

/* stp1/str1/strb1 wrap the stores to user memory. Every store site gets an
 * ASM_EXTABLE_64 entry, so a fault on the user address is fixed up at label
 * 97 below, which makes the function return -1. */
        .macro stp1 reg1, reg2, ptr, val
88:
        sttr \reg1, [\ptr, \val];
89:
        sttr \reg2, [\ptr, \val + 8];

        ASM_EXTABLE_64(88b, 97f);
        ASM_EXTABLE_64(89b, 97f);
        .endm

        .macro str1 reg, ptr, val
88:
        sttr \reg, [\ptr, \val];

        ASM_EXTABLE_64(88b, 97f);
        .endm

/* Should sttrb be used here? It cannot: unlike strb, sttrb only accepts an
   immediate offset, and this macro is also called with a register offset
   (tmp1). */
        .macro strb1 reg, ptr, val
88:
        strb \reg, [\ptr, \val];

        ASM_EXTABLE_64(88b, 97f);
        .endm

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap; the remark that memmove tailcalls memcpy for these
   cases is inherited from the generic memcpy this routine is based on.
*/

BEGIN_FUNC(__copy_to_user)
        prfm PLDL1KEEP, [src]
        add srcend, src, count
        add dstend, dstin, count
        cmp count, 16
        b.ls L(copy16)
        cmp count, 96
        b.hi L(copy_long)

        /* Medium copies: 17..96 bytes. */
        sub tmp1, count, 1
        ldp A_l, A_h, [src]
        tbnz tmp1, 6, L(copy96)
        ldp D_l, D_h, [srcend, -16]
        tbz tmp1, 5, 1f
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [srcend, -32]
        stp1 B_l, B_h, dstin, 16
        stp1 C_l, C_h, dstend, -32
1:
        stp1 A_l, A_h, dstin, 0
        stp1 D_l, D_h, dstend, -16
        mov x0, #0
        ret

        .p2align 4
        /* Small copies: 0..16 bytes. */
L(copy16):
        cmp count, 8
        b.lo 1f
        ldr A_l, [src]
        ldr A_h, [srcend, -8]
        str1 A_l, dstin, 0
        str1 A_h, dstend, -8
        mov x0, #0
        ret
        .p2align 4
1:
        tbz count, 2, 1f
        ldr A_lw, [src]
        ldr A_hw, [srcend, -4]
        str1 A_lw, dstin, 0
        str1 A_hw, dstend, -4
        mov x0, #0
        ret

        /* Copy 0..3 bytes. Use a branchless sequence that copies the same
           byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
        cbz count, 2f
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb A_hw, [srcend, -1]
        ldrb B_lw, [src, tmp1]
        strb1 A_lw, dstin, 0
        strb1 B_lw, dstin, tmp1
        strb1 A_hw, dstend, -1
2:
        mov x0, #0
        ret

        .p2align 4
        /* Copy 64..96 bytes. Copy 64 bytes from the start and
           32 bytes from the end. */
L(copy96):
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [src, 32]
        ldp D_l, D_h, [src, 48]
        ldp E_l, E_h, [srcend, -32]
        ldp F_l, F_h, [srcend, -16]
        stp1 A_l, A_h, dstin, 0
        stp1 B_l, B_h, dstin, 16
        stp1 C_l, C_h, dstin, 32
        stp1 D_l, D_h, dstin, 48
        stp1 E_l, E_h, dstend, -32
        stp1 F_l, F_h, dstend, -16
        mov x0, #0
        ret

        /* Align DST to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores. There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align. The loop
           copies 64 bytes per iteration and prefetches one iteration ahead. */

        .p2align 4
L(copy_long):
        and tmp1, dstin, 15
        bic dst, dstin, 15
        ldp D_l, D_h, [src]
        sub src, src, tmp1
        add count, count, tmp1 /* Count is now 16 too large. */
        ldp A_l, A_h, [src, 16]
        stp1 D_l, D_h, dstin, 0
        ldp B_l, B_h, [src, 32]
        ldp C_l, C_h, [src, 48]
        ldp D_l, D_h, [src, 64]!
        subs count, count, 128 + 16 /* Test and readjust count. */
        b.ls 2f
1:
        stp1 A_l, A_h, dst, 16
        ldp A_l, A_h, [src, 16]
        stp1 B_l, B_h, dst, 32
        ldp B_l, B_h, [src, 32]
        stp1 C_l, C_h, dst, 48
        ldp C_l, C_h, [src, 48]
        add dst, dst, 64
        stp1 D_l, D_h, dst, 0
        ldp D_l, D_h, [src, 64]!
        subs count, count, 64
        b.hi 1b

        /* Write the last full set of 64 bytes. The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left. */
2:
        ldp E_l, E_h, [srcend, -64]
        stp1 A_l, A_h, dst, 16
        ldp A_l, A_h, [srcend, -48]
        stp1 B_l, B_h, dst, 32
        ldp B_l, B_h, [srcend, -32]
        stp1 C_l, C_h, dst, 48
        ldp C_l, C_h, [srcend, -16]
        stp1 D_l, D_h, dst, 64
        stp1 E_l, E_h, dstend, -64
        stp1 A_l, A_h, dstend, -48
        stp1 B_l, B_h, dstend, -32
        stp1 C_l, C_h, dstend, -16
        mov x0, #0
        ret

97:
        /* Fixup target registered via ASM_EXTABLE_64: a user store faulted. */
        mov x0, #-1
        ret
END_FUNC(__copy_to_user)
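
/* Caller-side usage sketch (an illustration, not part of the original file):
 * the routine returns 0 on success and -1 if any of the unprivileged user
 * stores faulted. The C prototype below and the names user_buf, kernel_buf,
 * len, and the -EFAULT mapping are hypothetical, shown only to illustrate the
 * return-value convention; the surrounding kernel's actual declarations may
 * differ.
 *
 *     extern long __copy_to_user(void *user_dst, const void *kernel_src,
 *                                unsigned long count);
 *
 *     if (__copy_to_user(user_buf, kernel_buf, len) != 0)
 *             return -EFAULT;   // hypothetical caller-side error handling
 */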