/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Register aliases.

   dstin, src and count are the incoming arguments in x0, x1 and x2.
   dstin (x0) is never written, so it still holds the destination pointer
   on return.

   A..F name 16-byte register pairs used as copy buffers.  The G and H pairs
   reuse count/dst and src/srcend, and tmp1 shares x14 with E_l; each of
   those aliases is only written once the value it overlays is dead.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* Overview.

   The routine relies on unaligned accesses and on overlapping loads/stores
   taken from both ends of the buffer, which keeps the code small and avoids
   per-byte branching.

   Three size classes are handled separately:
     - small:   0..32 bytes
     - medium: 33..128 bytes
     - large:  more than 128 bytes

   The large path first writes 16 bytes at the unaligned start, rounds the
   destination pointer down to a 16-byte boundary, and then runs a software
   pipelined loop that moves 64 bytes per iteration.  The remainder is
   finished by always copying the last 64 bytes of the buffer.

   No overlap check is performed in this file; the large path copies in
   ascending address order.

   In:       x0 = dstin, x1 = src, x2 = count (byte count)
   Out:      x0 = dstin (unchanged)
   Clobbers: x1-x17 and the condition flags
   Stack:    not used  */

	.global	memcpy
	.type	memcpy, %function
memcpy:
	add	srcend, src, count		/* One past the last source byte.  */
	add	dstend, dstin, count		/* One past the last destination byte.  */
	cmp	count, #128
	b.hi	.Llarge
	cmp	count, #32
	b.hi	.Lmedium

	/* Small: 0..32 bytes.  With 16..32 bytes, one 16-byte pair from each
	   end covers the whole range (the two pairs may overlap).  */
	cmp	count, #16
	b.lo	.Lsmall_under16
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, #-16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, #-16]
	ret

	/* count < 16.  Bit 3 set means 8..15 bytes: one 8-byte word from
	   each end.  */
.Lsmall_under16:
	tbz	count, #3, .Lsmall_under8
	ldr	A_l, [src]
	ldr	A_h, [srcend, #-8]
	str	A_l, [dstin]
	str	A_h, [dstend, #-8]
	ret

	.p2align 3
	/* count < 8.  Bit 2 set means 4..7 bytes: one 4-byte word from
	   each end.  */
.Lsmall_under8:
	tbz	count, #2, .Lsmall_under4
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, #-4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, #-4]
	ret

	/* count < 4.  For 1..3 bytes copy the first byte, the byte at index
	   count / 2 and the last byte; these coincide as needed, so no
	   further branching on the length is required.  */
.Lsmall_under4:
	cbz	count, .Lreturn
	lsr	tmp1, count, #1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, #-1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, #-1]
.Lreturn:
	ret

	.p2align 4
	/* Medium: 33..128 bytes.  Always fetch 32 bytes from the start and
	   32 bytes from the end; that alone is enough for up to 64 bytes.  */
.Lmedium:
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [srcend, #-32]
	ldp	D_l, D_h, [srcend, #-16]
	cmp	count, #64
	b.hi	.Lmedium_over64
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, #16]
	stp	C_l, C_h, [dstend, #-32]
	stp	D_l, D_h, [dstend, #-16]
	ret

	.p2align 4
	/* 65..128 bytes.  Fetch 32 more bytes from the start.  Above 96
	   bytes, also move the 32 bytes at dstend - 64 first; the G/H pairs
	   overwrite count, dst, src and srcend, none of which is read
	   afterwards on this path.  */
.Lmedium_over64:
	ldp	E_l, E_h, [src, #32]
	ldp	F_l, F_h, [src, #48]
	cmp	count, #96
	b.ls	.Lmedium_finish
	ldp	G_l, G_h, [srcend, #-64]
	ldp	H_l, H_h, [srcend, #-48]
	stp	G_l, G_h, [dstend, #-64]
	stp	H_l, H_h, [dstend, #-48]
.Lmedium_finish:
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, #16]
	stp	E_l, E_h, [dstin, #32]
	stp	F_l, F_h, [dstin, #48]
	stp	C_l, C_h, [dstend, #-32]
	stp	D_l, D_h, [dstend, #-16]
	ret

	.p2align 4
	/* Large: more than 128 bytes.  */
.Llarge:

	/* Copy the first 16 bytes through D, then round dst down to a
	   16-byte boundary.  src is moved back and count is grown by the
	   same misalignment so that both pointers stay in step.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, #15
	bic	dst, dstin, #15
	sub	src, src, tmp1
	add	count, count, tmp1		/* count is now 16 too large.  */
	ldp	A_l, A_h, [src, #16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, #32]
	ldp	C_l, C_h, [src, #48]
	ldp	D_l, D_h, [src, #64]!		/* Pre-index: src advances by 64.  */
	/* Remove the 16-byte excess plus 128: 64 bytes are pending in A..D
	   and the last 64 bytes are handled by the tail.  */
	subs	count, count, #(128 + 16)
	b.ls	.Llarge_tail

	/* Each iteration stores the 64 bytes loaded previously and loads the
	   next 64; both pointers advance through pre-index writeback on the
	   final pair.  */
.Llarge_loop:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.hi	.Llarge_loop

	/* Flush the 64 bytes still held in A..D, interleaved with loading the
	   final 64 source bytes, then store those relative to dstend.  E_l
	   reuses x14; tmp1 is no longer needed at this point.  */
.Llarge_tail:
	ldp	E_l, E_h, [srcend, #-64]
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [srcend, #-48]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [srcend, #-32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [srcend, #-16]
	stp	D_l, D_h, [dst, #64]
	stp	E_l, E_h, [dstend, #-64]
	stp	A_l, A_h, [dstend, #-48]
	stp	B_l, B_h, [dstend, #-32]
	stp	C_l, C_h, [dstend, #-16]
	ret

	.size	memcpy, .-memcpy