/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, or the regions do not overlap, tail-call memcpy;
 * otherwise copy in reverse order (high address to low) so that the
 * overlapping source bytes are read before they are overwritten.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

/* Eight data registers: enough to hold one 64-byte cache line in flight. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

SYM_FUNC_START_ALIAS(__memmove)
SYM_FUNC_START_WEAK_PI(memmove)
	cmp	dstin, src
	b.lo	__memcpy		/* dest below src: forward copy is safe. */
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap. */

	/*
	 * Overlapping, dest > src: copy backwards.  Point both pointers one
	 * past the end of their buffers; all loads/stores below use negative
	 * pre-index writeback, so src/dst always sit just above the next
	 * byte to be copied.
	 */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15		/* probably non-alignment accesses. */

	ands	tmp2, src, #15		/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy tmp2 (1..15) leading bytes so that src becomes 16-byte
	 * aligned.  The cost of these extra instructions is acceptable,
	 * and it makes all subsequent accesses aligned.  Each bit of tmp2
	 * selects one power-of-two-sized transfer.
	 */
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  Dispatch on count[5:4]:
	 * 0x30 falls through all three ldp/stp pairs, 0x20 skips one,
	 * 0x10 skips two, 0x00 skips straight to the sub-16-byte tail.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	/* Copy the final 0..15 bytes; each count bit selects one transfer. */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret				/* x0 (dstin) still holds dest. */

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f		/* Any remainder below 64 bytes? */
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* pre-load 64 bytes data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes, software-pipelining the loop so
	 * loads and stores can overlap.  count was biased by -128 above, so
	 * b.ge repeats while at least 64 bytes remain after this iteration.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the pipeline: store the final pre-loaded 64 bytes. */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f		/* Any remainder below 64 bytes? */
	b.ne	.Ltail63
	ret
SYM_FUNC_END_PI(memmove)
EXPORT_SYMBOL(memmove)
SYM_FUNC_END_ALIAS(__memmove)
EXPORT_SYMBOL(__memmove)