/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest is below src or the buffers do not overlap, tail-call __memcpy;
 * otherwise copy in reverse order, from the end of the buffers downwards.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	.weak memmove
ENTRY(__memmove)
ENTRY(memmove)
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap. */

	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15	/* Short copy: probably unaligned accesses. */

	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the odd trailing bytes first so that src becomes 16-byte
	 * aligned. The cost of these extra instructions is acceptable,
	 * and all subsequent accesses then use aligned addresses.
	 */
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte block with the store of
	 * the previously loaded 64 bytes of data.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPIPROC(memmove)
ENDPROC(__memmove)
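
/*
 * For reference, a minimal C-level sketch of the policy implemented above
 * (the name memmove_sketch and its byte-at-a-time loops are illustrative
 * only, not kernel code): copy forwards when dest is below src or the
 * buffers do not overlap, otherwise copy backwards from the end so that
 * overlapping source bytes are read before they are overwritten.
 *
 *	#include <stddef.h>
 *
 *	static void *memmove_sketch(void *dest, const void *src, size_t n)
 *	{
 *		unsigned char *d = dest;
 *		const unsigned char *s = src;
 *
 *		if (d < s || d >= s + n) {
 *			while (n--)		// forward copy is safe
 *				*d++ = *s++;
 *		} else {
 *			d += n;			// overlap: copy backwards
 *			s += n;
 *			while (n--)
 *				*--d = *--s;
 *		}
 *		return dest;
 *	}
 */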