//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
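
// The routines below set 0..15 bytes with overlapping scalar stores
// from both ends of the buffer, 16..96 bytes with overlapping SIMD
// stores, and longer runs with an unrolled 64-byte store loop that
// switches to the DC ZVA cache-zeroing instruction when zeroing 256
// bytes or more.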

// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses
//
//

#define dstin     x0
#define count     x1
#define val       x2
#define valw      w2
#define dst       x3
#define dstend    x4
#define tmp1      x5
#define tmp1w     w5
#define tmp2      x6
#define tmp2w     w6
#define zva_len   x7
#define zva_lenw  w7

#define L(l) .L ## l

ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
ASM_PFX(InternalMemSetMem16):
    dup     v0.8H, valw
    lsl     count, count, #1
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
ASM_PFX(InternalMemSetMem32):
    dup     v0.4S, valw
    lsl     count, count, #2
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
ASM_PFX(InternalMemSetMem64):
    dup     v0.2D, val
    lsl     count, count, #3
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
ASM_PFX(InternalMemZeroMem):
    movi    v0.16B, #0
    b       0f

ASM_GLOBAL ASM_PFX(InternalMemSetMem)
ASM_PFX(InternalMemSetMem):
    dup     v0.16B, valw
0:  add     dstend, dstin, count      // Common path: count is now in bytes.
    mov     val, v0.D[0]

    cmp     count, 96
    b.hi    L(set_long)
    cmp     count, 16
    b.hs    L(set_medium)

    // Set 0..15 bytes.
    tbz     count, 3, 1f
    str     val, [dstin]
    str     val, [dstend, -8]
    ret
    nop
1:  tbz     count, 2, 2f
    str     valw, [dstin]
    str     valw, [dstend, -4]
    ret
2:  cbz     count, 3f
    strb    valw, [dstin]
    tbz     count, 1, 3f
    strh    valw, [dstend, -2]
3:  ret

    // Set 16..96 bytes.
L(set_medium):
    str     q0, [dstin]
    tbnz    count, 6, L(set96)
    str     q0, [dstend, -16]
    tbz     count, 5, 1f
    str     q0, [dstin, 16]
    str     q0, [dstend, -32]
1:  ret

    .p2align 4
    // Set 64..96 bytes.  Write 64 bytes from the start and
    // 32 bytes from the end.
L(set96):
    str     q0, [dstin, 16]
    stp     q0, q0, [dstin, 32]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
    nop
L(set_long):
    bic     dst, dstin, 15
    str     q0, [dstin]
    cmp     count, 256
    ccmp    val, 0, 0, cs             // Zeroing and count >= 256?
    b.eq    L(try_zva)
L(no_zva):
    sub     count, dstend, dst        // Count is 16 too large.
    add     dst, dst, 16
    sub     count, count, 64 + 16     // Adjust count and bias for loop.
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
L(tail64):
    subs    count, count, 64
    b.hi    1b
2:  stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(try_zva):
    mrs     tmp1, dczid_el0
    tbnz    tmp1w, 4, L(no_zva)       // DZP set: DC ZVA is prohibited.
    and     tmp1w, tmp1w, 15
    cmp     tmp1w, 4                  // ZVA size is 64 bytes.
    b.ne    L(zva_128)

    // Write the first and last 64 byte aligned block using stp rather
    // than using DC ZVA.  This is faster on some cores.
L(zva_64):
    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    bic     dst, dst, 63
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+64+64   // Adjust count and bias for loop.
    add     dst, dst, 128
    nop
1:  dc      zva, dst
    add     dst, dst, 64
    subs    count, count, 64
    b.hi    1b
    stp     q0, q0, [dst, 0]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret

    .p2align 3
L(zva_128):
    cmp     tmp1w, 5                  // ZVA size is 128 bytes.
    b.ne    L(zva_other)

    str     q0, [dst, 16]
    stp     q0, q0, [dst, 32]
    stp     q0, q0, [dst, 64]
    stp     q0, q0, [dst, 96]
    bic     dst, dst, 127
    sub     count, dstend, dst        // Count is now 128 too large.
    sub     count, count, 128+128     // Adjust count and bias for loop.
    add     dst, dst, 128
1:  dc      zva, dst
    add     dst, dst, 128
    subs    count, count, 128
    b.hi    1b
    stp     q0, q0, [dstend, -128]
    stp     q0, q0, [dstend, -96]
    stp     q0, q0, [dstend, -64]
    stp     q0, q0, [dstend, -32]
    ret
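
// The generic path below derives the ZVA block size from DCZID_EL0:
// bits [3:0] (BS) give log2 of the block size in 4-byte words, so the
// size in bytes is 4 << BS, and bit [4] (DZP), tested at L(try_zva)
// above, prohibits DC ZVA when set.  A rough C sketch of the decode
// (the function name is illustrative, and reading the register from C
// would need a system-register intrinsic, replaced here by a plain
// parameter):
//
//   unsigned long
//   ZvaBlockBytes (unsigned long Dczid)
//   {
//     if (Dczid & (1UL << 4)) {    // DZP: DC ZVA is prohibited.
//       return 0;
//     }
//     return 4UL << (Dczid & 15);  // BS: log2(block size in words).
//   }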
L(zva_other):
    mov     tmp2w, 4
    lsl     zva_lenw, tmp2w, tmp1w    // ZVA size in bytes is 4 << BS.
    add     tmp1, zva_len, 64         // Max alignment bytes written.
    cmp     count, tmp1
    blo     L(no_zva)

    sub     tmp2, zva_len, 1
    add     tmp1, dst, zva_len
    add     dst, dst, 16
    subs    count, tmp1, dst          // Actual alignment bytes to write.
    bic     tmp1, tmp1, tmp2          // Aligned dc zva start address.
    beq     2f
1:  stp     q0, q0, [dst], 64
    stp     q0, q0, [dst, -32]
    subs    count, count, 64
    b.hi    1b
2:  mov     dst, tmp1
    sub     count, dstend, tmp1       // Remaining bytes to write.
    subs    count, count, zva_len
    b.lo    4f
3:  dc      zva, dst
    add     dst, dst, zva_len
    subs    count, count, zva_len
    b.hs    3b
4:  add     count, count, zva_len
    b       L(tail64)
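
// For reference, the register convention implemented above, inferred
// from the code itself rather than quoted from a header: x0 holds the
// destination, x1 the element count, and x2 the fill value; x0 is
// never clobbered, so the destination pointer survives to the return.
// A hypothetical C-level view of one entry point (parameter names and
// types are illustrative):
//
//   void *
//   InternalMemSetMem16 (void *Buffer, unsigned long Count, unsigned short Value);
//
// Each sized entry scales Count to a byte count (lsl #1/#2/#3) before
// joining the common byte-oriented path at local label 0.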