/*
 * memset - fill memory with a constant byte
 *
 * Copyright (c) 2012, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define val	x1
#define valw	w1
#define count	x2
#define dst	x3
#define dstend	x4
#define tmp1	x5
#define tmp1w	w5
#define tmp2	x6
#define tmp2w	w6
#define zva_len	x7
#define zva_lenw	w7

ENTRY (__memset_aarch64)

	dup	v0.16B, valw
	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)
	mov	val, v0.D[0]

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret
	nop
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	/* Set 17..96 bytes.  */
L(set_medium):
	str	q0, [dstin]
	tbnz	count, 6, L(set96)
	str	q0, [dstend, -16]
	tbz	count, 5, 1f
	str	q0, [dstin, 16]
	str	q0, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	str	q0, [dstin, 16]
	stp	q0, q0, [dstin, 32]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
	nop
L(set_long):
	and	valw, valw, 255
	bic	dst, dstin, 15
	str	q0, [dstin]
	cmp	count, 256
	ccmp	valw, 0, 0, cs
	b.eq	L(try_zva)
L(no_zva):
	sub	count, dstend, dst	/* Count is 16 too large.  */
	add	dst, dst, 16
	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
L(tail64):
	subs	count, count, 64
	b.hi	1b
2:	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(try_zva):
	mrs	tmp1, dczid_el0
	tbnz	tmp1w, 4, L(no_zva)
	and	tmp1w, tmp1w, 15
	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
	b.ne	L(zva_128)

	/* Write the first and last 64 byte aligned block using stp rather
	   than using DC ZVA.  This is faster on some cores.
	 */
L(zva_64):
	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	bic	dst, dst, 63
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
	nop
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	1b
	stp	q0, q0, [dst, 0]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

	.p2align 3
L(zva_128):
	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
	b.ne	L(zva_other)

	str	q0, [dst, 16]
	stp	q0, q0, [dst, 32]
	stp	q0, q0, [dst, 64]
	stp	q0, q0, [dst, 96]
	bic	dst, dst, 127
	sub	count, dstend, dst	/* Count is now 128 too large.  */
	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
	add	dst, dst, 128
1:	dc	zva, dst
	add	dst, dst, 128
	subs	count, count, 128
	b.hi	1b
	stp	q0, q0, [dstend, -128]
	stp	q0, q0, [dstend, -96]
	stp	q0, q0, [dstend, -64]
	stp	q0, q0, [dstend, -32]
	ret

L(zva_other):
	mov	tmp2w, 4
	lsl	zva_lenw, tmp2w, tmp1w
	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
	cmp	count, tmp1
	blo	L(no_zva)

	sub	tmp2, zva_len, 1
	add	tmp1, dst, zva_len
	add	dst, dst, 16
	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
	beq	2f
1:	stp	q0, q0, [dst], 64
	stp	q0, q0, [dst, -32]
	subs	count, count, 64
	b.hi	1b
2:	mov	dst, tmp1
	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
	subs	count, count, zva_len
	b.lo	4f
3:	dc	zva, dst
	add	dst, dst, zva_len
	subs	count, count, zva_len
	b.hs	3b
4:	add	count, count, zva_len
	b	L(tail64)

END (__memset_aarch64)
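
/* Illustrative note (not part of the routine above): the 0..15 byte path
   avoids a byte-by-byte loop by testing individual bits of count and issuing
   a pair of possibly overlapping stores from both ends of the buffer.  A
   rough C sketch of that idea follows; the name set_small and the use of
   memcpy for the unaligned stores are illustrative choices, not code taken
   from this file.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void set_small (unsigned char *dst, unsigned char c, size_t count)
   {
       uint64_t v8 = 0x0101010101010101ULL * c;   // byte replicated 8 times
       uint32_t v4 = (uint32_t) v8;
       uint16_t v2 = (uint16_t) v8;

       if (count & 8) {               // 8..15: two overlapping 8-byte stores
           memcpy (dst, &v8, 8);
           memcpy (dst + count - 8, &v8, 8);
           return;
       }
       if (count & 4) {               // 4..7: two overlapping 4-byte stores
           memcpy (dst, &v4, 4);
           memcpy (dst + count - 4, &v4, 4);
           return;
       }
       if (count == 0)
           return;
       dst[0] = c;                    // 1..3: one byte, plus maybe 2 more
       if (count & 2)
           memcpy (dst + count - 2, &v2, 2);
   }

   Writing from both ends lets two fixed-size stores cover every length in a
   range without computing an exact tail, at the cost of rewriting a few
   bytes in the middle.  */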