/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * SWHI / SWLO select the partial-word store (swl/swr) that writes the
 * high-address / low-address part of a word for the current endianness.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#endif

#if defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * DBG prefixes the parameter-check instructions.  With NDEBUG defined it
 * expands to "#", turning the rest of the line into an assembler comment.
 */
#ifdef NDEBUG
#define DBG #
#else
#define DBG
#endif

/*
 * void _memset16(uint16_t* dst, uint16_t value, size_t size);
 *
 * In:   a0 = dst   (must be halfword aligned)
 *       a1 = value (16-bit fill pattern)
 *       a2 = size  (byte count, must be even)
 * Uses: t0, t1, t3 here, plus whatever the shared .Laligned code in
 *       memset uses when 4 or more bytes remain after alignment.
 *
 * The halfword is replicated into both halves of a1, at most one leading
 * halfword is stored to reach word alignment, and the rest is handed to
 * the word-fill code at .Laligned (inside memset below).
 */

LEAF(_memset16,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,1		# a0 must be halfword aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,1		# a2 must be even
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is even */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,1
#else
	ori	a2,1
	xori	a2,1
#endif
#endif

	/* replicate the halfword into both halves of a1 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1,a1,16,16
#else
	andi	a1,0xffff
	sll	t3,a1,16
	or	a1,t3
#endif

	beqz	a2,.Ldone
	 andi	t1,a0,2
	beqz	t1,.Lalignok
	 addu	t0,a0,a2	# t0 is the "past the end" address
	sh	a1,0(a0)	# store one halfword to get aligned
	addu	a0,2
	subu	a2,2
.Lalignok:
	slti	t1,a2,4		# .Laligned for 4 or more bytes
	beqz	t1,.Laligned
	/*
	 * A single real instruction is used in the delay slot: the "sne"
	 * macro expands to two instructions, which does not fit in one slot.
	 * The branch below only needs "zero / non-zero", so the xor result
	 * is sufficient.
	 */
	 xori	t1,a2,2		# t1 == 0 only when one more halfword remains
	bnez	t1,.Ldone
	 nop
	sh	a1,0(a0)
.Ldone:
	j	ra
	 nop
	.set reorder
END(_memset16)

/*
 * void _memset32(uint32_t* dst, uint32_t value, size_t size);
 *
 * In:   a0 = dst   (must be word aligned)
 *       a1 = value (32-bit fill pattern, used as is)
 *       a2 = size  (byte count, must be a multiple of 4)
 *
 * Computes the "past the end" address in t0 and, when there is work to
 * do, continues in the word-fill code at .Laligned (inside memset below).
 */

LEAF(_memset32,0)
	.set noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,3		# a0 must be word aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,3		# a2 must be a multiple of 4 bytes
DBG	tne	t2,zero

#ifdef FIXARGS
	/* ensure count is a multiple of 4 */
#if (__mips==32) && (__mips_isa_rev>=2)
	/*
	 * Registers are spelled without a "$" prefix everywhere else in this
	 * file (compare the matching line in _memset16), so the same
	 * spelling is used here.
	 */
	ins	a2,zero,0,2
#else
	ori	a2,3
	xori	a2,3
#endif
#endif

	bnez	a2,.Laligned	# any work to do?
	 addu	t0,a0,a2	# t0 is the "past the end" address

	j	ra
	 nop
	.set reorder
END(_memset32)

/*
 * memset(dst, value, size)
 *
 * In:   a0 = dst, a1 = fill byte, a2 = byte count
 * Out:  v0 = dst (set on this entry path only; _memset16 and _memset32
 *       branch into .Laligned without setting v0)
 * Uses: AT, v1, a3, t0, t4-t8
 *
 * Register roles from .Laligned onwards:
 *       a0 = current (word-aligned) destination
 *       a1 = fill pattern replicated across the whole word
 *       a2 = bytes still to fill, counted from a0
 *       t0 = "past the end" address of the whole region
 */

LEAF(memset,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2	# t0 is the "past the end" address
	slti	AT,a2,4		# is a2 less than 4?
	bne	AT,zero,.Llast4	# if yes, go to last4
	 move	v0,a0		# memset returns the dst pointer

	beq	a1,zero,.Lset0	# a zero fill needs no smearing
	 subu	v1,zero,a0

	/* smear byte into 32 bit word */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1, a1, 8, 8	# Replicate fill byte into half-word.
	ins	a1, a1, 16, 16	# Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3	# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	 subu	a2,a2,v1
	SWHI	a1,0(a0)	# partial store of the 1-3 leading bytes
	addu	a0,a0,v1

/* Here we have the "word-aligned" a0 (until the "last4") */
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

/*
 * Find out, if there are any 64-byte chunks after which will be still at least
 * 96 bytes left. The value "96" is calculated as needed buffer for
 * "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
 * incrementing "a0" by 64.
 * For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	 subu	t7,a2,96	# subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6	# subtract from t7 the remainder
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

/*
 * Do not use "pref 30,0(a0)" for a0 in a "middle" of a cache line
 *	pref	30,0(a0)
 * Here we are in the region, where it is safe to use "pref 30,64(a0)"
 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	 nop

	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	 nop			# this "delayed slot" is useless ...

.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	 sw	a1,-4(a0)

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# the t7 is the remainder count past 32-bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	 move	a2,t7

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4aligned
	 subu	a3,a2,t8	# a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

/* filling in words (4-byte chunks) */
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	a1,-4(a0)

/*
 * store last 0-3 bytes
 * this will repeat the last store if the memset finishes on a word boundary
 */
.Llast4aligned:
	j	ra
	 SWLO	a1,-1(t0)

/*
 * Byte-at-a-time path for counts below 4.  The first addiu sits in the
 * delay slot of the beq, so it runs whether or not the branch is taken.
 */
.Llast4:
	beq	a0,t0,.Llast4e
.Llast4l:
	 addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	 sb	a1,-1(a0)
.Llast4e:
	j	ra
	 nop

	.set	at
	.set	reorder

END(memset)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
