/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
36 * 37 */ 38 39/* 40 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e 41 */ 42#define _ASM_TYPE_FUNCTION #function 43#define _ASM_TYPE_OBJECT #object 44 45#define _C_LABEL(x) x 46#define _ASM_LABEL(x) x 47 48#ifndef _ALIGN_TEXT 49# define _ALIGN_TEXT .align 2 50#endif 51 52#ifdef GPROF 53#define _PROF_PROLOGUE \ 54 mov ip, lr; \ 55 bl __mcount 56#else 57#define _PROF_PROLOGUE 58#endif 59 60#define GLOBAL(x) .global x 61 62#ifdef __thumb__ 63#define _FUNC_MODE .code 16; .thumb_func 64#else 65#define _FUNC_MODE .code 32 66#endif 67 68#ifndef _STANDALONE 69#define STOP_UNWINDING .cantunwind 70#define _FNSTART .fnstart 71#define _FNEND .fnend 72#define _SAVE(...) .save __VA_ARGS__ 73#else 74#define STOP_UNWINDING 75#define _FNSTART 76#define _FNEND 77#define _SAVE(...) 78#endif 79 80#define _LEENTRY(x) .type x,_ASM_TYPE_FUNCTION; _FUNC_MODE; x: 81#define _LEEND(x) /* nothing */ 82#define _EENTRY(x) GLOBAL(x); _LEENTRY(x) 83#define _EEND(x) _LEEND(x) 84 85#define _LENTRY(x) .text; _ALIGN_TEXT; _LEENTRY(x); _FNSTART 86#define _LEND(x) .size x, . - x; _FNEND 87#define _ENTRY(x) .text; _ALIGN_TEXT; _EENTRY(x); _FNSTART 88#define _END(x) _LEND(x) 89 90#define ENTRY(y) _ENTRY(_C_LABEL(y)); _PROF_PROLOGUE 91#define EENTRY(y) _EENTRY(_C_LABEL(y)); 92#define ENTRY_NP(y) _ENTRY(_C_LABEL(y)) 93#define EENTRY_NP(y) _EENTRY(_C_LABEL(y)) 94#define END(y) _END(_C_LABEL(y)) 95#define EEND(y) _EEND(_C_LABEL(y)) 96#define ASENTRY_NP(y) _ENTRY(_ASM_LABEL(y)) 97 98#if defined (_HAVE_ARMv4T_INSTRUCTIONS) 99#define RET bx lr 100#define RETeq bxeq lr 101#define RETne bxne lr 102#define RETc(c) bx##c lr 103#else 104#define RET mov pc, lr 105#define RETeq moveq pc, lr 106#define RETne movne pc, lr 107#define RETc(c) mov##c pc, lr 108#endif 109 110 .syntax unified 111 112 113 114ENTRY(do_cksum) 115 stmfd sp!, {r4-r7, lr} 116 bl L_cksumdata 117 mov r0, r2 118 ldmfd sp!, {r4-r7, pc} 119END(do_cksum) 120 121/* 122 * The main in*_cksum() workhorse... 
123 * 124 * Entry parameters: 125 * r0 Pointer to buffer 126 * r1 Buffer length 127 * lr Return address 128 * 129 * Returns: 130 * r2 Accumulated 32-bit sum 131 * 132 * Clobbers: 133 * r0-r7 134 */ 135/* LINTSTUB: Ignore */ 136ASENTRY_NP(L_cksumdata) 137#ifdef _ARM_ARCH_5E 138 pld [r0] /* Pre-fetch the start of the buffer */ 139#endif 140 mov r2, #0 141 142 /* We first have to word-align the buffer. */ 143 ands r7, r0, #0x03 144 beq .Lcksumdata_wordaligned 145 rsb r7, r7, #0x04 146 cmp r1, r7 /* Enough bytes left to make it? */ 147 blt .Lcksumdata_endgame 148 cmp r7, #0x02 149 ldrb r4, [r0], #0x01 /* Fetch 1st byte */ 150 ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */ 151 movlt r5, #0x00 152 ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */ 153 movle r6, #0x00 154 155 /* Combine the three bytes depending on endianness and alignment */ 156#ifdef __ARMEB__ 157 orreq r2, r5, r4, lsl #8 158 orreq r2, r2, r6, lsl #24 159 orrne r2, r4, r5, lsl #8 160 orrne r2, r2, r6, lsl #16 161#else 162 orreq r2, r4, r5, lsl #8 163 orreq r2, r2, r6, lsl #16 164 orrne r2, r5, r4, lsl #8 165 orrne r2, r2, r6, lsl #24 166#endif 167 subs r1, r1, r7 /* Update length */ 168 RETeq /* All done? */ 169 170 /* Buffer is now word aligned */ 171.Lcksumdata_wordaligned: 172#ifdef _ARM_ARCH_5E 173 cmp r1, #0x04 /* Less than 4 bytes left? */ 174 blt .Lcksumdata_endgame /* Yup */ 175 176 /* Now quad-align, if necessary */ 177 ands r7, r0, #0x04 178 ldrne r7, [r0], #0x04 179 subne r1, r1, #0x04 180 subs r1, r1, #0x40 181 blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */ 182 183 /* 184 * Buffer is now quad aligned. Sum 64 bytes at a time. 185 * Note: First ldrd is hoisted above the loop, together with 186 * setting r6 to zero to avoid stalling for results in the 187 * loop. (r7 is live, from above). 
188 */ 189 ldrd r4, [r0], #0x08 190 mov r6, #0x00 191.Lcksumdata_bigloop: 192 pld [r0, #0x18] 193 adds r2, r2, r6 194 adcs r2, r2, r7 195 ldrd r6, [r0], #0x08 196 adcs r2, r2, r4 197 adcs r2, r2, r5 198 ldrd r4, [r0], #0x08 199 adcs r2, r2, r6 200 adcs r2, r2, r7 201 ldrd r6, [r0], #0x08 202 adcs r2, r2, r4 203 adcs r2, r2, r5 204 ldrd r4, [r0], #0x08 205 adcs r2, r2, r6 206 adcs r2, r2, r7 207 pld [r0, #0x18] 208 ldrd r6, [r0], #0x08 209 adcs r2, r2, r4 210 adcs r2, r2, r5 211 ldrd r4, [r0], #0x08 212 adcs r2, r2, r6 213 adcs r2, r2, r7 214 ldrd r6, [r0], #0x08 215 adcs r2, r2, r4 216 adcs r2, r2, r5 217 adc r2, r2, #0x00 218 subs r1, r1, #0x40 219 ldrdge r4, [r0], #0x08 220 bge .Lcksumdata_bigloop 221 222 adds r2, r2, r6 /* r6/r7 still need summing */ 223.Lcksumdata_bigloop_end: 224 adcs r2, r2, r7 225 adc r2, r2, #0x00 226 227#else /* !_ARM_ARCH_5E */ 228 229 subs r1, r1, #0x40 230 blt .Lcksumdata_bigloop_end 231 232.Lcksumdata_bigloop: 233 ldmia r0!, {r3, r4, r5, r6} 234 adds r2, r2, r3 235 adcs r2, r2, r4 236 adcs r2, r2, r5 237 ldmia r0!, {r3, r4, r5, r7} 238 adcs r2, r2, r6 239 adcs r2, r2, r3 240 adcs r2, r2, r4 241 adcs r2, r2, r5 242 ldmia r0!, {r3, r4, r5, r6} 243 adcs r2, r2, r7 244 adcs r2, r2, r3 245 adcs r2, r2, r4 246 adcs r2, r2, r5 247 ldmia r0!, {r3, r4, r5, r7} 248 adcs r2, r2, r6 249 adcs r2, r2, r3 250 adcs r2, r2, r4 251 adcs r2, r2, r5 252 adcs r2, r2, r7 253 adc r2, r2, #0x00 254 subs r1, r1, #0x40 255 bge .Lcksumdata_bigloop 256.Lcksumdata_bigloop_end: 257#endif 258 259 adds r1, r1, #0x40 260 RETeq 261 cmp r1, #0x20 262 263#ifdef _ARM_ARCH_5E 264 ldrdge r4, [r0], #0x08 /* Avoid stalling pld and result */ 265 blt .Lcksumdata_less_than_32 266 pld [r0, #0x18] 267 ldrd r6, [r0], #0x08 268 adds r2, r2, r4 269 adcs r2, r2, r5 270 ldrd r4, [r0], #0x08 271 adcs r2, r2, r6 272 adcs r2, r2, r7 273 ldrd r6, [r0], #0x08 274 adcs r2, r2, r4 275 adcs r2, r2, r5 276 adcs r2, r2, r6 /* XXX: Unavoidable result stall */ 277 adcs r2, r2, r7 278#else 279 blt 
.Lcksumdata_less_than_32 280 ldmia r0!, {r3, r4, r5, r6} 281 adds r2, r2, r3 282 adcs r2, r2, r4 283 adcs r2, r2, r5 284 ldmia r0!, {r3, r4, r5, r7} 285 adcs r2, r2, r6 286 adcs r2, r2, r3 287 adcs r2, r2, r4 288 adcs r2, r2, r5 289 adcs r2, r2, r7 290#endif 291 adc r2, r2, #0x00 292 subs r1, r1, #0x20 293 RETeq 294 295.Lcksumdata_less_than_32: 296 /* There are less than 32 bytes left */ 297 and r3, r1, #0x18 298 rsb r4, r3, #0x18 299 sub r1, r1, r3 300 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */ 301 addne pc, pc, r4 302 nop 303 304/* 305 * Note: We use ldm here, even on armv5e, since the combined issue/result 306 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs. 307 */ 308 /* At least 24 bytes remaining... */ 309 ldmia r0!, {r4, r5} 310 adcs r2, r2, r4 311 adcs r2, r2, r5 312 313 /* At least 16 bytes remaining... */ 314 ldmia r0!, {r4, r5} 315 adcs r2, r2, r4 316 adcs r2, r2, r5 317 318 /* At least 8 bytes remaining... */ 319 ldmia r0!, {r4, r5} 320 adcs r2, r2, r4 321 adcs r2, r2, r5 322 323 /* Less than 8 bytes remaining... 
*/ 324 adc r2, r2, #0x00 325 subs r1, r1, #0x04 326 blt .Lcksumdata_lessthan4 327 328 ldr r4, [r0], #0x04 329 sub r1, r1, #0x04 330 adds r2, r2, r4 331 adc r2, r2, #0x00 332 333 /* Deal with < 4 bytes remaining */ 334.Lcksumdata_lessthan4: 335 adds r1, r1, #0x04 336 RETeq 337 338 /* Deal with 1 to 3 remaining bytes, possibly misaligned */ 339.Lcksumdata_endgame: 340 ldrb r3, [r0] /* Fetch first byte */ 341 cmp r1, #0x02 342 ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */ 343 movlt r4, #0x00 344 ldrbgt r5, [r0, #0x02] 345 movle r5, #0x00 346 /* Combine the three bytes depending on endianness and alignment */ 347 tst r0, #0x01 348#ifdef __ARMEB__ 349 orreq r3, r4, r3, lsl #8 350 orreq r3, r3, r5, lsl #24 351 orrne r3, r3, r4, lsl #8 352 orrne r3, r3, r5, lsl #16 353#else 354 orreq r3, r3, r4, lsl #8 355 orreq r3, r3, r5, lsl #16 356 orrne r3, r4, r3, lsl #8 357 orrne r3, r3, r5, lsl #24 358#endif 359 adds r2, r2, r3 360 adc r2, r2, #0x00 361 RET 362END(L_cksumdata) 363