/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/
/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e
 */

/*
 * Function entry/exit helper macros.  This file is run through the C
 * preprocessor before being assembled (32-bit ARM, GNU as unified syntax).
 */
#define	_ASM_TYPE_FUNCTION	#function
#define	_ASM_TYPE_OBJECT	#object

#define	_C_LABEL(x)	x
#define	_ASM_LABEL(x)	x

#ifndef _ALIGN_TEXT
# define _ALIGN_TEXT .align 2
#endif

#ifdef GPROF
#define	_PROF_PROLOGUE	\
	mov ip, lr;	\
	bl __mcount
#else
#define	_PROF_PROLOGUE
#endif

#define	GLOBAL(x)	.global x

#ifdef __thumb__
#define	_FUNC_MODE	.code 16; .thumb_func
#else
#define	_FUNC_MODE	.code 32
#endif

/* Unwind-table directives; compiled out for _STANDALONE builds. */
#ifndef _STANDALONE
#define	STOP_UNWINDING	.cantunwind
#define	_FNSTART	.fnstart
#define	_FNEND		.fnend
#define	_SAVE(...)	.save __VA_ARGS__
#else
#define	STOP_UNWINDING
#define	_FNSTART
#define	_FNEND
#define	_SAVE(...)
#endif

#define	_LEENTRY(x)	.type x,_ASM_TYPE_FUNCTION; _FUNC_MODE; x:
#define	_LEEND(x)	/* nothing */
#define	_EENTRY(x)	GLOBAL(x); _LEENTRY(x)
#define	_EEND(x)	_LEEND(x)

#define	_LENTRY(x)	.text; _ALIGN_TEXT; _LEENTRY(x); _FNSTART
#define	_LEND(x)	.size x, . - x; _FNEND
#define	_ENTRY(x)	.text; _ALIGN_TEXT; _EENTRY(x); _FNSTART
#define	_END(x)		_LEND(x)

#define	ENTRY(y)	_ENTRY(_C_LABEL(y)); _PROF_PROLOGUE
#define	EENTRY(y)	_EENTRY(_C_LABEL(y));
#define	ENTRY_NP(y)	_ENTRY(_C_LABEL(y))
#define	EENTRY_NP(y)	_EENTRY(_C_LABEL(y))
#define	END(y)		_END(_C_LABEL(y))
#define	EEND(y)		_EEND(_C_LABEL(y))
#define	ASENTRY_NP(y)	_ENTRY(_ASM_LABEL(y))

/* Function return, optionally predicated on a condition code. */
#if defined (_HAVE_ARMv4T_INSTRUCTIONS)
#define	RET		bx	lr
#define	RETeq		bxeq	lr
#define	RETne		bxne	lr
#define	RETc(c)		bx##c	lr
#else
#define	RET		mov	pc, lr
#define	RETeq		moveq	pc, lr
#define	RETne		movne	pc, lr
#define	RETc(c)		mov##c	pc, lr
#endif

	.syntax	unified

/*
 * do_cksum: globally visible wrapper around L_cksumdata.
 *
 * Entry parameters:
 *	r0	Pointer to buffer	(passed through unchanged)
 *	r1	Buffer length		(passed through unchanged)
 *
 * Returns:
 *	r0	Accumulated 32-bit sum (L_cksumdata's r2 result)
 *
 * r4-r7 are saved and restored here because L_cksumdata clobbers them;
 * lr is saved because the "bl" below overwrites it.
 */
ENTRY(do_cksum)
	stmfd	sp!, {r4-r7, lr}
	bl	L_cksumdata
	mov	r0, r2			/* Move the sum to the return register */
	ldmfd	sp!, {r4-r7, pc}
END(do_cksum)

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length.  A length <= 0 returns r2 = 0 without
 *		reading from the buffer.
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7, condition flags
 *
 * The sum is accumulated as 32-bit words with the carry out of each add
 * chained into the next one (adcs) and folded back in with a final
 * "adc r2, r2, #0".
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef _ARM_ARCH_5E
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	mov	r2, #0

	/*
	 * Nothing to sum for an empty buffer.  Without this check a zero
	 * length reaches .Lcksumdata_endgame (always for an unaligned
	 * pointer, and for any pointer on _ARM_ARCH_5E), which loads and
	 * sums the byte at [r0] unconditionally.
	 */
	cmp	r1, #0x00
	RETc(le)

	/* We first have to word-align the buffer.  */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach alignment */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02		/* Flags select how many bytes to fetch */
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00

	/*
	 * Combine the three bytes depending on endianness and alignment.
	 * "eq" (r7 == 2) means the buffer started on an even address,
	 * "ne" (r7 == 1 or 3) means it started on an odd address.
	 */
#ifdef __ARMEB__
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	RETeq				/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef _ARM_ARCH_5E
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	/*
	 * Now quad-align, if necessary.  r7 ends up holding either zero or
	 * the word consumed to reach 8-byte alignment; it is summed later.
	 */
	ands	r7, r0, #0x04
	ldrne	r7, [r0], #0x04
	subne	r1, r1, #0x04
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop, together with
	 * setting r6 to zero to avoid stalling for results in the
	 * loop. (r7 is live, from above).
	 */
	ldrd	r4, [r0], #0x08
	mov	r6, #0x00
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	adds	r2, r2, r6		/* r6/r7 carried over from last pass */
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrdge	r4, [r0], #0x08
	bge	.Lcksumdata_bigloop

	adds	r2, r2, r6		/* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
	adcs	r2, r2, r7
	adc	r2, r2, #0x00

#else	/* !_ARM_ARCH_5E */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Sum 64 bytes at a time, four words per ldmia. */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

	adds	r1, r1, #0x40		/* Undo the loop's over-subtraction */
	RETeq
	cmp	r1, #0x20

	/* At least 32 bytes left: sum one 32-byte chunk. */
#ifdef _ARM_ARCH_5E
	ldrdge	r4, [r0], #0x08		/* Avoid stalling pld and result */
	blt	.Lcksumdata_less_than_32
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
	adcs	r2, r2, r7
#else
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
#endif
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	RETeq

.Lcksumdata_less_than_32:
	/* There are less than 32 bytes left */
	and	r3, r1, #0x18		/* r3 = whole 8-byte chunks, in bytes */
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	/*
	 * Computed jump into the three 8-byte steps below.  Each step is
	 * three ARM instructions (12 bytes), so the byte offset to skip is
	 * (0x18 - r3) * 1.5.  In ARM state pc reads as this instruction + 8,
	 * i.e. the first ldmia; the nop fills the slot in between and is
	 * only executed when the offset is zero.
	 */
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4
	nop

/*
 * Note: We use ldm here, even on armv5e, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04		/* Pre-bias for the adds below */
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	RETeq

	/*
	 * Deal with 1 to 3 remaining bytes, possibly misaligned.
	 * The first byte is always loaded, so r1 must be >= 1 here; the
	 * length check at function entry guarantees that.
	 */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#ifdef __ARMEB__
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00
	RET
END(L_cksumdata)