/* $NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $ */ /*- * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */ /* * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e */ #define _ASM_TYPE_FUNCTION #function #define _ASM_TYPE_OBJECT #object #define _C_LABEL(x) x #define _ASM_LABEL(x) x #ifndef _ALIGN_TEXT # define _ALIGN_TEXT .align 2 #endif #ifdef GPROF #define _PROF_PROLOGUE \ mov ip, lr; \ bl __mcount #else #define _PROF_PROLOGUE #endif #define GLOBAL(x) .global x #ifdef __thumb__ #define _FUNC_MODE .code 16; .thumb_func #else #define _FUNC_MODE .code 32 #endif #ifndef _STANDALONE #define STOP_UNWINDING .cantunwind #define _FNSTART .fnstart #define _FNEND .fnend #define _SAVE(...) .save __VA_ARGS__ #else #define STOP_UNWINDING #define _FNSTART #define _FNEND #define _SAVE(...) #endif #define _LEENTRY(x) .type x,_ASM_TYPE_FUNCTION; _FUNC_MODE; x: #define _LEEND(x) /* nothing */ #define _EENTRY(x) GLOBAL(x); _LEENTRY(x) #define _EEND(x) _LEEND(x) #define _LENTRY(x) .text; _ALIGN_TEXT; _LEENTRY(x); _FNSTART #define _LEND(x) .size x, . - x; _FNEND #define _ENTRY(x) .text; _ALIGN_TEXT; _EENTRY(x); _FNSTART #define _END(x) _LEND(x) #define ENTRY(y) _ENTRY(_C_LABEL(y)); _PROF_PROLOGUE #define EENTRY(y) _EENTRY(_C_LABEL(y)); #define ENTRY_NP(y) _ENTRY(_C_LABEL(y)) #define EENTRY_NP(y) _EENTRY(_C_LABEL(y)) #define END(y) _END(_C_LABEL(y)) #define EEND(y) _EEND(_C_LABEL(y)) #define ASENTRY_NP(y) _ENTRY(_ASM_LABEL(y)) #if defined (_HAVE_ARMv4T_INSTRUCTIONS) #define RET bx lr #define RETeq bxeq lr #define RETne bxne lr #define RETc(c) bx##c lr #else #define RET mov pc, lr #define RETeq moveq pc, lr #define RETne movne pc, lr #define RETc(c) mov##c pc, lr #endif .syntax unified ENTRY(do_cksum) stmfd sp!, {r4-r7, lr} bl L_cksumdata mov r0, r2 ldmfd sp!, {r4-r7, pc} END(do_cksum) /* * The main in*_cksum() workhorse... * * Entry parameters: * r0 Pointer to buffer * r1 Buffer length * lr Return address * * Returns: * r2 Accumulated 32-bit sum * * Clobbers: * r0-r7 */ /* LINTSTUB: Ignore */ ASENTRY_NP(L_cksumdata) #ifdef _ARM_ARCH_5E pld [r0] /* Pre-fetch the start of the buffer */ #endif mov r2, #0 /* We first have to word-align the buffer. */ ands r7, r0, #0x03 beq .Lcksumdata_wordaligned rsb r7, r7, #0x04 cmp r1, r7 /* Enough bytes left to make it? */ blt .Lcksumdata_endgame cmp r7, #0x02 ldrb r4, [r0], #0x01 /* Fetch 1st byte */ ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */ movlt r5, #0x00 ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */ movle r6, #0x00 /* Combine the three bytes depending on endianness and alignment */ #ifdef __ARMEB__ orreq r2, r5, r4, lsl #8 orreq r2, r2, r6, lsl #24 orrne r2, r4, r5, lsl #8 orrne r2, r2, r6, lsl #16 #else orreq r2, r4, r5, lsl #8 orreq r2, r2, r6, lsl #16 orrne r2, r5, r4, lsl #8 orrne r2, r2, r6, lsl #24 #endif subs r1, r1, r7 /* Update length */ RETeq /* All done? */ /* Buffer is now word aligned */ .Lcksumdata_wordaligned: #ifdef _ARM_ARCH_5E cmp r1, #0x04 /* Less than 4 bytes left? */ blt .Lcksumdata_endgame /* Yup */ /* Now quad-align, if necessary */ ands r7, r0, #0x04 ldrne r7, [r0], #0x04 subne r1, r1, #0x04 subs r1, r1, #0x40 blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */ /* * Buffer is now quad aligned. Sum 64 bytes at a time. * Note: First ldrd is hoisted above the loop, together with * setting r6 to zero to avoid stalling for results in the * loop. (r7 is live, from above). */ ldrd r4, [r0], #0x08 mov r6, #0x00 .Lcksumdata_bigloop: pld [r0, #0x18] adds r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 pld [r0, #0x18] ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 adc r2, r2, #0x00 subs r1, r1, #0x40 ldrdge r4, [r0], #0x08 bge .Lcksumdata_bigloop adds r2, r2, r6 /* r6/r7 still need summing */ .Lcksumdata_bigloop_end: adcs r2, r2, r7 adc r2, r2, #0x00 #else /* !_ARM_ARCH_5E */ subs r1, r1, #0x40 blt .Lcksumdata_bigloop_end .Lcksumdata_bigloop: ldmia r0!, {r3, r4, r5, r6} adds r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r6} adcs r2, r2, r7 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r7 adc r2, r2, #0x00 subs r1, r1, #0x40 bge .Lcksumdata_bigloop .Lcksumdata_bigloop_end: #endif adds r1, r1, #0x40 RETeq cmp r1, #0x20 #ifdef _ARM_ARCH_5E ldrdge r4, [r0], #0x08 /* Avoid stalling pld and result */ blt .Lcksumdata_less_than_32 pld [r0, #0x18] ldrd r6, [r0], #0x08 adds r2, r2, r4 adcs r2, r2, r5 ldrd r4, [r0], #0x08 adcs r2, r2, r6 adcs r2, r2, r7 ldrd r6, [r0], #0x08 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r6 /* XXX: Unavoidable result stall */ adcs r2, r2, r7 #else blt .Lcksumdata_less_than_32 ldmia r0!, {r3, r4, r5, r6} adds r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 ldmia r0!, {r3, r4, r5, r7} adcs r2, r2, r6 adcs r2, r2, r3 adcs r2, r2, r4 adcs r2, r2, r5 adcs r2, r2, r7 #endif adc r2, r2, #0x00 subs r1, r1, #0x20 RETeq .Lcksumdata_less_than_32: /* There are less than 32 bytes left */ and r3, r1, #0x18 rsb r4, r3, #0x18 sub r1, r1, r3 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */ addne pc, pc, r4 nop /* * Note: We use ldm here, even on armv5e, since the combined issue/result * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs. */ /* At least 24 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* At least 16 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* At least 8 bytes remaining... */ ldmia r0!, {r4, r5} adcs r2, r2, r4 adcs r2, r2, r5 /* Less than 8 bytes remaining... */ adc r2, r2, #0x00 subs r1, r1, #0x04 blt .Lcksumdata_lessthan4 ldr r4, [r0], #0x04 sub r1, r1, #0x04 adds r2, r2, r4 adc r2, r2, #0x00 /* Deal with < 4 bytes remaining */ .Lcksumdata_lessthan4: adds r1, r1, #0x04 RETeq /* Deal with 1 to 3 remaining bytes, possibly misaligned */ .Lcksumdata_endgame: ldrb r3, [r0] /* Fetch first byte */ cmp r1, #0x02 ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */ movlt r4, #0x00 ldrbgt r5, [r0, #0x02] movle r5, #0x00 /* Combine the three bytes depending on endianness and alignment */ tst r0, #0x01 #ifdef __ARMEB__ orreq r3, r4, r3, lsl #8 orreq r3, r3, r5, lsl #24 orrne r3, r3, r4, lsl #8 orrne r3, r3, r5, lsl #16 #else orreq r3, r3, r4, lsl #8 orreq r3, r3, r5, lsl #16 orrne r3, r4, r3, lsl #8 orrne r3, r3, r5, lsl #24 #endif adds r2, r2, r3 adc r2, r2, #0x00 RET END(L_cksumdata)