/*    $NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $    */
/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e
 */
/*
 * Assembler support macros used by the routines in this file: symbol
 * typing, function entry/exit boilerplate, optional profiling prologue,
 * unwind annotations and the choice of return instruction.
 */

/* Operands for the second argument of the .type directive. */
#define _ASM_TYPE_FUNCTION  #function
#define _ASM_TYPE_OBJECT    #object

/* Both label flavours expand to the bare symbol name. */
#define _C_LABEL(x)     x
#define _ASM_LABEL(x)   x

/* Default code alignment, unless one was already provided. */
#ifndef _ALIGN_TEXT
# define _ALIGN_TEXT .align 2
#endif

/*
 * When profiling, ENTRY() functions begin by copying lr into ip and
 * calling __mcount; otherwise the prologue is empty.
 */
#ifdef GPROF
#define _PROF_PROLOGUE  \
    mov ip, lr; \
    bl __mcount
#else
#define _PROF_PROLOGUE
#endif

#define GLOBAL(x)       .global x

/* Select the instruction set the function body is assembled for. */
#ifdef __thumb__
#define _FUNC_MODE      .code 16; .thumb_func
#else
#define _FUNC_MODE      .code 32
#endif

/*
 * Unwind-table directives.  They expand to nothing when building
 * with _STANDALONE defined.
 */
#ifndef _STANDALONE
#define STOP_UNWINDING  .cantunwind
#define _FNSTART        .fnstart
#define _FNEND          .fnend
#define _SAVE(...)      .save __VA_ARGS__
#else
#define STOP_UNWINDING
#define _FNSTART
#define _FNEND
#define _SAVE(...)
#endif

/*
 * Entry-point building blocks:
 *   _LEENTRY/_EENTRY  type the symbol, set the mode and emit the label
 *                     (the _E form also makes it global); no section
 *                     switch, alignment or .fnstart.
 *   _LENTRY/_ENTRY    as above, preceded by .text and alignment and
 *                     followed by .fnstart.
 *   _LEND/_END        emit the symbol size and .fnend.
 */
#define _LEENTRY(x)     .type x,_ASM_TYPE_FUNCTION; _FUNC_MODE; x:
#define _LEEND(x)       /* nothing */
#define _EENTRY(x)      GLOBAL(x); _LEENTRY(x)
#define _EEND(x)        _LEEND(x)

#define _LENTRY(x)      .text; _ALIGN_TEXT; _LEENTRY(x); _FNSTART
#define _LEND(x)        .size x, . - x; _FNEND
#define _ENTRY(x)       .text; _ALIGN_TEXT; _EENTRY(x); _FNSTART
#define _END(x)         _LEND(x)

/*
 * Public spellings.  ENTRY() adds the profiling prologue; the _NP
 * variants do not.  ASENTRY_NP() takes an assembler-level label.
 */
#define ENTRY(y)        _ENTRY(_C_LABEL(y)); _PROF_PROLOGUE
#define EENTRY(y)       _EENTRY(_C_LABEL(y));
#define ENTRY_NP(y)     _ENTRY(_C_LABEL(y))
#define EENTRY_NP(y)    _EENTRY(_C_LABEL(y))
#define END(y)          _END(_C_LABEL(y))
#define EEND(y)         _EEND(_C_LABEL(y))
#define ASENTRY_NP(y)   _ENTRY(_ASM_LABEL(y))

/*
 * Function return: "bx lr" when _HAVE_ARMv4T_INSTRUCTIONS is defined,
 * otherwise "mov pc, lr".  RETc(c) builds the variant conditional on
 * condition code c.
 */
#if defined (_HAVE_ARMv4T_INSTRUCTIONS)
#define RET     bx      lr
#define RETeq   bxeq    lr
#define RETne   bxne    lr
#define RETc(c) bx##c   lr
#else
#define RET     mov     pc, lr
#define RETeq   moveq   pc, lr
#define RETne   movne   pc, lr
#define RETc(c) mov##c  pc, lr
#endif

    /* The code below is written in unified assembler syntax. */
    .syntax unified


/*
 * do_cksum: wrapper that calls L_cksumdata and hands its result back
 * in r0.
 *
 * Entry parameters (passed through unchanged to L_cksumdata):
 *    r0    Pointer to buffer
 *    r1    Buffer length
 *
 * Returns:
 *    r0    Accumulated 32-bit sum (copied from r2)
 *
 * r4-r7 are preserved across the call because L_cksumdata is
 * documented as clobbering r0-r7.
 */
ENTRY(do_cksum)
    stmfd   sp!, {r4-r7, lr}
    _SAVE({r4-r7, lr})          /* Record the push in the unwind table */
    bl      L_cksumdata
    mov     r0, r2              /* Sum is returned by the helper in r2 */
    ldmfd   sp!, {r4-r7, pc}    /* Restore and return in one go */
END(do_cksum)

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *    r0    Pointer to buffer
 *    r1    Buffer length (treated as signed; a length <= 0 returns a
 *          zero sum without reading the buffer)
 *    lr    Return address
 *
 * Returns:
 *    r2    Accumulated 32-bit sum
 *
 * Clobbers:
 *    r0-r7
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef _ARM_ARCH_5E
    pld     [r0]                    /* Pre-fetch the start of the buffer */
#endif
    mov     r2, #0

    /*
     * Nothing to do for an empty (or negative-length) buffer.  Without
     * this check the "blt .Lcksumdata_endgame" tests below would send a
     * zero length to the endgame code, which always loads and sums the
     * byte at [r0].
     */
    cmp     r1, #0x00
    RETc(le)

    /* We first have to word-align the buffer.  */
    ands    r7, r0, #0x03
    beq     .Lcksumdata_wordaligned
    rsb     r7, r7, #0x04           /* r7 = bytes needed to reach alignment */
    cmp     r1, r7                  /* Enough bytes left to make it? */
    blt     .Lcksumdata_endgame
    cmp     r7, #0x02               /* Flags select how many bytes to fetch */
    ldrb    r4, [r0], #0x01         /* Fetch 1st byte */
    ldrbge  r5, [r0], #0x01         /* Fetch 2nd byte */
    movlt   r5, #0x00
    ldrbgt  r6, [r0], #0x01         /* Fetch 3rd byte */
    movle   r6, #0x00
    /* Combine the three bytes depending on endianness and alignment */
#ifdef __ARMEB__
    orreq   r2, r5, r4, lsl #8
    orreq   r2, r2, r6, lsl #24
    orrne   r2, r4, r5, lsl #8
    orrne   r2, r2, r6, lsl #16
#else
    orreq   r2, r4, r5, lsl #8
    orreq   r2, r2, r6, lsl #16
    orrne   r2, r5, r4, lsl #8
    orrne   r2, r2, r6, lsl #24
#endif
    subs    r1, r1, r7              /* Update length */
    RETeq                           /* All done? */

    /* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef _ARM_ARCH_5E
    cmp     r1, #0x04               /* Less than 4 bytes left? */
    blt     .Lcksumdata_endgame     /* Yup */

    /* Now quad-align, if necessary */
    ands    r7, r0, #0x04           /* r7 = 0 when already quad aligned */
    ldrne   r7, [r0], #0x04         /* else r7 = the odd leading word */
    subne   r1, r1, #0x04
    subs    r1, r1, #0x40
    blt     .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */

    /*
     * Buffer is now quad aligned. Sum 64 bytes at a time.
     * Note: First ldrd is hoisted above the loop, together with
     * setting r6 to zero to avoid stalling for results in the
     * loop. (r7 is live, from above).
     */
    ldrd    r4, [r0], #0x08
    mov     r6, #0x00
.Lcksumdata_bigloop:
    pld     [r0, #0x18]
    adds    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    pld     [r0, #0x18]
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adc     r2, r2, #0x00           /* Fold the final carry back in */
    subs    r1, r1, #0x40
    ldrdge  r4, [r0], #0x08         /* Start the next iteration's first load */
    bge     .Lcksumdata_bigloop

    adds    r2, r2, r6              /* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
    adcs    r2, r2, r7
    adc     r2, r2, #0x00

#else    /* !_ARM_ARCH_5E */

    subs    r1, r1, #0x40
    blt     .Lcksumdata_bigloop_end

    /* Sum 64 bytes per iteration, four words at a time. */
.Lcksumdata_bigloop:
    ldmia   r0!, {r3, r4, r5, r6}
    adds    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r6}
    adcs    r2, r2, r7
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r7
    adc     r2, r2, #0x00           /* Fold the final carry back in */
    subs    r1, r1, #0x40
    bge     .Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

    adds    r1, r1, #0x40           /* Undo the loop's bias: r1 = bytes left */
    RETeq
    cmp     r1, #0x20

#ifdef _ARM_ARCH_5E
    ldrdge  r4, [r0], #0x08         /* Avoid stalling pld and result */
    blt     .Lcksumdata_less_than_32
    pld     [r0, #0x18]
    ldrd    r6, [r0], #0x08
    adds    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r6              /* XXX: Unavoidable result stall */
    adcs    r2, r2, r7
#else
    blt     .Lcksumdata_less_than_32
    ldmia   r0!, {r3, r4, r5, r6}
    adds    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r7
#endif
    adc     r2, r2, #0x00
    subs    r1, r1, #0x20
    RETeq

.Lcksumdata_less_than_32:
    /*
     * There are less than 32 bytes left.
     *
     * r3 = whole 8-byte chunks remaining, in bytes (0, 8, 16 or 24).
     * r4 = (0x18 - r3) * 1.5 = code bytes to skip in the ladder below,
     * where each three-instruction rung consumes 8 bytes of data.
     * The nop fills the slot between the computed branch and the first
     * rung, so a zero offset simply falls through to it.
     */
    and     r3, r1, #0x18
    rsb     r4, r3, #0x18
    sub     r1, r1, r3
    adds    r4, r4, r4, lsr #1      /* Side effect: Clear carry flag */
    addne   pc, pc, r4
    nop

/*
 * Note: We use ldm here, even on armv5e, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
    /* At least 24 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* At least 16 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* At least 8 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* Less than 8 bytes remaining... */
    adc     r2, r2, #0x00
    subs    r1, r1, #0x04
    blt     .Lcksumdata_lessthan4

    ldr     r4, [r0], #0x04
    sub     r1, r1, #0x04
    adds    r2, r2, r4
    adc     r2, r2, #0x00

    /* Deal with fewer than 4 bytes remaining */
.Lcksumdata_lessthan4:
    adds    r1, r1, #0x04           /* Undo the bias: r1 = bytes left (0-3) */
    RETeq

    /* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
    ldrb    r3, [r0]                /* Fetch first byte */
    cmp     r1, #0x02
    ldrbge  r4, [r0, #0x01]         /* Fetch 2nd and 3rd as necessary */
    movlt   r4, #0x00
    ldrbgt  r5, [r0, #0x02]
    movle   r5, #0x00
    /* Combine the three bytes depending on endianness and alignment */
    tst     r0, #0x01
#ifdef __ARMEB__
    orreq   r3, r4, r3, lsl #8
    orreq   r3, r3, r5, lsl #24
    orrne   r3, r3, r4, lsl #8
    orrne   r3, r3, r5, lsl #16
#else
    orreq   r3, r3, r4, lsl #8
    orreq   r3, r3, r5, lsl #16
    orrne   r3, r4, r3, lsl #8
    orrne   r3, r3, r5, lsl #24
#endif
    adds    r2, r2, r3
    adc     r2, r2, #0x00
    RET
END(L_cksumdata)