/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/armv5e
 */
/*
 * Assembler support macros used by the routines in this file: symbol
 * typing, function entry/exit boilerplate and return sequences.
 */

/* Symbol type operands handed to the .type directive. */
#define _ASM_TYPE_FUNCTION  #function
#define _ASM_TYPE_OBJECT    #object

/* Label decoration: C-visible and asm-only labels are both used as is. */
#define _C_LABEL(x)         x
#define _ASM_LABEL(x)       x

/* Text alignment emitted before each entry point, unless overridden. */
#ifndef _ALIGN_TEXT
# define _ALIGN_TEXT        .align 2
#endif

/*
 * Profiling prologue appended by ENTRY(): with GPROF defined it copies
 * lr to ip and calls __mcount; otherwise it expands to nothing.
 */
#ifdef GPROF
#define _PROF_PROLOGUE      \
    mov ip, lr;             \
    bl __mcount
#else
#define _PROF_PROLOGUE
#endif

#define GLOBAL(x)           .global x

/* Instruction-set selection emitted at each entry point. */
#ifdef __thumb__
#define _FUNC_MODE          .code 16; .thumb_func
#else
#define _FUNC_MODE          .code 32
#endif

/* Unwind-table directives; they expand to nothing when _STANDALONE is set. */
#ifndef _STANDALONE
#define STOP_UNWINDING      .cantunwind
#define _FNSTART            .fnstart
#define _FNEND              .fnend
#define _SAVE(...)          .save __VA_ARGS__
#else
#define STOP_UNWINDING
#define _FNSTART
#define _FNEND
#define _SAVE(...)
#endif

/*
 * Entry/end building blocks.
 *   _LEENTRY/_EENTRY   type the symbol as a function, select the
 *                      instruction set and place the label (_EENTRY also
 *                      exports it); no section change, no alignment.
 *   _LENTRY/_ENTRY     switch to .text, align, then do the above and open
 *                      an unwind region.
 *   _LEND/_END         record the symbol size and close the unwind region.
 */
#define _LEENTRY(x)         .type x,_ASM_TYPE_FUNCTION; _FUNC_MODE; x:
#define _LEEND(x)           /* nothing */
#define _EENTRY(x)          GLOBAL(x); _LEENTRY(x)
#define _EEND(x)            _LEEND(x)

#define _LENTRY(x)          .text; _ALIGN_TEXT; _LEENTRY(x); _FNSTART
#define _LEND(x)            .size x, . - x; _FNEND
#define _ENTRY(x)           .text; _ALIGN_TEXT; _EENTRY(x); _FNSTART
#define _END(x)             _LEND(x)

/*
 * Public wrappers.  ENTRY() is the only one that adds _PROF_PROLOGUE;
 * the *_NP variants never profile.
 */
#define ENTRY(y)            _ENTRY(_C_LABEL(y)); _PROF_PROLOGUE
#define EENTRY(y)           _EENTRY(_C_LABEL(y));
#define ENTRY_NP(y)         _ENTRY(_C_LABEL(y))
#define EENTRY_NP(y)        _EENTRY(_C_LABEL(y))
#define END(y)              _END(_C_LABEL(y))
#define EEND(y)             _EEND(_C_LABEL(y))
#define ASENTRY_NP(y)       _ENTRY(_ASM_LABEL(y))

/*
 * Return sequences: "bx lr" when _HAVE_ARMv4T_INSTRUCTIONS is defined,
 * "mov pc, lr" otherwise.  RETc(c) builds the conditional form for an
 * arbitrary condition code c.
 */
#if defined (_HAVE_ARMv4T_INSTRUCTIONS)
#define RET                 bx      lr
#define RETeq               bxeq    lr
#define RETne               bxne    lr
#define RETc(c)             bx##c   lr
#else
#define RET                 mov     pc, lr
#define RETeq               moveq   pc, lr
#define RETne               movne   pc, lr
#define RETc(c)             mov##c  pc, lr
#endif

    /* Conditional mnemonics below are written in unified form (ldrbge, ldrdge). */
    .syntax unified

/*
 * do_cksum -- exported wrapper around L_cksumdata.
 *
 * r0 and r1 are handed to L_cksumdata unchanged (buffer pointer and
 * buffer length, per that routine's header).  The 32-bit sum it leaves
 * in r2 is moved to r0 for the caller.
 *
 * r4-r7 are saved and restored here because L_cksumdata clobbers them;
 * lr is pushed so the final ldmfd can return by popping it into pc.
 */
ENTRY(do_cksum)
    stmfd   sp!, {r4-r7, lr}
    bl      L_cksumdata
    mov     r0, r2                  /* Sum comes back in r2 */
    ldmfd   sp!, {r4-r7, pc}
END(do_cksum)

/*
 * The main in*_cksum() workhorse...
 *
 * Sums the buffer a word at a time with end-around carry: every
 * adds/adcs chain below is closed by an "adc r2, r2, #0x00" which folds
 * the last carry back into the accumulator.  Leading and trailing bytes
 * which do not make up a whole word are placed in byte lanes selected by
 * the alignment of their address (and by __ARMEB__), not by their offset
 * from the start of the buffer.
 *
 * Entry parameters:
 *    r0    Pointer to buffer
 *    r1    Buffer length.  A length of zero returns a sum of zero
 *          without reading the buffer.
 *    lr    Return address
 *
 * Returns:
 *    r2    Accumulated 32-bit sum
 *
 * Clobbers:
 *    r0-r7, condition flags
 *
 * Note: the computed branch in .Lcksumdata_less_than_32 relies on each
 * "ldmia + adcs + adcs" group occupying exactly 12 bytes and on pc
 * reading as the address of the current instruction plus 8.  Do not add
 * or remove instructions between the "addne pc, pc, r4" and the
 * "Less than 8 bytes remaining" code.
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef _ARM_ARCH_5E
    pld     [r0]                    /* Pre-fetch the start of the buffer */
#endif
    mov     r2, #0

    /*
     * An empty buffer must not reach .Lcksumdata_endgame, which always
     * loads the byte at [r0].
     */
    cmp     r1, #0x00
    RETeq                           /* Nothing to sum: return 0 */

    /* We first have to word-align the buffer.  */
    ands    r7, r0, #0x03
    beq     .Lcksumdata_wordaligned
    rsb     r7, r7, #0x04           /* r7 = bytes up to the next word */
    cmp     r1, r7                  /* Enough bytes left to make it? */
    blt     .Lcksumdata_endgame
    cmp     r7, #0x02               /* Flags select 1, 2 or 3 byte case */
    ldrb    r4, [r0], #0x01         /* Fetch 1st byte */
    ldrbge  r5, [r0], #0x01         /* Fetch 2nd byte */
    movlt   r5, #0x00
    ldrbgt  r6, [r0], #0x01         /* Fetch 3rd byte */
    movle   r6, #0x00

    /*
     * Combine the three bytes depending on endianness and alignment.
     * The flags are still those of "cmp r7, #0x02": eq means the first
     * byte sits on an even address, ne means it sits on an odd one.
     */
#ifdef __ARMEB__
    orreq   r2, r5, r4, lsl #8
    orreq   r2, r2, r6, lsl #24
    orrne   r2, r4, r5, lsl #8
    orrne   r2, r2, r6, lsl #16
#else
    orreq   r2, r4, r5, lsl #8
    orreq   r2, r2, r6, lsl #16
    orrne   r2, r5, r4, lsl #8
    orrne   r2, r2, r6, lsl #24
#endif
    subs    r1, r1, r7              /* Update length */
    RETeq                           /* All done? */

    /* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef _ARM_ARCH_5E
    cmp     r1, #0x04               /* Less than 4 bytes left? */
    blt     .Lcksumdata_endgame     /* Yup */

    /*
     * Now quad-align, if necessary.  r7 ends up holding either the
     * word consumed to reach 8-byte alignment or zero; it is summed
     * later, in the loop or at .Lcksumdata_bigloop_end.
     */
    ands    r7, r0, #0x04
    ldrne   r7, [r0], #0x04
    subne   r1, r1, #0x04
    subs    r1, r1, #0x40
    blt     .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */

    /*
     * Buffer is now quad aligned. Sum 64 bytes at a time.
     * Note: First ldrd is hoisted above the loop, together with
     * setting r6 to zero to avoid stalling for results in the
     * loop. (r7 is live, from above).
     */
    ldrd    r4, [r0], #0x08
    mov     r6, #0x00
.Lcksumdata_bigloop:
    pld     [r0, #0x18]
    adds    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    pld     [r0, #0x18]
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adc     r2, r2, #0x00           /* Fold carry; flags left untouched */
    subs    r1, r1, #0x40
    ldrdge  r4, [r0], #0x08         /* Pre-load for the next iteration */
    bge     .Lcksumdata_bigloop

    adds    r2, r2, r6              /* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
    adcs    r2, r2, r7
    adc     r2, r2, #0x00

#else    /* !_ARM_ARCH_5E */

    subs    r1, r1, #0x40
    blt     .Lcksumdata_bigloop_end

    /* Sum 64 bytes per iteration, four words per ldmia. */
.Lcksumdata_bigloop:
    ldmia   r0!, {r3, r4, r5, r6}
    adds    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r6}
    adcs    r2, r2, r7
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r7
    adc     r2, r2, #0x00
    subs    r1, r1, #0x40
    bge     .Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

    adds    r1, r1, #0x40           /* Undo the bias: r1 = bytes left */
    RETeq
    cmp     r1, #0x20

#ifdef _ARM_ARCH_5E
    ldrdge  r4, [r0], #0x08         /* Avoid stalling pld and result */
    blt     .Lcksumdata_less_than_32
    pld     [r0, #0x18]
    ldrd    r6, [r0], #0x08
    adds    r2, r2, r4
    adcs    r2, r2, r5
    ldrd    r4, [r0], #0x08
    adcs    r2, r2, r6
    adcs    r2, r2, r7
    ldrd    r6, [r0], #0x08
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r6              /* XXX: Unavoidable result stall */
    adcs    r2, r2, r7
#else
    blt     .Lcksumdata_less_than_32
    ldmia   r0!, {r3, r4, r5, r6}
    adds    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    ldmia   r0!, {r3, r4, r5, r7}
    adcs    r2, r2, r6
    adcs    r2, r2, r3
    adcs    r2, r2, r4
    adcs    r2, r2, r5
    adcs    r2, r2, r7
#endif
    adc     r2, r2, #0x00
    subs    r1, r1, #0x20
    RETeq

.Lcksumdata_less_than_32:
    /*
     * There are less than 32 bytes left.
     * r3 = number of bytes in whole 8-byte groups (0, 8, 16 or 24).
     * r4 = (0x18 - r3) * 1.5 = 0, 12, 24 or 36: the byte offset of the
     * first 12-byte block below that must run.
     */
    and     r3, r1, #0x18
    rsb     r4, r3, #0x18
    sub     r1, r1, r3
    adds    r4, r4, r4, lsr #1      /* Side effect: Clear carry flag */
    addne   pc, pc, r4              /* r4 == 0 falls through the nop */
    nop

/*
 * Note: We use ldm here, even on armv5e, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
    /* At least 24 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* At least 16 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* At least 8 bytes remaining... */
    ldmia   r0!, {r4, r5}
    adcs    r2, r2, r4
    adcs    r2, r2, r5

    /* Less than 8 bytes remaining... */
    adc     r2, r2, #0x00
    subs    r1, r1, #0x04
    blt     .Lcksumdata_lessthan4

    ldr     r4, [r0], #0x04
    sub     r1, r1, #0x04
    adds    r2, r2, r4
    adc     r2, r2, #0x00

    /* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
    adds    r1, r1, #0x04           /* Undo the bias: r1 = bytes left */
    RETeq

    /* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
    ldrb    r3, [r0]                /* Fetch first byte */
    cmp     r1, #0x02
    ldrbge  r4, [r0, #0x01]         /* Fetch 2nd and 3rd as necessary */
    movlt   r4, #0x00
    ldrbgt  r5, [r0, #0x02]
    movle   r5, #0x00
    /* Combine the three bytes depending on endianness and alignment */
    tst     r0, #0x01               /* eq: first byte on an even address */
#ifdef __ARMEB__
    orreq   r3, r4, r3, lsl #8
    orreq   r3, r3, r5, lsl #24
    orrne   r3, r3, r4, lsl #8
    orrne   r3, r3, r5, lsl #16
#else
    orreq   r3, r3, r4, lsl #8
    orreq   r3, r3, r5, lsl #16
    orrne   r3, r4, r3, lsl #8
    orrne   r3, r3, r5, lsl #24
#endif
    adds    r2, r2, r3
    adc     r2, r2, #0x00
    RET
END(L_cksumdata)
