/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

    .text

    .global __memcmp16
    .type __memcmp16, %function
    .align 4

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 half-words (32 bytes) at a time
 * (2) The loads are scheduled in a way they won't stall
 *
 * Syntax: GNU as, ARM mode, pre-UAL conditional mnemonics
 * (ldmnefd, eoreqs, ldreqh, subeqs).  PLD() is a macro expected to come
 * from <machine/cpu-features.h>.
 *
 * In:    r0 = first buffer   (accessed with ldrh, so 16-bit units)
 *        r1 = second buffer  (accessed with ldrh, so 16-bit units)
 *        r2 = number of 16-bit half-words to compare
 * Out:   r0 = 0 if r0 == r1 on entry, if r2 == 0, or if all compared
 *             half-words match; otherwise (first[i] - second[i]) for the
 *             first differing half-word, both zero-extended by ldrh.
 * Clobb: r1, r2, r3, ip, flags.  lr is used as a scratch register on the
 *        large-block path and is saved/restored around it together with
 *        r4 (r4 itself is never modified).
 * Stack: 8 bytes on the large-block path, none on the small-block path.
 *
 * NOTE(review): presumably callers treat this as
 *   int __memcmp16(const uint16_t*, const uint16_t*, size_t count)
 * -- the C prototype is not visible in this file; confirm against callers.
 */

__memcmp16:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where the length is zero or the buffers
         * are the same: (r0 == r1) || (r2 == 0)  ->  return 0
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

         /* make sure we have at least 12 half-words, this simplifies things
          * below and avoids some overhead for small blocks
          */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words): plain half-word loop,
         * no registers saved, returns straight through lr
         */
        PLD         (r0, #32)
        PLD         (r1, #32)

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         1b
        bx          lr              /* r0 == 0 here: everything matched */


        .save {r4, lr}
        /* save registers (lr becomes a scratch register below) */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary by comparing one
         * half-word if bit 1 of the address is set
         */
        tst         r3, #2
        beq         0f

        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* restore registers and return */
        ldmnefd     sp!, {r4, lr}
        bxne        lr



0:      /* here the first pointer is word aligned, and we have at least
         * 11 half-words left to process.
         */

        /* see if the pointers are congruent (same alignment modulo 4) */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         *
         * ip and lr alternate as the read-ahead word from the second
         * buffer; r1 is pre-incremented, so it always points at the most
         * recently loaded (not yet compared) word.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(16 + 2)
        bmi         1f

0:
        PLD         (r3, #64)
        PLD         (r1, #64)
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f              /* some word differed: re-examine it */
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * (undo the -(16+2) bias, leaving r2 = remaining - 2)
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one 32-bit word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? (r2 + 2 = half-words still to compare) */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last 2 half-words compared are different, redo them one
         * half-word at a time to compute the return value; both pointers
         * are 4 bytes past the start of the differing word here
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words, one at a time */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* align the unaligned pointer: round r1 down to a word boundary
         * and rebuild each word of the second buffer from the high half of
         * the previous aligned word and the low half of the next one.
         * The lsr #16 / lsl #16 splice is written for little-endian layout.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

        /* 8 half-words per iteration */
6:
        PLD         (r3, #64)
        PLD         (r1, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 ran one aligned word ahead; step back to the next
         * half-word that has not been compared yet
         */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* fix up the second pointer so that both pointers are 4 bytes past
         * the differing word, then reuse the common mismatch path
         */
        sub         r1, r1, #2
        b           2b
        .fnend
        .size __memcmp16, . - __memcmp16
240