/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif
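
/* Note: CACHE_LINE_SIZE is only used to scale the pld (preload) hints
 * below; the main loops prefetch two cache lines ahead of the current
 * read position on each stream. */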

/*
 * Optimized memcmp() for Cortex-A9.
 */
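
/* Register mapping on entry (AAPCS): r0 = lhs, r1 = rhs, r2 = count;
 * the result is returned in r0. The contract is the usual one; as a
 * reference sketch in C (illustrative only):
 *
 *   int memcmp(const void *lhs, const void *rhs, size_t n) {
 *       const unsigned char *a = lhs, *b = rhs;
 *       for (size_t i = 0; i < n; i++)
 *           if (a[i] != b[i])
 *               return a[i] - b[i];
 *       return 0;
 *   }
 */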

.syntax unified

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
/*
 * NEON optimization
 * Comparing 32 bytes at a time
 */
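/* Mismatch detection trick used below: a bytewise subtract of the two
 * 16-byte blocks is zero in every lane iff the blocks are equal, so
 * OR-merging all lanes down to one doubleword and testing it in core
 * registers flags any difference. Roughly, in C terms (illustrative):
 *
 *   d = (a0 - b0) | (a1 - b1);   // per-byte, across all 32 bytes
 *   if (d != 0) some byte differs
 */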
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
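        /* e.g. if r4 ends in binary ...01 (say address 0x1001), then
         * -0x1001 & 3 = 3, so three bytes are compared one at a time
         * before the word loop takes over (worked example). */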
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
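        /* The loop keeps the rhs stream one word ahead: the word being
         * compared was fetched on the previous step (alternating between
         * ip and lr), which is why 4 extra bytes are reserved above.
         * Per step, roughly (illustrative sketch):
         *
         *   next = *++rhs;                     // read one word ahead
         *   if (*lhs++ != prev) goto mismatch;
         *   prev = next;
         */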

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

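        /* Accounting note: r2 was biased by -(32+4) before the loop (32
         * for the block width, 4 for the word of read-ahead). Adding
         * (32 - 4 + 4) = 32 back leaves r2 biased by -4, which the
         * 4-byte loop below relies on; the bias is removed again at 4:
         * (illustrative accounting). */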
        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, pc}

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */
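
        /* Each word of the rhs stream is rebuilt from two aligned loads:
         * the top half of the previous word joined with the bottom half
         * of the next, i.e. roughly (illustrative, little-endian):
         *
         *   word = (prev >> 16) | (next << 16);
         */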

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount
        // r6 = left shift amount
        // r7 = scratch (read-ahead word from the rhs stream)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */
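
        /* Words from the unaligned rhs stream are rebuilt with the shift
         * pair, i.e. roughly (illustrative, little-endian):
         *
         *   word = (prev >> r5) | (next << r6);   // r5 + r6 == 32
         */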

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)