/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Tuning constants. All block counts are in units of PLDSIZE (64) bytes.
 *
 *   PLDOFFS   - fixed look-ahead distance, in blocks, used for the pld
 *               offsets and for the look-ahead pointer on the prime-pump path.
 *   PLDTHRESH - copies of at most this many blocks use the plain loop that
 *               issues no look-ahead accesses.
 *   BBTHRESH  - copies of at most this many blocks (4096 bytes) skip the
 *               computed look-ahead distance and use the fixed one.
 *   PLDSIZE   - bytes per block.
 */
#define PLDOFFS	(10)
#define PLDTHRESH (PLDOFFS)
#define BBTHRESH (4096/64)
#define PLDSIZE (64)

#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .syntax unified
        .fpu neon

        // To avoid warning about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
        .arch armv7-a

/*
 * .L_memcpy_base -- NEON copy body.
 *
 * This fragment has no prologue of its own: it starts at a local label,
 * uses lr as a scratch register and finishes with "pop {r0, pc}".
 * NOTE(review): presumably it is included into a function that has already
 * pushed {r0, lr}; confirm against the including file. The .cfi_* directives
 * likewise need an enclosing CFI region that is not opened here.
 *
 * Register roles, as used below:
 *   r0       destination pointer, advanced as data is stored
 *   r1       source pointer, advanced as data is loaded
 *   r2       byte count
 *   r3       scratch: shifted copies of r2, discarded look-ahead loads,
 *            and the halfword/byte moved by the tail code
 *   r12      number of 64-byte blocks still to copy in the current loop
 *   lr       look-ahead distance in bytes; lr >> 6 blocks are left for the
 *            final no-look-ahead loop
 *   r9       iteration count of the double-pld loop (saved and restored)
 *   r10      64-byte-aligned look-ahead address (saved and restored)
 *   q0-q3, q8-q11  data being copied
 *
 * The size comparisons on r2 and r12 use signed conditions (blt/ble).
 */
.L_memcpy_base:
        // Dispatch on the byte count: each branch enters the tail ladder at
        // the first step that can apply to a count below the tested limit.
        cmp         r2, #4
        blt         .L_neon_lt4
        cmp         r2, #16
        blt         .L_neon_lt16
        cmp         r2, #32
        blt         .L_neon_16          // N is set here, so bpl at the target falls through
        cmp         r2, #64
        blt         .L_neon_copy_32_a

        // r12 = number of whole 64-byte blocks (at least 1 here).
        mov         r12, r2, lsr #6
        cmp         r12, #PLDTHRESH
        ble         .L_neon_copy_64_loop_nopld

        // The look-ahead paths need two more registers.
        push        {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp         r12, #BBTHRESH
        ble         .L_neon_prime_pump

        // lr = ((r0 + 0x400 - (r1 + PLDOFFS*PLDSIZE)) & 0x7FF) + PLDOFFS*PLDSIZE
        // The lsl/lsr pair keeps only the low 11 bits of the difference.
        add         lr, r0, #0x400
        add         r9, r1, #(PLDOFFS*PLDSIZE)
        sub         lr, lr, r9
        lsl         lr, lr, #21
        lsr         lr, lr, #21
        add         lr, lr, #(PLDOFFS*PLDSIZE)
        // Fall back to the fixed distance unless the copy is longer than lr.
        cmp         r12, lr, lsr #6
        ble         .L_neon_prime_pump

        // r9 = (lr >> 6) - PLDOFFS, with flags; fall back if it is not positive.
        // The gt condition always holds here because the ble above was not taken.
        itt         gt
        movgt       r9, #(PLDOFFS)
        rsbsgt      r9, r9, lr, lsr #6
        ble         .L_neon_prime_pump

        // r10 = look-ahead address: source + lr, rounded down to 64 bytes.
        add         r10, r1, lr
        bic         r10, #0x3F

        // Hold back lr >> 6 blocks for the final no-look-ahead loop.
        sub         r12, r12, lr, lsr #6

        // Split the remaining blocks between the double-pld loop (r9) and the
        // single look-ahead loops (r12):
        //   r9 <= r12 : r12 -= r9
        //   r9 >  r12 : r9 = r12, r12 = 0
        cmp         r9, r12
        itee        le
        suble       r12, r12, r9
        movgt       r9, r12
        movgt       r12, #0

        pld         [r1, #((PLDOFFS-1)*PLDSIZE)]
.L_neon_copy_64_loop_outer_doublepld:
        // Per iteration: pld at the fixed offset, copy one 64-byte block, and
        // touch the far look-ahead address with a load whose result is unused.
        pld         [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]
        subs        r9, r9, #1          // NEON ops and add below leave these flags for bne
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer_doublepld
        cmp         r12, #0
        beq         .L_neon_pop_before_nopld

        // Fewer than 512 KiB worth of blocks left: use the ldr-based loop.
        cmp         r12, #(512*1024/64)
        blt         .L_neon_copy_64_loop_outer

.L_neon_copy_64_loop_ddr:
        // Same as the loop below, but the look-ahead access is a pld.
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        pld         [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_ddr
        b           .L_neon_pop_before_nopld

.L_neon_prime_pump:
        // Fixed look-ahead of PLDOFFS blocks. r12 > PLDTHRESH >= PLDOFFS on
        // every path that reaches here, so r12 stays positive after the sub.
        mov         lr, #(PLDOFFS*PLDSIZE)
        add         r10, r1, #(PLDOFFS*PLDSIZE)
        bic         r10, #0x3F
        sub         r12, r12, #PLDOFFS
        ldr         r3, [r10, #(-1*PLDSIZE)]    // touch the block before r10; value unused

.L_neon_copy_64_loop_outer:
        // Copy one 64-byte block per iteration while loading (and discarding)
        // a word from the look-ahead address.
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
        // The blocks held back earlier (lr >> 6) are copied by the plain loop.
        mov         r12, lr, lsr #6
        pop         {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

.L_neon_copy_64_loop_nopld:
        // Plain 64-byte block copy, r12 iterations, no look-ahead accesses.
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .L_neon_copy_64_loop_nopld
        ands        r2, r2, #0x3f       // bytes left after the whole blocks
        beq         .L_neon_exit

.L_neon_copy_32_a:
        // lsl #27 puts bit 5 of r2 in C and bit 4 in N.
        movs        r3, r2, lsl #27
        bcc         .L_neon_16          // bit 5 clear: no 32-byte chunk
        vld1.32     {q0,q1}, [r1]!
        vst1.32     {q0,q1}, [r0]!

.L_neon_16:
        // N still reflects bit 4 of r2 (or the cmp r2, #32 at entry); the
        // NEON load/store above does not alter the flags.
        bpl         .L_neon_lt16        // bit 4 clear: no 16-byte chunk
        vld1.32     {q8}, [r1]!
        vst1.32     {q8}, [r0]!
        ands        r2, r2, #0x0f
        beq         .L_neon_exit

.L_neon_lt16:
        // lsl #29 puts bit 3 of r2 in C and bit 2 in N.
        movs        r3, r2, lsl #29
        bcc         1f                  // bit 3 clear: no 8-byte chunk
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:
        // NOTE(review): bge tests N == V and the movs above does not write V,
        // so this skips the 4-byte copy on bit 2 clear only while V is clear.
        bge         .L_neon_lt4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
        // lsl #31 puts bit 1 of r2 in C and bit 0 in N.
        movs        r2, r2, lsl #31
        itt         cs                  // bit 1 set: copy a halfword
        ldrhcs      r3, [r1], #2
        strhcs      r3, [r0], #2
        itt         mi                  // bit 0 set: copy the last byte
        ldrbmi      r3, [r1]
        strbmi      r3, [r0]

.L_neon_exit:
        // Pops two words: the first into r0, the second into pc.
        pop         {r0, pc}