 .syntax unified
@ Copyright (c) 2007-2008 CSIRO
@ Copyright (c) 2007-2009 Xiph.Org Foundation
@ Copyright (c) 2013 Parrot
@ Written by Aurélien Zanelli
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:
@
@ - Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@
@ - Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  .text; .p2align 2; .arch armv7-a
  .fpu neon
  .object_arch armv4t

  .include "celt/arm/armopts_gnu.s"

 .if OPUS_ARM_MAY_HAVE_EDSP
  .global celt_pitch_xcorr_edsp
 .endif

 .if OPUS_ARM_MAY_HAVE_NEON
  .global celt_pitch_xcorr_neon
 .endif

 .if OPUS_ARM_MAY_HAVE_NEON

@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
 .type xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
xcorr_kernel_neon_start:
  @ input:
  @ r3 = int len
  @ r4 = opus_val16 *x
  @ r5 = opus_val16 *y
  @ q0 = opus_val32 sum[4]
  @ output:
  @ q0 = opus_val32 sum[4]
  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  @ internal usage:
  @ r12 = int j
  @ d3 = y_3|y_2|y_1|y_0
  @ q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  @ q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  @ q8 = scratch
  @
  @ Load y[0...3]
  @ This requires len>0 to always be valid (which we assert in the C code).
  VLD1.16      {d5}, [r5]!
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
@ Process 8 samples at a time.
@ This loop loads one y value more than we actually need. Therefore we have to
@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
@ reading past the end of the array.
xcorr_kernel_neon_process8:
  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
  @ - 2 cycles of ARM instructions,
  @ - 10 cycles of load/store/byte permute instructions, and
  @ - 9 cycles of data processing instructions.
  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  @ latter two categories, meaning the whole loop should run in 10 cycles per
  @ iteration, barring cache misses.
  @
  @ Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  VAND         d3, d5, d5
  SUBS         r12, r12, #8
  @ Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
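  @ At this point d3 = y_3|y_2|y_1|y_0, q2 = y_B|...|y_4 and q3 = x_7|...|x_0.
  @ Each VMLAL.S16 below accumulates sum[k] += x[j]*y[j+k] for one j and all
  @ four k at once: the shifted y window for lane j is built with VEXT.16 from
  @ {d3,d4} or {d4,d5}, and the multiply-accumulates (data processing) can
  @ dual-issue with those byte permutes per the cycle breakdown above.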
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
@ Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4:
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  @ Load x[0...3]
  VLD1.16      d6, [r4]!
  @ Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #4
  @ Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
@ Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2:
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  @ Load x[0...1]
  VLD2.16      {d6[],d7[]}, [r4]!
  @ Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  @ Load y[4...5]
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  @ instead of VEXT, since it's a data-processing instruction.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
@ Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1:
  @ Load next *x
  VLD1.16      {d6[]}, [r4]!
  ADDS         r12, r12, #1
  @ y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  MOVLE        pc, lr
@ Now process 1 last sample, not reading ahead.
  @ Load last *y
  VLD1.16      {d4[]}, [r5]!
  VSRI.64      d4, d5, #16
  @ Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
 .size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP

@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
@  opus_val32 *xcorr, int len, int max_pitch, int arch)
 .type celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
  @ input:
  @ r0 = opus_val16 *_x
  @ r1 = opus_val16 *_y
  @ r2 = opus_val32 *xcorr
  @ r3 = int len
  @ output:
  @ r0 = int maxcorr
  @ internal usage:
  @ r4 = opus_val16 *x (for xcorr_kernel_neon())
  @ r5 = opus_val16 *y (for xcorr_kernel_neon())
  @ r6 = int max_pitch
  @ r12 = int j
  @ q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  @ ignored:
  @ int arch
  STMFD        sp!, {r4-r6, lr}
  LDR          r6, [sp, #16]
  VMOV.S32     q15, #1
  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4:
  @ xcorr_kernel_neon parameters:
  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
  @ So we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  SUBS         r6, r6, #4
  VST1.32      {q0}, [r2]!
  @ _y += 4
  ADD          r1, r1, #8
  VMAX.S32     q15, q15, q0
  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  BGE celt_pitch_xcorr_neon_process4
@ We have less than 4 sums left to compute.
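@ For reference, the remaining-sums tail below computes roughly the following
@ (a C-level sketch only; the sum index i is implicit in the _y pointer kept
@ in r1, and maxcorr lives in q15/d30):
@   for (; i < max_pitch; i++) {
@     opus_val32 sum = 0;
@     for (j = 0; j < len; j++)
@       sum += x[j]*y[i+j];
@     xcorr[i] = sum;
@     maxcorr = max(maxcorr, sum);
@   }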
celt_pitch_xcorr_neon_process4_done:
  ADDS         r6, r6, #4
  @ Reduce maxcorr to a single value
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
@ Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining:
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
@ Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8:
  @ Load x[0...7]
  VLD1.16      {q1}, [r4]!
  @ Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
@ Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4:
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  @ Load x[0...3]
  VLD1.16      {d2}, [r4]!
  @ Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done:
  @ Reduce the sum to a single value.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
@ Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1:
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done:
  VST1.32      {d0[0]}, [r2]!
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  @ _y++
  ADD          r1, r1, #2
  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done:
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
 .size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon @ ENDP

 .endif

 .if OPUS_ARM_MAY_HAVE_EDSP

@ This will get used on ARMv7 devices without NEON, so it has been optimized
@ to take advantage of dual-issuing where possible.
 .type xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
xcorr_kernel_edsp_start:
  @ input:
  @ r3 = int len
  @ r4 = opus_val16 *_x (must be 32-bit aligned)
  @ r5 = opus_val16 *_y (must be 32-bit aligned)
  @ r6...r9 = opus_val32 sum[4]
  @ output:
  @ r6...r9 = opus_val32 sum[4]
  @ preserved: r0-r5
  @ internal usage:
  @ r2 = int j
  @ r12,r14 = opus_val16 x[4]
  @ r10,r11 = opus_val16 y[4]
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      @ Load y[0...1]
  SUBS         r2, r3, #4         @ j = len-4
  LDR          r11, [r5], #4      @ Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      @ Load x[0...1]
  @ Stall
xcorr_kernel_edsp_process4:
  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
  @ other. Every other instruction here dual-issues with a multiply, and is
  @ thus "free". There should be no stalls in the body of the loop.
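  @ On entry to each iteration r12 = x_1|x_0 (x_0 in the bottom halfword),
  @ r14 = x_3|x_2 (loaded below), and r10,r11 = y_3|y_2|y_1|y_0. The B/T
  @ suffixes pick the bottom or top halfword of each operand, so the MACs
  @ below accumulate sum[k] += x[j]*y[j+k] for j = 0...3 and k = 0...3 while
  @ the interleaved LDRs fetch the next x and y words.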
  SMLABB       r6, r12, r10, r6     @ sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4        @ Load x[2...3]
  SMLABT       r7, r12, r10, r7     @ sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4           @ j-=4
  SMLABB       r8, r12, r11, r8     @ sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9     @ sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6     @ sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4        @ Load y[4...5]
  SMLATB       r7, r12, r11, r7     @ sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8     @ sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9     @ sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4        @ Load x[0...1]
  SMLABB       r6, r14, r11, r6     @ sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7     @ sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8     @ sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9     @ sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6     @ sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4        @ Load y[6...7]
  SMLATB       r7, r14, r10, r7     @ sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8     @ sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9     @ sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done:
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2        @ r12 = *x++
  SUBS         r2, r2, #1           @ j--
  @ Stall
  SMLABB       r6, r12, r10, r6     @ sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2        @ r14 = *x++
  SMLABT       r7, r12, r10, r7     @ sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8     @ sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9     @ sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6     @ sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1           @ j--
  SMLABB       r7, r14, r11, r7     @ sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2        @ r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8     @ sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2        @ r12 = *x++
  SMLABB       r9, r14, r10, r9     @ sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6     @ sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1               @ j--
  SMLABT       r7, r12, r11, r7     @ sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2         @ r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8     @ sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]            @ r14 = *x
  SMLABB       r9, r12, r2, r9      @ sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6     @ sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]            @ r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7     @ sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8      @ sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9     @ sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done:
  LDMFD        sp!, {r2,r4,r5,pc}
 .size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP

 .type celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
  @ input:
  @ r0 = opus_val16 *_x (must be 32-bit aligned)
  @ r1 = opus_val16 *_y (only needs to be 16-bit aligned)
  @ r2 = opus_val32 *xcorr
  @ r3 = int len
  @ output:
  @ r0 = maxcorr
  @ internal usage:
  @ r4 = opus_val16 *x
  @ r5 = opus_val16 *y
  @ r6 = opus_val32 sum0
  @ r7 = opus_val32 sum1
  @ r8 = opus_val32 sum2
  @ r9 = opus_val32 sum3
  @ r1 = int max_pitch
  @ r12 = int j
  @ ignored:
  @ int arch
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  LDR          r1, [sp, #36]
  MOV          r4, r0
  TST          r5, #3
  @ maxcorr = 1
  MOV          r0, #1
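  @ If _y is already 32-bit aligned we can skip the scalar prologue below:
  @ if ((_y & 3) == 0) goto celt_pitch_xcorr_edsp_process1u_done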
  BEQ celt_pitch_xcorr_edsp_process1u_done
@ Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  @ r14 = sum = 0
  MOV          r14, #0
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4:
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         @ j-=4
  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done:
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1:
  LDRHGE       r6, [r4], #2
  @ Stall
  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  @ Restore _x
  SUB          r4, r4, r3, LSL #1
  @ Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  @ maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  SUBS         r1, r1, #1
  @ xcorr[i] = sum
  STR          r14, [r2], #4
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done:
  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4:
  @ xcorr_kernel_edsp parameters:
  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp_start        @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  @ _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2:
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  @ {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4:
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         @ j-=4
  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done:
  ADDS         r12, r12, #2
  BLE celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  @ Stall
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1:
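  @ Process the final 1 or 2 samples for the two remaining sums (the second
  @ x value is only loaded and used when two samples are left).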
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  @ Stall
  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done:
  @ Restore _x
  SUB          r4, r4, r3, LSL #1
  @ Restore and advance _y
  SUB          r5, r5, r3, LSL #1
  @ maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  SUB          r1, r1, #2
  @ maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  @ xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a:
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  @ r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4:
  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         @ j-=4
  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT      r14, r7, r9, r14      @ sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done:
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  @ Stall
  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  @ Stall
  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
  @ maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  @ xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done:
  LDMFD        sp!, {r4-r11, pc}
 .size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp @ ENDP

 .endif

@ END:
 .section .note.GNU-stack,"",%progbits