    .syntax unified
@ Copyright (c) 2007-2008 CSIRO
@ Copyright (c) 2007-2009 Xiph.Org Foundation
@ Copyright (c) 2013 Parrot
@ Written by Aurélien Zanelli
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:
@
@ - Redistributions of source code must retain the above copyright
@ notice, this list of conditions and the following disclaimer.
@
@ - Redistributions in binary form must reproduce the above copyright
@ notice, this list of conditions and the following disclaimer in the
@ documentation and/or other materials provided with the distribution.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    .text; .p2align 2; .arch armv7-a
    .fpu neon
    .object_arch armv4t

    .include "celt/arm/armopts_gnu.s"

    .if OPUS_ARM_MAY_HAVE_EDSP
    .global celt_pitch_xcorr_edsp
    .endif

    .if OPUS_ARM_MAY_HAVE_NEON
    .global celt_pitch_xcorr_neon
    .endif

    .if OPUS_ARM_MAY_HAVE_NEON

@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
; xcorr_kernel_neon: @ PROC
xcorr_kernel_neon_start:
    @ input:
    @   r3 = int            len
    @   r4 = opus_val16    *x
    @   r5 = opus_val16    *y
    @   q0 = opus_val32     sum[4]
    @ output:
    @   q0 = opus_val32     sum[4]
    @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
    @ internal usage:
    @   r12 = int           j
    @   d3  = y_3|y_2|y_1|y_0
    @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
    @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
    @   q8  = scratch
    @
    @ Load y[0...3]
    @ This requires len>0 to always be valid (which we assert in the C code).
    VLD1.16      {d5}, [r5]!
    SUBS         r12, r3, #8
    BLE xcorr_kernel_neon_process4
@ Process 8 samples at a time.
@ This loop loads one y value more than we actually need.  Therefore we have to
@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
@ reading past the end of the array.
xcorr_kernel_neon_process8:
    @ This loop has 19 total instructions (10 cycles to issue, minimum), with
    @ - 2 cycles of ARM instructions,
    @ - 10 cycles of load/store/byte permute instructions, and
    @ - 9 cycles of data processing instructions.
    @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
    @ latter two categories, meaning the whole loop should run in 10 cycles per
    @ iteration, barring cache misses.
    @
    @ Load x[0...7]
    VLD1.16      {d6, d7}, [r4]!
    @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
    @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
    VAND         d3, d5, d5
    SUBS         r12, r12, #8
    @ Load y[4...11]
    VLD1.16      {d4, d5}, [r5]!
    VMLAL.S16    q0, d3, d6[0]
    VEXT.16      d16, d3, d4, #1
    VMLAL.S16    q0, d4, d7[0]
    VEXT.16      d17, d4, d5, #1
    VMLAL.S16    q0, d16, d6[1]
    VEXT.16      d16, d3, d4, #2
    VMLAL.S16    q0, d17, d7[1]
    VEXT.16      d17, d4, d5, #2
    VMLAL.S16    q0, d16, d6[2]
    VEXT.16      d16, d3, d4, #3
    VMLAL.S16    q0, d17, d7[2]
    VEXT.16      d17, d4, d5, #3
    VMLAL.S16    q0, d16, d6[3]
    VMLAL.S16    q0, d17, d7[3]
    BGT xcorr_kernel_neon_process8
@ Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4:
    ADDS         r12, r12, #4
    BLE xcorr_kernel_neon_process2
    @ Load x[0...3]
    VLD1.16      d6, [r4]!
    @ Use VAND since it's a data processing instruction again.
    VAND         d4, d5, d5
    SUB          r12, r12, #4
    @ Load y[4...7]
    VLD1.16      d5, [r5]!
    VMLAL.S16    q0, d4, d6[0]
    VEXT.16      d16, d4, d5, #1
    VMLAL.S16    q0, d16, d6[1]
    VEXT.16      d16, d4, d5, #2
    VMLAL.S16    q0, d16, d6[2]
    VEXT.16      d16, d4, d5, #3
    VMLAL.S16    q0, d16, d6[3]
@ Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2:
    ADDS         r12, r12, #2
    BLE xcorr_kernel_neon_process1
    @ Load x[0...1]
    VLD2.16      {d6[],d7[]}, [r4]!
    @ Use VAND since it's a data processing instruction again.
    VAND         d4, d5, d5
    SUB          r12, r12, #2
    @ Load y[4...5]
    VLD1.32      {d5[]}, [r5]!
    VMLAL.S16    q0, d4, d6
    VEXT.16      d16, d4, d5, #1
    @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
    @ instead of VEXT, since it's a data-processing instruction.
    VSRI.64      d5, d4, #32
    VMLAL.S16    q0, d16, d7
@ Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1:
    @ Load next *x
    VLD1.16      {d6[]}, [r4]!
    ADDS         r12, r12, #1
    @ y[0...3] are left in d5 from prior iteration(s) (if any)
    VMLAL.S16    q0, d5, d6
    MOVLE        pc, lr
@ Now process 1 last sample, not reading ahead.
    @ Load last *y
    VLD1.16      {d4[]}, [r5]!
    VSRI.64      d4, d5, #16
    @ Load last *x
    VLD1.16      {d6[]}, [r4]!
    VMLAL.S16    q0, d4, d6
    MOV          pc, lr
    .size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP

@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
@  opus_val32 *xcorr, int len, int max_pitch)
; celt_pitch_xcorr_neon: @ PROC
    @ input:
    @   r0 = opus_val16 *_x
    @   r1 = opus_val16 *_y
    @   r2 = opus_val32 *xcorr
    @   r3 = int         len
    @ output:
    @   r0 = int         maxcorr
    @ internal usage:
    @   r4  = opus_val16 *x (for xcorr_kernel_neon())
    @   r5  = opus_val16 *y (for xcorr_kernel_neon())
    @   r6  = int         max_pitch
    @   r12 = int         j
    @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
    STMFD        sp!, {r4-r6, lr}
    LDR          r6, [sp, #16]
    VMOV.S32     q15, #1
    @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
    SUBS         r6, r6, #4
    BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4:
    @ xcorr_kernel_neon parameters:
    @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
    MOV          r4, r0
    MOV          r5, r1
    VEOR         q0, q0, q0
    @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
    @ So we don't save/restore any other registers.
    BL xcorr_kernel_neon_start
    SUBS         r6, r6, #4
    VST1.32      {q0}, [r2]!
    @ _y += 4
    ADD          r1, r1, #8
    VMAX.S32     q15, q15, q0
    @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
    BGE celt_pitch_xcorr_neon_process4
@ We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done:
    ADDS         r6, r6, #4
    @ Reduce maxcorr to a single value
    VMAX.S32     d30, d30, d31
    VPMAX.S32    d30, d30, d30
    @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
    BLE celt_pitch_xcorr_neon_done
@ Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining:
    MOV          r4, r0
    MOV          r5, r1
    VMOV.I32     q0, #0
    SUBS         r12, r3, #8
    BLT celt_pitch_xcorr_neon_process_remaining4
@ Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8:
    @ Load x[0...7]
    VLD1.16      {q1}, [r4]!
    @ Load y[0...7]
    VLD1.16      {q2}, [r5]!
    SUBS         r12, r12, #8
    VMLAL.S16    q0, d4, d2
    VMLAL.S16    q0, d5, d3
    BGE celt_pitch_xcorr_neon_process_remaining_loop8
@ Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4:
    ADDS         r12, r12, #4
    BLT celt_pitch_xcorr_neon_process_remaining4_done
    @ Load x[0...3]
    VLD1.16      {d2}, [r4]!
    @ Load y[0...3]
    VLD1.16      {d3}, [r5]!
    SUB          r12, r12, #4
    VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done:
    @ Reduce the sum to a single value.
    VADD.S32     d0, d0, d1
    VPADDL.S32   d0, d0
    ADDS         r12, r12, #4
    BLE celt_pitch_xcorr_neon_process_remaining_loop_done
@ Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1:
    VLD1.16      {d2[]}, [r4]!
    VLD1.16      {d3[]}, [r5]!
    SUBS         r12, r12, #1
    VMLAL.S16    q0, d2, d3
    BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done:
    VST1.32      {d0[0]}, [r2]!
    VMAX.S32     d30, d30, d0
    SUBS         r6, r6, #1
    @ _y++
    ADD          r1, r1, #2
    @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
    BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done:
    VMOV.32      r0, d30[0]
    LDMFD        sp!, {r4-r6, pc}
    .size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon @ ENDP

    .endif

    .if OPUS_ARM_MAY_HAVE_EDSP

@ This will get used on ARMv7 devices without NEON, so it has been optimized
@ to take advantage of dual-issuing where possible.
; xcorr_kernel_edsp: @ PROC
xcorr_kernel_edsp_start:
    @ input:
    @   r3 = int            len
    @   r4 = opus_val16    *_x (must be 32-bit aligned)
    @   r5 = opus_val16    *_y (must be 32-bit aligned)
    @   r6...r9 = opus_val32 sum[4]
    @ output:
    @   r6...r9 = opus_val32 sum[4]
    @ preserved: r0-r5
    @ internal usage
    @   r2 = int            j
    @   r12,r14 = opus_val16 x[4]
    @   r10,r11 = opus_val16 y[4]
    STMFD        sp!, {r2,r4,r5,lr}
    LDR          r10, [r5], #4      @ Load y[0...1]
    SUBS         r2, r3, #4         @ j = len-4
    LDR          r11, [r5], #4      @ Load y[2...3]
    BLE xcorr_kernel_edsp_process4_done
    LDR          r12, [r4], #4      @ Load x[0...1]
    @ Stall
xcorr_kernel_edsp_process4:
    @ The multiplies must issue from pipeline 0, and can't dual-issue with each
    @ other. Every other instruction here dual-issues with a multiply, and is
    @ thus "free". There should be no stalls in the body of the loop.
    SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
    LDR          r14, [r4], #4      @ Load x[2...3]
    SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
    SUBS         r2, r2, #4         @ j-=4
    SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
    SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
    SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
    LDR          r10, [r5], #4      @ Load y[4...5]
    SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
    SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
    SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
    LDRGT        r12, [r4], #4      @ Load x[0...1]
    SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
    SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
    SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
    SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
    SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
    LDR          r11, [r5], #4      @ Load y[6...7]
    SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
    SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
    SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
    BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done:
    ADDS         r2, r2, #4
    BLE xcorr_kernel_edsp_done
    LDRH         r12, [r4], #2      @ r12 = *x++
    SUBS         r2, r2, #1         @ j--
    @ Stall
    SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
    LDRHGT       r14, [r4], #2      @ r14 = *x++
    SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
    SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
    SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
    BLE xcorr_kernel_edsp_done
    SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
    SUBS         r2, r2, #1         @ j--
    SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
    LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
    SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
    LDRHGT       r12, [r4], #2      @ r12 = *x++
    SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
    BLE xcorr_kernel_edsp_done
    SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
    CMP          r2, #1             @ j--
    SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
    LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
    SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
    LDRHGT       r14, [r4]          @ r14 = *x
    SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
    BLE xcorr_kernel_edsp_done
    SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
    LDRH         r11, [r5]          @ r11 = y_6 = *y
    SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
    SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
    SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done:
    LDMFD        sp!, {r2,r4,r5,pc}
    .size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP

; celt_pitch_xcorr_edsp: @ PROC
    @ input:
    @   r0 = opus_val16 *_x (must be 32-bit aligned)
    @   r1 = opus_val16 *_y (only needs to be 16-bit aligned)
    @   r2 = opus_val32 *xcorr
    @   r3 = int         len
    @ output:
    @   r0 = maxcorr
    @ internal usage
    @   r4 = opus_val16 *x
    @   r5 = opus_val16 *y
    @   r6 = opus_val32  sum0
    @   r7 = opus_val32  sum1
    @   r8 = opus_val32  sum2
    @   r9 = opus_val32  sum3
    @   r1 = int         max_pitch
    @   r12 = int        j
    STMFD        sp!, {r4-r11, lr}
    MOV          r5, r1
    LDR          r1, [sp, #36]
    MOV          r4, r0
    TST          r5, #3
    @ maxcorr = 1
    MOV          r0, #1
    BEQ celt_pitch_xcorr_edsp_process1u_done
@ Compute one sum at the start to make y 32-bit aligned.
    SUBS         r12, r3, #4
    @ r14 = sum = 0
    MOV          r14, #0
    LDRH         r8, [r5], #2
    BLE celt_pitch_xcorr_edsp_process1u_loop4_done
    LDR          r6, [r4], #4
    MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4:
    LDR          r9, [r5], #4
    SMLABT       r14, r6, r8, r14   @ sum = MAC16_16(sum, x_0, y_0)
    LDR          r7, [r4], #4
    SMLATB       r14, r6, r9, r14   @ sum = MAC16_16(sum, x_1, y_1)
    LDR          r8, [r5], #4
    SMLABT       r14, r7, r9, r14   @ sum = MAC16_16(sum, x_2, y_2)
    SUBS         r12, r12, #4       @ j-=4
    SMLATB       r14, r7, r8, r14   @ sum = MAC16_16(sum, x_3, y_3)
    LDRGT        r6, [r4], #4
    BGT celt_pitch_xcorr_edsp_process1u_loop4
    MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done:
    ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1:
    LDRHGE       r6, [r4], #2
    @ Stall
    SMLABBGE     r14, r6, r8, r14   @ sum = MAC16_16(sum, *x, *y)
    SUBSGE       r12, r12, #1
    LDRHGT       r8, [r5], #2
    BGT celt_pitch_xcorr_edsp_process1u_loop1
    @ Restore _x
    SUB          r4, r4, r3, LSL #1
    @ Restore and advance _y
    SUB          r5, r5, r3, LSL #1
    @ maxcorr = max(maxcorr, sum)
    CMP          r0, r14
    ADD          r5, r5, #2
    MOVLT        r0, r14
    SUBS         r1, r1, #1
    @ xcorr[i] = sum
    STR          r14, [r2], #4
    BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done:
    @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
    SUBS         r1, r1, #4
    BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4:
    @ xcorr_kernel_edsp parameters:
    @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
    MOV          r6, #0
    MOV          r7, #0
    MOV          r8, #0
    MOV          r9, #0
    BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
    @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
    CMP          r0, r6
    @ _y+=4
    ADD          r5, r5, #8
    MOVLT        r0, r6
    CMP          r0, r7
    MOVLT        r0, r7
    CMP          r0, r8
    MOVLT        r0, r8
    CMP          r0, r9
    MOVLT        r0, r9
    STMIA        r2!, {r6-r9}
    SUBS         r1, r1, #4
    BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2:
    ADDS         r1, r1, #2
    BLT celt_pitch_xcorr_edsp_process1a
    SUBS         r12, r3, #4
    @ {r10, r11} = {sum0, sum1} = {0, 0}
    MOV          r10, #0
    MOV          r11, #0
    LDR          r8, [r5], #4
    BLE celt_pitch_xcorr_edsp_process2_loop_done
    LDR          r6, [r4], #4
    LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4:
    SMLABB       r10, r6, r8, r10   @ sum0 = MAC16_16(sum0, x_0, y_0)
    LDR          r7, [r4], #4
    SMLABT       r11, r6, r8, r11   @ sum1 = MAC16_16(sum1, x_0, y_1)
    SUBS         r12, r12, #4       @ j-=4
    SMLATT       r10, r6, r8, r10   @ sum0 = MAC16_16(sum0, x_1, y_1)
    LDR          r8, [r5], #4
    SMLATB       r11, r6, r9, r11   @ sum1 = MAC16_16(sum1, x_1, y_2)
    LDRGT        r6, [r4], #4
    SMLABB       r10, r7, r9, r10   @ sum0 = MAC16_16(sum0, x_2, y_2)
    SMLABT       r11, r7, r9, r11   @ sum1 = MAC16_16(sum1, x_2, y_3)
    SMLATT       r10, r7, r9, r10   @ sum0 = MAC16_16(sum0, x_3, y_3)
    LDRGT        r9, [r5], #4
    SMLATB       r11, r7, r8, r11   @ sum1 = MAC16_16(sum1, x_3, y_4)
    BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done:
    ADDS         r12, r12, #2
    BLE celt_pitch_xcorr_edsp_process2_1
    LDR          r6, [r4], #4
    @ Stall
    SMLABB       r10, r6, r8, r10   @ sum0 = MAC16_16(sum0, x_0, y_0)
    LDR          r9, [r5], #4
    SMLABT       r11, r6, r8, r11   @ sum1 = MAC16_16(sum1, x_0, y_1)
    SUB          r12, r12, #2
    SMLATT       r10, r6, r8, r10   @ sum0 = MAC16_16(sum0, x_1, y_1)
    MOV          r8, r9
    SMLATB       r11, r6, r9, r11   @ sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1:
    LDRH         r6, [r4], #2
    ADDS         r12, r12, #1
    @ Stall
    SMLABB       r10, r6, r8, r10   @ sum0 = MAC16_16(sum0, x_0, y_0)
    LDRHGT       r7, [r4], #2
    SMLABT       r11, r6, r8, r11   @ sum1 = MAC16_16(sum1, x_0, y_1)
    BLE celt_pitch_xcorr_edsp_process2_done
    LDRH         r9, [r5], #2
    SMLABT       r10, r7, r8, r10   @ sum0 = MAC16_16(sum0, x_0, y_1)
    SMLABB       r11, r7, r9, r11   @ sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done:
    @ Restore _x
    SUB          r4, r4, r3, LSL #1
    @ Restore and advance _y
    SUB          r5, r5, r3, LSL #1
    @ maxcorr = max(maxcorr, sum0)
    CMP          r0, r10
    ADD          r5, r5, #2
    MOVLT        r0, r10
    SUB          r1, r1, #2
    @ maxcorr = max(maxcorr, sum1)
    CMP          r0, r11
    @ xcorr[i] = sum
    STR          r10, [r2], #4
    MOVLT        r0, r11
    STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a:
    ADDS         r1, r1, #1
    BLT celt_pitch_xcorr_edsp_done
    SUBS         r12, r3, #4
    @ r14 = sum = 0
    MOV          r14, #0
    BLT celt_pitch_xcorr_edsp_process1a_loop_done
    LDR          r6, [r4], #4
    LDR          r8, [r5], #4
    LDR          r7, [r4], #4
    LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4:
    SMLABB       r14, r6, r8, r14   @ sum = MAC16_16(sum, x_0, y_0)
    SUBS         r12, r12, #4       @ j-=4
    SMLATT       r14, r6, r8, r14   @ sum = MAC16_16(sum, x_1, y_1)
    LDRGE        r6, [r4], #4
    SMLABB       r14, r7, r9, r14   @ sum = MAC16_16(sum, x_2, y_2)
    LDRGE        r8, [r5], #4
    SMLATT       r14, r7, r9, r14   @ sum = MAC16_16(sum, x_3, y_3)
    LDRGE        r7, [r4], #4
    LDRGE        r9, [r5], #4
    BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done:
    ADDS         r12, r12, #2
    LDRGE        r6, [r4], #4
    LDRGE        r8, [r5], #4
    @ Stall
    SMLABBGE     r14, r6, r8, r14   @ sum = MAC16_16(sum, x_0, y_0)
    SUBGE        r12, r12, #2
    SMLATTGE     r14, r6, r8, r14   @ sum = MAC16_16(sum, x_1, y_1)
    ADDS         r12, r12, #1
    LDRHGE       r6, [r4], #2
    LDRHGE       r8, [r5], #2
    @ Stall
    SMLABBGE     r14, r6, r8, r14   @ sum = MAC16_16(sum, *x, *y)
    @ maxcorr = max(maxcorr, sum)
    CMP          r0, r14
    @ xcorr[i] = sum
    STR          r14, [r2], #4
    MOVLT        r0, r14
celt_pitch_xcorr_edsp_done:
    LDMFD        sp!, {r4-r11, pc}
    .size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp @ ENDP

    .endif

@ END:
    .section .note.GNU-stack,"",%progbits