; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 29 AREA |.text|, CODE, READONLY 30 31 GET celt/arm/armopts.s 32 33IF OPUS_ARM_MAY_HAVE_EDSP 34 EXPORT celt_pitch_xcorr_edsp 35ENDIF 36 37IF OPUS_ARM_MAY_HAVE_NEON 38 EXPORT celt_pitch_xcorr_neon 39ENDIF 40 41IF OPUS_ARM_MAY_HAVE_NEON 42 43; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 44xcorr_kernel_neon PROC 45xcorr_kernel_neon_start 46 ; input: 47 ; r3 = int len 48 ; r4 = opus_val16 *x 49 ; r5 = opus_val16 *y 50 ; q0 = opus_val32 sum[4] 51 ; output: 52 ; q0 = opus_val32 sum[4] 53 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 54 ; internal usage: 55 ; r12 = int j 56 ; d3 = y_3|y_2|y_1|y_0 57 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 58 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 59 ; q8 = scratch 60 ; 61 ; Load y[0...3] 62 ; This requires len>0 to always be valid (which we assert in the C code). 63 VLD1.16 {d5}, [r5]! 64 SUBS r12, r3, #8 65 BLE xcorr_kernel_neon_process4 66; Process 8 samples at a time. 67; This loop loads one y value more than we actually need. Therefore we have to 68; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid 69; reading past the end of the array. 70xcorr_kernel_neon_process8 71 ; This loop has 19 total instructions (10 cycles to issue, minimum), with 72 ; - 2 cycles of ARM insrtuctions, 73 ; - 10 cycles of load/store/byte permute instructions, and 74 ; - 9 cycles of data processing instructions. 75 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the 76 ; latter two categories, meaning the whole loop should run in 10 cycles per 77 ; iteration, barring cache misses. 78 ; 79 ; Load x[0...7] 80 VLD1.16 {d6, d7}, [r4]! 81 ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get 82 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. 83 VAND d3, d5, d5 84 SUBS r12, r12, #8 85 ; Load y[4...11] 86 VLD1.16 {d4, d5}, [r5]! 
87 VMLAL.S16 q0, d3, d6[0] 88 VEXT.16 d16, d3, d4, #1 89 VMLAL.S16 q0, d4, d7[0] 90 VEXT.16 d17, d4, d5, #1 91 VMLAL.S16 q0, d16, d6[1] 92 VEXT.16 d16, d3, d4, #2 93 VMLAL.S16 q0, d17, d7[1] 94 VEXT.16 d17, d4, d5, #2 95 VMLAL.S16 q0, d16, d6[2] 96 VEXT.16 d16, d3, d4, #3 97 VMLAL.S16 q0, d17, d7[2] 98 VEXT.16 d17, d4, d5, #3 99 VMLAL.S16 q0, d16, d6[3] 100 VMLAL.S16 q0, d17, d7[3] 101 BGT xcorr_kernel_neon_process8 102; Process 4 samples here if we have > 4 left (still reading one extra y value). 103xcorr_kernel_neon_process4 104 ADDS r12, r12, #4 105 BLE xcorr_kernel_neon_process2 106 ; Load x[0...3] 107 VLD1.16 d6, [r4]! 108 ; Use VAND since it's a data processing instruction again. 109 VAND d4, d5, d5 110 SUB r12, r12, #4 111 ; Load y[4...7] 112 VLD1.16 d5, [r5]! 113 VMLAL.S16 q0, d4, d6[0] 114 VEXT.16 d16, d4, d5, #1 115 VMLAL.S16 q0, d16, d6[1] 116 VEXT.16 d16, d4, d5, #2 117 VMLAL.S16 q0, d16, d6[2] 118 VEXT.16 d16, d4, d5, #3 119 VMLAL.S16 q0, d16, d6[3] 120; Process 2 samples here if we have > 2 left (still reading one extra y value). 121xcorr_kernel_neon_process2 122 ADDS r12, r12, #2 123 BLE xcorr_kernel_neon_process1 124 ; Load x[0...1] 125 VLD2.16 {d6[],d7[]}, [r4]! 126 ; Use VAND since it's a data processing instruction again. 127 VAND d4, d5, d5 128 SUB r12, r12, #2 129 ; Load y[4...5] 130 VLD1.32 {d5[]}, [r5]! 131 VMLAL.S16 q0, d4, d6 132 VEXT.16 d16, d4, d5, #1 133 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI 134 ; instead of VEXT, since it's a data-processing instruction. 135 VSRI.64 d5, d4, #32 136 VMLAL.S16 q0, d16, d7 137; Process 1 sample using the extra y value we loaded above. 138xcorr_kernel_neon_process1 139 ; Load next *x 140 VLD1.16 {d6[]}, [r4]! 141 ADDS r12, r12, #1 142 ; y[0...3] are left in d5 from prior iteration(s) (if any) 143 VMLAL.S16 q0, d5, d6 144 MOVLE pc, lr 145; Now process 1 last sample, not reading ahead. 146 ; Load last *y 147 VLD1.16 {d4[]}, [r5]! 
148 VSRI.64 d4, d5, #16 149 ; Load last *x 150 VLD1.16 {d6[]}, [r4]! 151 VMLAL.S16 q0, d4, d6 152 MOV pc, lr 153 ENDP 154 155; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, 156; opus_val32 *xcorr, int len, int max_pitch, int arch) 157celt_pitch_xcorr_neon PROC 158 ; input: 159 ; r0 = opus_val16 *_x 160 ; r1 = opus_val16 *_y 161 ; r2 = opus_val32 *xcorr 162 ; r3 = int len 163 ; output: 164 ; r0 = int maxcorr 165 ; internal usage: 166 ; r4 = opus_val16 *x (for xcorr_kernel_neon()) 167 ; r5 = opus_val16 *y (for xcorr_kernel_neon()) 168 ; r6 = int max_pitch 169 ; r12 = int j 170 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) 171 ; ignored: 172 ; int arch 173 STMFD sp!, {r4-r6, lr} 174 LDR r6, [sp, #16] 175 VMOV.S32 q15, #1 176 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done 177 SUBS r6, r6, #4 178 BLT celt_pitch_xcorr_neon_process4_done 179celt_pitch_xcorr_neon_process4 180 ; xcorr_kernel_neon parameters: 181 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} 182 MOV r4, r0 183 MOV r5, r1 184 VEOR q0, q0, q0 185 ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. 186 ; So we don't save/restore any other registers. 187 BL xcorr_kernel_neon_start 188 SUBS r6, r6, #4 189 VST1.32 {q0}, [r2]! 190 ; _y += 4 191 ADD r1, r1, #8 192 VMAX.S32 q15, q15, q0 193 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done 194 BGE celt_pitch_xcorr_neon_process4 195; We have less than 4 sums left to compute. 196celt_pitch_xcorr_neon_process4_done 197 ADDS r6, r6, #4 198 ; Reduce maxcorr to a single value 199 VMAX.S32 d30, d30, d31 200 VPMAX.S32 d30, d30, d30 201 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done 202 BLE celt_pitch_xcorr_neon_done 203; Now compute each remaining sum one at a time. 204celt_pitch_xcorr_neon_process_remaining 205 MOV r4, r0 206 MOV r5, r1 207 VMOV.I32 q0, #0 208 SUBS r12, r3, #8 209 BLT celt_pitch_xcorr_neon_process_remaining4 210; Sum terms 8 at a time. 
211celt_pitch_xcorr_neon_process_remaining_loop8 212 ; Load x[0...7] 213 VLD1.16 {q1}, [r4]! 214 ; Load y[0...7] 215 VLD1.16 {q2}, [r5]! 216 SUBS r12, r12, #8 217 VMLAL.S16 q0, d4, d2 218 VMLAL.S16 q0, d5, d3 219 BGE celt_pitch_xcorr_neon_process_remaining_loop8 220; Sum terms 4 at a time. 221celt_pitch_xcorr_neon_process_remaining4 222 ADDS r12, r12, #4 223 BLT celt_pitch_xcorr_neon_process_remaining4_done 224 ; Load x[0...3] 225 VLD1.16 {d2}, [r4]! 226 ; Load y[0...3] 227 VLD1.16 {d3}, [r5]! 228 SUB r12, r12, #4 229 VMLAL.S16 q0, d3, d2 230celt_pitch_xcorr_neon_process_remaining4_done 231 ; Reduce the sum to a single value. 232 VADD.S32 d0, d0, d1 233 VPADDL.S32 d0, d0 234 ADDS r12, r12, #4 235 BLE celt_pitch_xcorr_neon_process_remaining_loop_done 236; Sum terms 1 at a time. 237celt_pitch_xcorr_neon_process_remaining_loop1 238 VLD1.16 {d2[]}, [r4]! 239 VLD1.16 {d3[]}, [r5]! 240 SUBS r12, r12, #1 241 VMLAL.S16 q0, d2, d3 242 BGT celt_pitch_xcorr_neon_process_remaining_loop1 243celt_pitch_xcorr_neon_process_remaining_loop_done 244 VST1.32 {d0[0]}, [r2]! 245 VMAX.S32 d30, d30, d0 246 SUBS r6, r6, #1 247 ; _y++ 248 ADD r1, r1, #2 249 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining 250 BGT celt_pitch_xcorr_neon_process_remaining 251celt_pitch_xcorr_neon_done 252 VMOV.32 r0, d30[0] 253 LDMFD sp!, {r4-r6, pc} 254 ENDP 255 256ENDIF 257 258IF OPUS_ARM_MAY_HAVE_EDSP 259 260; This will get used on ARMv7 devices without NEON, so it has been optimized 261; to take advantage of dual-issuing where possible. 
262xcorr_kernel_edsp PROC 263xcorr_kernel_edsp_start 264 ; input: 265 ; r3 = int len 266 ; r4 = opus_val16 *_x (must be 32-bit aligned) 267 ; r5 = opus_val16 *_y (must be 32-bit aligned) 268 ; r6...r9 = opus_val32 sum[4] 269 ; output: 270 ; r6...r9 = opus_val32 sum[4] 271 ; preserved: r0-r5 272 ; internal usage 273 ; r2 = int j 274 ; r12,r14 = opus_val16 x[4] 275 ; r10,r11 = opus_val16 y[4] 276 STMFD sp!, {r2,r4,r5,lr} 277 LDR r10, [r5], #4 ; Load y[0...1] 278 SUBS r2, r3, #4 ; j = len-4 279 LDR r11, [r5], #4 ; Load y[2...3] 280 BLE xcorr_kernel_edsp_process4_done 281 LDR r12, [r4], #4 ; Load x[0...1] 282 ; Stall 283xcorr_kernel_edsp_process4 284 ; The multiplies must issue from pipeline 0, and can't dual-issue with each 285 ; other. Every other instruction here dual-issues with a multiply, and is 286 ; thus "free". There should be no stalls in the body of the loop. 287 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) 288 LDR r14, [r4], #4 ; Load x[2...3] 289 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) 290 SUBS r2, r2, #4 ; j-=4 291 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) 292 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) 293 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) 294 LDR r10, [r5], #4 ; Load y[4...5] 295 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) 296 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) 297 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) 298 LDRGT r12, [r4], #4 ; Load x[0...1] 299 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) 300 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) 301 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) 302 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) 303 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) 304 LDR r11, [r5], #4 ; Load y[6...7] 305 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) 306 SMLATT r8, r14, r10, r8 ; sum[2] = 
MAC16_16(sum[2],x_3,y_5) 307 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) 308 BGT xcorr_kernel_edsp_process4 309xcorr_kernel_edsp_process4_done 310 ADDS r2, r2, #4 311 BLE xcorr_kernel_edsp_done 312 LDRH r12, [r4], #2 ; r12 = *x++ 313 SUBS r2, r2, #1 ; j-- 314 ; Stall 315 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) 316 LDRHGT r14, [r4], #2 ; r14 = *x++ 317 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) 318 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) 319 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) 320 BLE xcorr_kernel_edsp_done 321 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) 322 SUBS r2, r2, #1 ; j-- 323 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) 324 LDRH r10, [r5], #2 ; r10 = y_4 = *y++ 325 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) 326 LDRHGT r12, [r4], #2 ; r12 = *x++ 327 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) 328 BLE xcorr_kernel_edsp_done 329 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) 330 CMP r2, #1 ; j-- 331 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) 332 LDRH r2, [r5], #2 ; r2 = y_5 = *y++ 333 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) 334 LDRHGT r14, [r4] ; r14 = *x 335 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) 336 BLE xcorr_kernel_edsp_done 337 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) 338 LDRH r11, [r5] ; r11 = y_6 = *y 339 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) 340 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) 341 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) 342xcorr_kernel_edsp_done 343 LDMFD sp!, {r2,r4,r5,pc} 344 ENDP 345 346celt_pitch_xcorr_edsp PROC 347 ; input: 348 ; r0 = opus_val16 *_x (must be 32-bit aligned) 349 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) 350 ; r2 = opus_val32 *xcorr 351 ; r3 = int len 352 ; output: 353 ; r0 = maxcorr 354 ; internal usage 355 ; r4 = 
opus_val16 *x 356 ; r5 = opus_val16 *y 357 ; r6 = opus_val32 sum0 358 ; r7 = opus_val32 sum1 359 ; r8 = opus_val32 sum2 360 ; r9 = opus_val32 sum3 361 ; r1 = int max_pitch 362 ; r12 = int j 363 ; ignored: 364 ; int arch 365 STMFD sp!, {r4-r11, lr} 366 MOV r5, r1 367 LDR r1, [sp, #36] 368 MOV r4, r0 369 TST r5, #3 370 ; maxcorr = 1 371 MOV r0, #1 372 BEQ celt_pitch_xcorr_edsp_process1u_done 373; Compute one sum at the start to make y 32-bit aligned. 374 SUBS r12, r3, #4 375 ; r14 = sum = 0 376 MOV r14, #0 377 LDRH r8, [r5], #2 378 BLE celt_pitch_xcorr_edsp_process1u_loop4_done 379 LDR r6, [r4], #4 380 MOV r8, r8, LSL #16 381celt_pitch_xcorr_edsp_process1u_loop4 382 LDR r9, [r5], #4 383 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 384 LDR r7, [r4], #4 385 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) 386 LDR r8, [r5], #4 387 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) 388 SUBS r12, r12, #4 ; j-=4 389 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) 390 LDRGT r6, [r4], #4 391 BGT celt_pitch_xcorr_edsp_process1u_loop4 392 MOV r8, r8, LSR #16 393celt_pitch_xcorr_edsp_process1u_loop4_done 394 ADDS r12, r12, #4 395celt_pitch_xcorr_edsp_process1u_loop1 396 LDRHGE r6, [r4], #2 397 ; Stall 398 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) 399 SUBSGE r12, r12, #1 400 LDRHGT r8, [r5], #2 401 BGT celt_pitch_xcorr_edsp_process1u_loop1 402 ; Restore _x 403 SUB r4, r4, r3, LSL #1 404 ; Restore and advance _y 405 SUB r5, r5, r3, LSL #1 406 ; maxcorr = max(maxcorr, sum) 407 CMP r0, r14 408 ADD r5, r5, #2 409 MOVLT r0, r14 410 SUBS r1, r1, #1 411 ; xcorr[i] = sum 412 STR r14, [r2], #4 413 BLE celt_pitch_xcorr_edsp_done 414celt_pitch_xcorr_edsp_process1u_done 415 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 416 SUBS r1, r1, #4 417 BLT celt_pitch_xcorr_edsp_process2 418celt_pitch_xcorr_edsp_process4 419 ; xcorr_kernel_edsp parameters: 420 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} 421 MOV r6, #0 422 MOV 
r7, #0 423 MOV r8, #0 424 MOV r9, #0 425 BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) 426 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) 427 CMP r0, r6 428 ; _y+=4 429 ADD r5, r5, #8 430 MOVLT r0, r6 431 CMP r0, r7 432 MOVLT r0, r7 433 CMP r0, r8 434 MOVLT r0, r8 435 CMP r0, r9 436 MOVLT r0, r9 437 STMIA r2!, {r6-r9} 438 SUBS r1, r1, #4 439 BGE celt_pitch_xcorr_edsp_process4 440celt_pitch_xcorr_edsp_process2 441 ADDS r1, r1, #2 442 BLT celt_pitch_xcorr_edsp_process1a 443 SUBS r12, r3, #4 444 ; {r10, r11} = {sum0, sum1} = {0, 0} 445 MOV r10, #0 446 MOV r11, #0 447 LDR r8, [r5], #4 448 BLE celt_pitch_xcorr_edsp_process2_loop_done 449 LDR r6, [r4], #4 450 LDR r9, [r5], #4 451celt_pitch_xcorr_edsp_process2_loop4 452 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) 453 LDR r7, [r4], #4 454 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 455 SUBS r12, r12, #4 ; j-=4 456 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) 457 LDR r8, [r5], #4 458 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) 459 LDRGT r6, [r4], #4 460 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) 461 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) 462 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) 463 LDRGT r9, [r5], #4 464 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) 465 BGT celt_pitch_xcorr_edsp_process2_loop4 466celt_pitch_xcorr_edsp_process2_loop_done 467 ADDS r12, r12, #2 468 BLE celt_pitch_xcorr_edsp_process2_1 469 LDR r6, [r4], #4 470 ; Stall 471 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) 472 LDR r9, [r5], #4 473 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 474 SUB r12, r12, #2 475 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) 476 MOV r8, r9 477 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) 478celt_pitch_xcorr_edsp_process2_1 479 LDRH r6, [r4], #2 480 ADDS r12, r12, #1 481 ; Stall 482 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, 
y_0) 483 LDRHGT r7, [r4], #2 484 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) 485 BLE celt_pitch_xcorr_edsp_process2_done 486 LDRH r9, [r5], #2 487 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) 488 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) 489celt_pitch_xcorr_edsp_process2_done 490 ; Restore _x 491 SUB r4, r4, r3, LSL #1 492 ; Restore and advance _y 493 SUB r5, r5, r3, LSL #1 494 ; maxcorr = max(maxcorr, sum0) 495 CMP r0, r10 496 ADD r5, r5, #2 497 MOVLT r0, r10 498 SUB r1, r1, #2 499 ; maxcorr = max(maxcorr, sum1) 500 CMP r0, r11 501 ; xcorr[i] = sum 502 STR r10, [r2], #4 503 MOVLT r0, r11 504 STR r11, [r2], #4 505celt_pitch_xcorr_edsp_process1a 506 ADDS r1, r1, #1 507 BLT celt_pitch_xcorr_edsp_done 508 SUBS r12, r3, #4 509 ; r14 = sum = 0 510 MOV r14, #0 511 BLT celt_pitch_xcorr_edsp_process1a_loop_done 512 LDR r6, [r4], #4 513 LDR r8, [r5], #4 514 LDR r7, [r4], #4 515 LDR r9, [r5], #4 516celt_pitch_xcorr_edsp_process1a_loop4 517 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 518 SUBS r12, r12, #4 ; j-=4 519 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) 520 LDRGE r6, [r4], #4 521 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) 522 LDRGE r8, [r5], #4 523 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) 524 LDRGE r7, [r4], #4 525 LDRGE r9, [r5], #4 526 BGE celt_pitch_xcorr_edsp_process1a_loop4 527celt_pitch_xcorr_edsp_process1a_loop_done 528 ADDS r12, r12, #2 529 LDRGE r6, [r4], #4 530 LDRGE r8, [r5], #4 531 ; Stall 532 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) 533 SUBGE r12, r12, #2 534 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) 535 ADDS r12, r12, #1 536 LDRHGE r6, [r4], #2 537 LDRHGE r8, [r5], #2 538 ; Stall 539 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) 540 ; maxcorr = max(maxcorr, sum) 541 CMP r0, r14 542 ; xcorr[i] = sum 543 STR r14, [r2], #4 544 MOVLT r0, r14 545celt_pitch_xcorr_edsp_done 546 LDMFD sp!, {r4-r11, pc} 547 ENDP 548 549ENDIF 550 
551END 552