/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/**
 * ARM VFP implementation of 'vector_fmul_c': dst[i] = src0[i] * src1[i].
 * Assume that len is a positive number and is multiple of 8
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        @ Enable VFP short-vector mode by setting the FPSCR LEN field to 3:
        @ every vmul below then operates on 4 consecutive s-registers at once.
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12

        @ Software pipeline prologue: load the first 8 floats from each input
        @ (src0 in r1, src1 in r2) and start the first 4-wide multiply.
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8, s0, s8              @ 4-wide: s8-s11 = s0-s3 * s8-s11
1:
        @ Main loop processes 16 floats per iteration; loads for the next
        @ iteration (ge/gt predicated) are interleaved with this iteration's
        @ multiplies and stores to hide memory and FPU latency.
        subs            r3, r3, #16
        vmul.f32        s12, s4, s12
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28
        itttt           gt
        vldmiagt        r1!, {s0-s3}
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8, s0, s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        @ Restore scalar mode (clear the 3-bit LEN field) before returning.
        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assume that len is a positive non-zero number
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
function ff_vector_fmul_window_vfp, export=1
DST0    .req    a1
SRC0    .req    a2
SRC1    .req    a3
WIN0    .req    a4
LEN     .req    v1
DST1    .req    v2
WIN1    .req    v3
OLDFPSCR .req   ip

        push            {v1-v3,lr}
        ldr             LEN, [sp, #4*4+0]       @ 5th arg: on stack, above the 4 pushed regs
        vpush           {s16-s31}
        fmrx            OLDFPSCR, FPSCR         @ save caller's FPSCR, restored at 7:
        @ Set up the backward-walking pointers: DST1/WIN1 start at base + LEN*8
        @ and SRC1 at base + LEN*4; they are accessed with vldmdb/vstmdb
        @ (pre-decrement), i.e. traversed from the end towards the start.
        add             DST1, DST0, LEN, lsl #3
        add             SRC1, SRC1, LEN, lsl #2
        add             WIN1, WIN0, LEN, lsl #3

        tst             LEN, #7
        beq             4f                      @ common case: len is a multiple of 8

        ldr             lr, =0x03000000         @ RunFast mode, scalar mode
        fmxr            FPSCR, lr

        @ Scalar-mode head: peel off 1, 2 and 4 leftover elements so the
        @ remaining count is a multiple of 8 for the vector-mode main loop.
        @ Per element: forward output  = win_bwd*src0 - win_fwd*src1_bwd,
        @              backward output = win_fwd*src0 + win_bwd*src1_bwd.
        tst             LEN, #1
        beq             1f
        vldmdb          WIN1!, {s0}             @ window, read backwards
        vldmia          SRC0!, {s8}             @ src0, read forwards
        vldmia          WIN0!, {s16}            @ window, read forwards
        vmul.f          s24, s0, s8
        vldmdb          SRC1!, {s20}            @ src1, read backwards
        vmul.f          s8, s16, s8
        vmls.f          s24, s16, s20
        vmla.f          s8, s0, s20
        vstmia          DST0!, {s24}            @ forward half of dst
        vstmdb          DST1!, {s8}             @ backward half of dst
1:
        tst             LEN, #2
        beq             2f
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmia          SRC0!, {s8-s9}
        vldmia          WIN0!, {s16-s17}
        vmul.f          s24, s0, s8
        vmul.f          s25, s1, s9
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vmul.f          s8, s16, s8
        vmul.f          s9, s17, s9
        vmls.f          s24, s16, s20
        vmls.f          s25, s17, s21
        vmla.f          s8, s0, s20
        vmla.f          s9, s1, s21
        vstmia          DST0!, {s24-s25}
        vstmdb          DST1!, {s8}             @ stored one at a time: DST1 walks
        vstmdb          DST1!, {s9}             @ down, so order is reversed
2:
        tst             LEN, #4
        beq             3f
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8
        vmul.f          s25, s1, s9
        vmul.f          s26, s2, s10
        vmul.f          s27, s3, s11
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8
        vmul.f          s9, s17, s9
        vmul.f          s10, s18, s10
        vmul.f          s11, s19, s11
        vmls.f          s24, s16, s20
        vmls.f          s25, s17, s21
        vmls.f          s26, s18, s22
        vmls.f          s27, s19, s23
        vmla.f          s8, s0, s20
        vmla.f          s9, s1, s21
        vmla.f          s10, s2, s22
        vmla.f          s11, s3, s23
        vstmia          DST0!, {s24-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
3:
        bics            LEN, LEN, #7            @ drop the bits handled above
        beq             7f                      @ nothing left for the vector loop
4:
        @ Vector-mode main path: each vmul/vmls/vmla below operates on 4
        @ consecutive s-registers at once (FPSCR LEN=3, stride 1).
        ldr             lr, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr            FPSCR, lr

        @ Pipeline prologue: start the first 8 elements (two 4-wide groups).
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8             @ vector * vector
        vmls.f          s24, s16, s20           @ vector * vector
        vldmdb          WIN1!, {s4}
        vldmdb          WIN1!, {s5}
        vldmdb          WIN1!, {s6}
        vldmdb          WIN1!, {s7}
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
        subs            LEN, LEN, #8
        beq             6f                      @ exactly 8 left: go to epilogue
        @ Steady state: 8 elements per iteration, with the second group's
        @ arithmetic interleaved with the first group's stores and the next
        @ iteration's loads to keep the FPU and load/store unit both busy.
5:      vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vldmdb          SRC1!, {s16}
        vldmdb          SRC1!, {s17}
        vldmdb          SRC1!, {s18}
        vldmdb          SRC1!, {s19}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
        vmls.f          s28, s20, s16           @ vector * vector
        vldmdb          WIN1!, {s0}
        vldmdb          WIN1!, {s1}
        vldmdb          WIN1!, {s2}
        vldmdb          WIN1!, {s3}
        vldmia          SRC0!, {s8-s9}
        vmla.f          s12, s4, s16            @ vector * vector
        vldmia          SRC0!, {s10-s11}
        subs            LEN, LEN, #8
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vstmia          DST0!, {s28-s29}
        vldmdb          SRC1!, {s20}
        vldmdb          SRC1!, {s21}
        vldmdb          SRC1!, {s22}
        vldmdb          SRC1!, {s23}
        vmul.f          s8, s16, s8             @ vector * vector
        vstmia          DST0!, {s30-s31}
        vstmdb          DST1!, {s12}
        vstmdb          DST1!, {s13}
        vstmdb          DST1!, {s14}
        vstmdb          DST1!, {s15}
        vmls.f          s24, s16, s20           @ vector * vector
        vldmdb          WIN1!, {s4}
        vldmdb          WIN1!, {s5}
        vldmdb          WIN1!, {s6}
        vldmdb          WIN1!, {s7}
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
        bne             5b
        @ Epilogue: finish the last in-flight group and drain the pipeline.
6:      vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vldmdb          SRC1!, {s16}
        vldmdb          SRC1!, {s17}
        vldmdb          SRC1!, {s18}
        vldmdb          SRC1!, {s19}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vstmdb          DST1!, {s8}
        vstmdb          DST1!, {s9}
        vstmdb          DST1!, {s10}
        vstmdb          DST1!, {s11}
        vmls.f          s28, s20, s16           @ vector * vector
        vmla.f          s12, s4, s16            @ vector * vector
        vstmia          DST0!, {s28-s31}
        vstmdb          DST1!, {s12}
        vstmdb          DST1!, {s13}
        vstmdb          DST1!, {s14}
        vstmdb          DST1!, {s15}
7:
        fmxr            FPSCR, OLDFPSCR         @ restore caller's FP mode
        vpop            {s16-s31}
        pop             {v1-v3,pc}

        .unreq  DST0
        .unreq  SRC0
        .unreq  SRC1
        .unreq  WIN0
        .unreq  LEN
        .unreq  OLDFPSCR
        .unreq  DST1
        .unreq  WIN1
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is multiple of 8
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2, r2, r3, lsl #2      @ r2 = one past the end of src1
        @ src1 (r2) is walked backwards with vldmdb while src0 (r1) is walked
        @ forwards; each loaded group of 4 from src1 is therefore paired with
        @ src0 in reversed register order (s3 with s8, s2 with s9, ...).
        @ Note: no FPSCR vector mode here — all multiplies are scalar,
        @ scheduled by hand to overlap with the predicated loads/stores.
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s3, s8
        vmul.f32        s9, s2, s9
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
1:
        @ 16 floats per iteration; ge-predicated ops belong to the next
        @ iteration's computation, gt-predicated loads to the one after that.
        subs            r3, r3, #16
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8, s3, s8
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP implementation of 'butterflies_float_c' function:
 * in-place butterfly, t = v1[i]; v1[i] = t + v2[i]; v2[i] = t - v2[i].
 * Assume that len is a positive non-zero number
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
function ff_butterflies_float_vfp, export=1
BASE1   .req    a1
BASE2   .req    a2
LEN     .req    a3
OLDFPSCR .req   a4

        vpush           {s16-s31}
        fmrx            OLDFPSCR, FPSCR         @ save caller's FPSCR, restored at 7:

        tst             LEN, #7
        beq             4f                      @ common case: len is a multiple of 8

        ldr             ip, =0x03000000         @ RunFast mode, scalar mode
        fmxr            FPSCR, ip

        @ Scalar head: peel off 1, 2 and 4 leftover elements so the count
        @ left for the vector-mode loop is a multiple of 8. Results are
        @ stored via negative offsets from the already-advanced pointers.
        tst             LEN, #1
        beq             1f
        vldmia          BASE1!, {s0}
        vldmia          BASE2!, {s8}
        vadd.f          s16, s0, s8
        vsub.f          s24, s0, s8
        vstr            s16, [BASE1, #0-4*1]
        vstr            s24, [BASE2, #0-4*1]
1:
        tst             LEN, #2
        beq             2f
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vadd.f          s16, s0, s8
        vadd.f          s17, s1, s9
        vsub.f          s24, s0, s8
        vsub.f          s25, s1, s9
        vstr            d8, [BASE1, #0-8*1]     @ s16,s17
        vstr            d12, [BASE2, #0-8*1]    @ s24,s25
2:
        tst             LEN, #4
        beq             3f
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vadd.f          s16, s0, s8
        vadd.f          s17, s1, s9
        vsub.f          s24, s0, s8
        vsub.f          s25, s1, s9
        vadd.f          s18, s2, s10
        vadd.f          s19, s3, s11
        vsub.f          s26, s2, s10
        vsub.f          s27, s3, s11
        vstr            d8, [BASE1, #0-16*1]    @ s16,s17
        vstr            d12, [BASE2, #0-16*1]   @ s24,s25
        vstr            d9, [BASE1, #8-16*1]    @ s18,s19
        vstr            d13, [BASE2, #8-16*1]   @ s26,s27
3:
        bics            LEN, LEN, #7            @ drop the bits handled above
        beq             7f
4:
        @ Vector-mode main path: each vadd/vsub below operates on 4
        @ consecutive s-registers at once (FPSCR LEN=3, stride 1).
        ldr             ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr            FPSCR, ip

        @ Pipeline prologue: load the first 8 element pairs and start the
        @ first adds/subs.
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vadd.f          s16, s0, s8
        vldmia          BASE1!, {s4-s5}
        vldmia          BASE2!, {s12-s13}
        vldmia          BASE1!, {s6-s7}
        vldmia          BASE2!, {s14-s15}
        vsub.f          s24, s0, s8
        vadd.f          s20, s4, s12
        subs            LEN, LEN, #8
        beq             6f                      @ exactly 8 left: go to epilogue
        @ Steady state: 8 pairs per iteration; results of the previous
        @ iteration are stored behind the advanced pointers (offset -16*3)
        @ while this iteration's inputs are loaded and summed.
5:      vldmia          BASE1!, {s0-s3}
        vldmia          BASE2!, {s8-s11}
        vsub.f          s28, s4, s12
        vstr            d8, [BASE1, #0-16*3]    @ s16,s17
        vstr            d9, [BASE1, #8-16*3]    @ s18,s19
        vstr            d12, [BASE2, #0-16*3]   @ s24,s25
        vstr            d13, [BASE2, #8-16*3]   @ s26,s27
        vadd.f          s16, s0, s8
        vldmia          BASE1!, {s4-s7}
        vldmia          BASE2!, {s12-s15}
        vsub.f          s24, s0, s8
        vstr            d10, [BASE1, #0-16*3]   @ s20,s21
        vstr            d11, [BASE1, #8-16*3]   @ s22,s23
        vstr            d14, [BASE2, #0-16*3]   @ s28,s29
        vstr            d15, [BASE2, #8-16*3]   @ s30,s31
        vadd.f          s20, s4, s12
        subs            LEN, LEN, #8
        bne             5b
        @ Epilogue: compute the last subtraction and store the final 8 pairs.
6:      vsub.f          s28, s4, s12
        vstr            d8, [BASE1, #0-16*2]    @ s16,s17
        vstr            d9, [BASE1, #8-16*2]    @ s18,s19
        vstr            d12, [BASE2, #0-16*2]   @ s24,s25
        vstr            d13, [BASE2, #8-16*2]   @ s26,s27
        vstr            d10, [BASE1, #0-16*1]   @ s20,s21
        vstr            d11, [BASE1, #8-16*1]   @ s22,s23
        vstr            d14, [BASE2, #0-16*1]   @ s28,s29
        vstr            d15, [BASE2, #8-16*1]   @ s30,s31
7:
        fmxr            FPSCR, OLDFPSCR         @ restore caller's FP mode
        vpop            {s16-s31}
        bx              lr

        .unreq  BASE1
        .unreq  BASE2
        .unreq  LEN
        .unreq  OLDFPSCR
endfunc