/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Register roles.
 *
 * Several aliases deliberately share one physical register because their
 * live ranges do not overlap:
 *   - J0 shares a2 with ORIGOUT (ORIGOUT is copied to OUT before J0 is used)
 *   - REVTAB_HI shares v5 with OLDFPSCR (the generic path spills OLDFPSCR to
 *     the stack around the pre-rotation loop)
 *   - IN_HI and OUT_HI share v6 (pre-rotation vs. post-rotation)
 */
CONTEXT         .req    a1      @ first argument (FFTContext *s)
ORIGOUT         .req    a2      @ second argument (output)
IN              .req    a3      @ third argument (input)
OUT             .req    v1      @ working copy of the output pointer
REVTAB          .req    v2
TCOS            .req    v3
TSIN            .req    v4
OLDFPSCR        .req    v5      @ FPSCR value on entry, restored before calls and return
J0              .req    a2      @ J0-J3: output addresses derived from REVTAB entries
J1              .req    a4
J2              .req    ip
J3              .req    lr
REVTAB_HI       .req    v5
IN_HI           .req    v6
OUT_HI          .req    v6
TCOS_HI         .req    sl
TSIN_HI         .req    fp

/*
 * Instructions marked "vector operation" rely on the FPSCR short-vector
 * setting installed by ff_imdct_half_vfp (length 4, stride 1): each one acts
 * on four consecutive single-precision registers starting at the named ones.
 * The instruction order inside the macros is hand-scheduled; do not reorder.
 */

/*
 * One fully unrolled pre-rotation step (fixed-size path).
 *
 * Uses the assembly-time symbols k and n4, which must be .set by the caller,
 * and advances k by 2 on every expansion.
 *
 * Computes, as 4-wide vectors:
 *   s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *   s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 * where s16-s19 come from TCOS and s20-s23 from TSIN, and stores each
 * (s8+i, s12+i) pair as two adjacent words at OUT + 8 * J<i>.
 *
 * Clobbers: J0-J3 (a2, a4, ip, lr), s0-s15, d8-d11 (s16-s23).
 */
.macro  prerotation_innerloop
 .set trig_lo, k
 .set trig_hi, n4 - k - 2
 .set in_lo, trig_lo * 2
 .set in_hi, trig_hi * 2
        vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
        vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
        vldr    s0, [IN, #in_hi*4 + 12]
        vldr    s1, [IN, #in_hi*4 + 4]
        vldr    s2, [IN, #in_lo*4 + 12]
        vldr    s3, [IN, #in_lo*4 + 4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
        vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
        vldr    s4, [IN, #in_lo*4]
        vldr    s5, [IN, #in_lo*4 + 8]
        vldr    s6, [IN, #in_hi*4]
        vldr    s7, [IN, #in_hi*4 + 8]
        ldr     J0, [REVTAB, #trig_lo*2]        @ one word = two halfword entries
        vmul.f  s12, s0, s20                    @ vector operation
        ldr     J2, [REVTAB, #trig_hi*2]        @ one word = two halfword entries
        mov     J1, J0, lsr #16                 @ upper halfword
        and     J0, J0, #255                    @ halfword value will be < n4
        vmls.f  s8, s4, s20                     @ vector operation
        mov     J3, J2, lsr #16                 @ upper halfword
        and     J2, J2, #255                    @ halfword value will be < n4
        add     J0, OUT, J0, lsl #3             @ index -> address, 8 bytes per entry
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
 .set k, k + 2
.endm

/*
 * One pre-rotation step for the generic (looped) path.
 *
 * Same arithmetic as prerotation_innerloop, but addressed through moving
 * pointers instead of assembly-time offsets:
 *   - IN, TCOS, TSIN and REVTAB advance upwards
 *     (IN by 16 bytes, TCOS/TSIN by 8 bytes, REVTAB by 4 bytes)
 *   - IN_HI, TCOS_HI, TSIN_HI and REVTAB_HI move downwards by the same amounts
 *
 * Clobbers: J0-J3 (a2, a4, ip, lr), s0-s15, s16-s23.
 */
.macro  prerotation_innerloop_rolled
        vldmia  TCOS!, {s16,s17}
        vldmdb  TCOS_HI!, {s18,s19}
        vldr    s0, [IN_HI, #-4]
        vldr    s1, [IN_HI, #-12]
        vldr    s2, [IN, #12]
        vldr    s3, [IN, #4]
        vmul.f  s8, s0, s16                     @ vector operation
        vldmia  TSIN!, {s20,s21}
        vldmdb  TSIN_HI!, {s22,s23}
        vldr    s4, [IN]
        vldr    s5, [IN, #8]
        vldr    s6, [IN_HI, #-16]
        vldr    s7, [IN_HI, #-8]
        vmul.f  s12, s0, s20                    @ vector operation
        add     IN, IN, #16
        sub     IN_HI, IN_HI, #16
        ldrh    J0, [REVTAB], #2
        ldrh    J1, [REVTAB], #2
        vmls.f  s8, s4, s20                     @ vector operation
        ldrh    J3, [REVTAB_HI, #-2]!           @ pre-decrement, so J3 is read before J2
        ldrh    J2, [REVTAB_HI, #-2]!
        add     J0, OUT, J0, lsl #3             @ index -> address, 8 bytes per entry
        vmla.f  s12, s4, s16                    @ vector operation
        add     J1, OUT, J1, lsl #3
        add     J2, OUT, J2, lsl #3
        add     J3, OUT, J3, lsl #3
        vstr    s8, [J0]
        vstr    s9, [J1]
        vstr    s10, [J2]
        vstr    s11, [J3]
        vstr    s12, [J0, #4]
        vstr    s13, [J1, #4]
        vstr    s14, [J2, #4]
        vstr    s15, [J3, #4]
.endm

/*
 * One software-pipelined post-rotation step (fixed-size path).
 *
 * Arguments (each is either empty or any non-empty token):
 *   tail - when non-empty, finish the step started by the previous expansion:
 *          apply the TCOS terms (vmls/vmla) and store s8-s15 back to OUT
 *   head - when non-empty, start the next step: load TSIN, TCOS and eight
 *          values from OUT, issue the two TSIN multiplies, and advance k by 2
 *
 * Uses the assembly-time symbols k and n8, which must be .set by the caller.
 * The tail offsets are computed from k - 2, i.e. the k of the previous head.
 *
 * The TCOS registers alternate between d10/d11 and d12/d13 according to
 * bit 1 of k, so that a tail still sees the TCOS values loaded by the
 * preceding head while the current head loads new ones.
 *
 * Clobbers: s0-s15, d8-d9 (s16-s19), and d10-d11 or d12-d13.
 */
.macro  postrotation_innerloop tail, head
 .set trig_lo_head, n8 - k - 2
 .set trig_hi_head, n8 + k
 .set out_lo_head, trig_lo_head * 2
 .set out_hi_head, trig_hi_head * 2
 .set trig_lo_tail, n8 - (k - 2) - 2
 .set trig_hi_tail, n8 + (k - 2)
 .set out_lo_tail, trig_lo_tail * 2
 .set out_hi_tail, trig_hi_tail * 2
 .if (k & 2) == 0
  TCOS_D0_HEAD .req d10 @ s20,s21
  TCOS_D1_HEAD .req d11 @ s22,s23
  TCOS_S0_TAIL .req s24
 .else
  TCOS_D0_HEAD .req d12 @ s24,s25
  TCOS_D1_HEAD .req d13 @ s26,s27
  TCOS_S0_TAIL .req s20
 .endif
 .ifnc "\tail",""
        vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
 .endif
 .ifnc "\head",""
        vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
        vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
        vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
 .endif
 .ifnc "\head",""
        vldr    s0, [OUT, #out_lo_head*4]
        vldr    s1, [OUT, #out_lo_head*4 + 8]
        vldr    s2, [OUT, #out_hi_head*4]
        vldr    s3, [OUT, #out_hi_head*4 + 8]
        vldr    s4, [OUT, #out_lo_head*4 + 4]
        vldr    s5, [OUT, #out_lo_head*4 + 12]
        vldr    s6, [OUT, #out_hi_head*4 + 4]
        vldr    s7, [OUT, #out_hi_head*4 + 12]
 .endif
 .ifnc "\tail",""
        vstr    s8, [OUT, #out_lo_tail*4]
        vstr    s9, [OUT, #out_lo_tail*4 + 8]
        vstr    s10, [OUT, #out_hi_tail*4]
        vstr    s11, [OUT, #out_hi_tail*4 + 8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        vstr    s12, [OUT, #out_hi_tail*4 + 12]
        vstr    s13, [OUT, #out_hi_tail*4 + 4]
        vstr    s14, [OUT, #out_lo_tail*4 + 12]
        vstr    s15, [OUT, #out_lo_tail*4 + 4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 .endif
 .unreq TCOS_D0_HEAD
 .unreq TCOS_D1_HEAD
 .unreq TCOS_S0_TAIL
 .ifnc "\head",""
  .set k, k + 2
 .endif
.endm

/*
 * One software-pipelined post-rotation step for the generic (looped) path.
 *
 * Arguments:
 *   tail, head        - as for postrotation_innerloop (empty = omit that half)
 *   tcos_s0_head .. tcos_s3_head
 *                     - registers the head loads TCOS values into
 *                       (s0/s1 from TCOS ascending, s2/s3 from TCOS_HI descending)
 *   tcos_s0_tail      - first register of the TCOS vector used by the tail
 *   out_offset_head   - byte offset applied to OUT (added) and OUT_HI
 *                       (subtracted) for the head loads
 *   out_offset_tail   - the same for the tail stores
 *
 * TSIN, TCOS ascend and TSIN_HI, TCOS_HI descend by 8 bytes per head.
 * OUT and OUT_HI are not modified here; the caller steps them.
 *
 * Clobbers: s0-s15, s16-s19, and the registers named in the tcos arguments.
 */
.macro  postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
 .ifnc "\tail",""
        vmls.f  s8, s0, \tcos_s0_tail       @ vector operation
 .endif
 .ifnc "\head",""
        vldmia  TSIN!, {s16,s17}
        vldmdb  TSIN_HI!, {s18,s19}
        vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
 .endif
 .ifnc "\tail",""
        vmla.f  s12, s4, \tcos_s0_tail      @ vector operation
 .endif
 .ifnc "\head",""
        vldr    s0, [OUT, #+\out_offset_head+0]
        vldr    s1, [OUT, #+\out_offset_head+8]
        vldr    s2, [OUT_HI, #-\out_offset_head-16]
        vldr    s3, [OUT_HI, #-\out_offset_head-8]
        vldr    s4, [OUT, #+\out_offset_head+4]
        vldr    s5, [OUT, #+\out_offset_head+12]
        vldr    s6, [OUT_HI, #-\out_offset_head-12]
        vldr    s7, [OUT_HI, #-\out_offset_head-4]
 .endif
 .ifnc "\tail",""
        vstr    s8, [OUT, #+\out_offset_tail+0]
        vstr    s9, [OUT, #+\out_offset_tail+8]
        vstr    s10, [OUT_HI, #-\out_offset_tail-16]
        vstr    s11, [OUT_HI, #-\out_offset_tail-8]
 .endif
 .ifnc "\head",""
        vmul.f  s8, s4, s16                 @ vector operation
 .endif
 .ifnc "\tail",""
        vstr    s12, [OUT_HI, #-\out_offset_tail-4]
        vstr    s13, [OUT_HI, #-\out_offset_tail-12]
        vstr    s14, [OUT, #+\out_offset_tail+12]
        vstr    s15, [OUT, #+\out_offset_tail+4]
 .endif
 .ifnc "\head",""
        vmul.f  s12, s0, s16                @ vector operation
        vldmdb  TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
 .endif
.endm


/* void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * In:   a1 = s, a2 = output, a3 = input
 * Out:  nothing in registers; results are written through the output pointer
 *
 * Structure: pre-rotation of the input into the output buffer (scattered via
 * REVTAB), an in-place FFT on the output buffer, then post-rotation in place.
 *
 *   - mdct_bits == 6: fully unrolled pre/post-rotation and a direct call to
 *     ff_fft16_vfp.
 *   - any other size (label 10): looped pre/post-rotation and an indirect
 *     call through the context (s->fft_calc).
 *
 * FPSCR is switched to the short-vector configuration only around the
 * rotation code and is restored to its entry value before every call and
 * before returning.
 *
 * Context word offsets used: 5 = mdct_bits, 9 = fft_calc (both per the
 * original comments); 2, 6 and 7 are loaded into REVTAB, TCOS and TSIN
 * (presumably the matching FFTContext table pointers - confirm against the
 * C structure layout).
 */
function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        teq     ip, #6
        bne     10f                         @ not the unrolled size: take the generic path

        @ ---- fixed-size path: mdct_bits == 6 ----
 .set n, 1<<6
 .set n2, n/2
 .set n4, n/4
 .set n8, n/8

        push    {v1-v5,lr}                  @ 6 words + 12 below = 72 bytes, sp stays 8-byte aligned
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     OUT, ORIGOUT                @ a2 is about to be reused as J0
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]

 .set k, 0
 .rept n8/2
        prerotation_innerloop
 .endr

        fmxr    FPSCR, OLDFPSCR             @ callee expects the entry FPSCR
        mov     a1, OUT
        bl      X(ff_fft16_vfp)
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

 .set k, 0
        postrotation_innerloop , head       @ pipeline prologue
 .rept n8/2 - 1
        postrotation_innerloop tail, head
 .endr
        postrotation_innerloop tail         @ pipeline epilogue

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}

        @ ---- generic path: any other mdct_bits ----
10:
        @ a4 is stored purely as padding: 10 words here + 12 (vpush) + 2 (push
        @ of CONTEXT/OLDFPSCR below) = 96 bytes, which keeps sp 8-byte aligned
        @ at the indirect call as the AAPCS requires at public interfaces.
        @ Without it the frame was 92 bytes. a4 is a scratch register, so
        @ reloading it in the epilogue is harmless.
        push    {a4,v1-v6,sl,fp,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     OUT, ORIGOUT                @ a2 is about to be reused as J0
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits

        push    {CONTEXT,OLDFPSCR}          @ v5 becomes REVTAB_HI, a1 must survive the call
        add     IN_HI, IN, lr, lsl #1       @ IN + 2 * (1 << mdct_bits) bytes
        add     REVTAB_HI, REVTAB, lr, lsr #1
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
0:      prerotation_innerloop_rolled
        teq     IN, IN_HI                   @ low and high pointers meet in the middle
        bne     0b
        ldmia   sp, {CONTEXT,OLDFPSCR}      @ reload without popping, needed again after the call

        mov     ORIGOUT, OUT                @ second argument for the call
        fmxr    FPSCR, OLDFPSCR             @ callee expects the entry FPSCR
        ldr     ip, [CONTEXT, #9*4]
        blx     ip                          @ s->fft_calc(s, output)

        pop     {CONTEXT,OLDFPSCR}
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits
        sub     TCOS, TCOS, lr, lsr #1      @ step back by (1 << mdct_bits) / 2 bytes
        sub     TSIN, TSIN, lr, lsr #1
        add     OUT_HI, OUT, lr, lsl #1     @ OUT + 2 * (1 << mdct_bits) bytes
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0   @ pipeline prologue
        b       1f
0:      add     OUT, OUT, #32               @ two steps (2 x 16 bytes) per loop pass
        sub     OUT_HI, OUT_HI, #32
        postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1:      postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
        teq     TSIN, TSIN_HI               @ low and high pointers meet in the middle
        bne     0b
        postrotation_innerloop_rolled tail,,,,,, s24,, 16              @ pipeline epilogue

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {a4,v1-v6,sl,fp,pc}         @ a4 matches the padding word pushed at label 10
endfunc

        .unreq  CONTEXT
        .unreq  ORIGOUT
        .unreq  IN
        .unreq  OUT
        .unreq  REVTAB
        .unreq  TCOS
        .unreq  TSIN
        .unreq  OLDFPSCR
        .unreq  J0
        .unreq  J1
        .unreq  J2
        .unreq  J3
        .unreq  REVTAB_HI
        .unreq  IN_HI
        .unreq  OUT_HI
        .unreq  TCOS_HI
        .unreq  TSIN_HI