/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

IMDCT           .req    r0
ORIG_P_SB       .req    r1
P_SB_OFF        .req    r2
I               .req    r0
P_SB2_UP        .req    r1
OLDFPSCR        .req    r2
P_SB2_DN        .req    r3
P_WIN_DN        .req    r4
P_OUT_DN        .req    r5
P_SB            .req    r6
J_WRAP          .req    r7
P_WIN_UP        .req    r12
P_OUT_UP        .req    r14

SCALE           .req    s0
SBUF_DAT_REV0   .req    s4
SBUF_DAT_REV1   .req    s5
SBUF_DAT_REV2   .req    s6
SBUF_DAT_REV3   .req    s7
VA0             .req    s8
VA3             .req    s11
VB0             .req    s12
VB3             .req    s15
VC0             .req    s8
VC3             .req    s11
VD0             .req    s12
VD3             .req    s15
SBUF_DAT0       .req    s16
SBUF_DAT1       .req    s17
SBUF_DAT2       .req    s18
SBUF_DAT3       .req    s19
SBUF_DAT_ALT0   .req    s20
SBUF_DAT_ALT1   .req    s21
SBUF_DAT_ALT2   .req    s22
SBUF_DAT_ALT3   .req    s23
WIN_DN_DAT0     .req    s24
WIN_UP_DAT0     .req    s28


.macro inner_loop  half, tail, head
 .if (OFFSET & (64*4)) == 0                @ even numbered call
SBUF_DAT_THIS0  .req    SBUF_DAT0
SBUF_DAT_THIS1  .req    SBUF_DAT1
SBUF_DAT_THIS2  .req    SBUF_DAT2
SBUF_DAT_THIS3  .req    SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
SBUF_DAT_THIS0  .req    SBUF_DAT_ALT0
SBUF_DAT_THIS1  .req    SBUF_DAT_ALT1
SBUF_DAT_THIS2  .req    SBUF_DAT_ALT2
SBUF_DAT_THIS3  .req    SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f                         @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
2:
  .set J, J - 64
  .set OFFSET, OFFSET + 64*4
 .endif
 .unreq SBUF_DAT_THIS0
 .unreq SBUF_DAT_THIS1
 .unreq SBUF_DAT_THIS2
 .unreq SBUF_DAT_THIS3
.endm


/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, lsl #2  @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                   @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63            @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]             @ rotate offset, modulo buffer size, ready for next call
        ldr     a3, [sp, #(16+6+2)*4]      @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE                 @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      X(ff_imdct_half_vfp)
VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000            @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  ab,, head
 .rept 7
        inner_loop  ab, tail, head
 .endr
        inner_loop  ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE            @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero                   @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero                   @ d6 = VD0
        vldr.d  d7, zero
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  cd,, head
 .rept 7
        inner_loop  cd, tail, head
 .endr
        inner_loop  cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

        .unreq  IMDCT
        .unreq  ORIG_P_SB
        .unreq  P_SB_OFF
        .unreq  I
        .unreq  P_SB2_UP
        .unreq  OLDFPSCR
        .unreq  P_SB2_DN
        .unreq  P_WIN_DN
        .unreq  P_OUT_DN
        .unreq  P_SB
        .unreq  J_WRAP
        .unreq  P_WIN_UP
        .unreq  P_OUT_UP

        .unreq  SCALE
        .unreq  SBUF_DAT_REV0
        .unreq  SBUF_DAT_REV1
        .unreq  SBUF_DAT_REV2
        .unreq  SBUF_DAT_REV3
        .unreq  VA0
        .unreq  VA3
        .unreq  VB0
        .unreq  VB3
        .unreq  VC0
        .unreq  VC3
        .unreq  VD0
        .unreq  VD3
        .unreq  SBUF_DAT0
        .unreq  SBUF_DAT1
        .unreq  SBUF_DAT2
        .unreq  SBUF_DAT3
        .unreq  SBUF_DAT_ALT0
        .unreq  SBUF_DAT_ALT1
        .unreq  SBUF_DAT_ALT2
        .unreq  SBUF_DAT_ALT3
        .unreq  WIN_DN_DAT0
        .unreq  WIN_UP_DAT0

        .align  3
zero:   .word   0, 0