/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Inverse MDCT, second half only.
 *
 * In:   x0 = MDCT context pointer; the code reads revtab at offset 8,
 *            mdct_bits at offset 28 and the tcos twiddle table at offset 32
 *            (offsets must match the C-side context layout — NOTE(review):
 *            confirm against the struct definition if it changes)
 *       x1 = output buffer (complex pairs, scatter-stored via revtab)
 *       x2 = input coefficients
 *
 * Structure: pre-rotation (multiply by twiddles, revtab reorder) ->
 * complex FFT (ff_fft_calc_neon) -> post-rotation writing the two output
 * quarters from the middle outward.
 *
 * Clobbers: x0-x14 among the volatile GPRs, v0-v7 and v16-v25; x19/x20/x30
 * are saved in a 32-byte frame so they survive the nested bl.
 */
function ff_imdct_half_neon, export=1
        // 32-byte frame: x19/x20 at [sp], return address at [sp,#16]
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]             // mdct_bits
        ldr             x4,  [x0, #32]             // tcos
        ldr             x3,  [x0, #8]              // revtab
        lsl             x12, x12, x14              // n  = 1 << nbits
        lsr             x14, x12, #2               // n4 = n >> 2 (pre-rotation pair count)
        add             x7,  x2,  x12, lsl #1      // x7 -> end of input
        mov             x12, #-16                  // step x7 backwards through the input
        sub             x7,  x7,  #16

        // Software-pipelined pre-rotation: prime the first c*x products
        // before entering the loop so loads and fmuls overlap.
        ld2             {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
        ld2             {v0.2s,v1.2s},   [x2], #16 // d0 =m0,x d1 =m1,x
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s2
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
1:
        subs            x14, x14, #2
        ldr             w6,  [x3], #4              // next packed revtab entry (two 16-bit indices)
        fmul            v4.2s,  v0.2s,  v3.2s
        fmul            v5.2s,  v17.2s, v3.2s
        fsub            v4.2s,  v6.2s,  v4.2s      // re = c*x1 - s*x0  (complex twiddle)
        fadd            v5.2s,  v5.2s,  v7.2s      // im = s*x1 + c*x0
        ubfm            x8,  x6, #16, #31          // high halfword: second output index
        ubfm            x6,  x6, #0,  #15          // low  halfword: first output index
        add             x8,  x1, x8, lsl #3        // scale by 8 = sizeof(complex float)
        add             x6,  x1, x6, lsl #3
        b.eq            2f                         // last iteration: skip the next-load prefetch
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s},   [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s2
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
        st2             {v4.s,v5.s}[0], [x6]       // scatter the two rotated pairs via revtab
        st2             {v4.s,v5.s}[1], [x8]
        b               1b
2:      // pipeline drain: store the final pair computed above
        st2             {v4.s,v5.s}[0], [x6]
        st2             {v4.s,v5.s}[1], [x8]

        // FFT in place on the pre-rotated data; keep ctx/output across the call
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]            // mdct_bits
        ldr             x4,  [x19, #32]            // tcos
        lsl             x12, x12, x14              // n  = 1 << nbits
        lsr             x14, x12, #3               // n8 = n >> 3 (post-rotation pair count)

        // Post-rotation walks from the midpoint outward:
        // x3/x0 go down from the middle, x6/x8 go up; twiddles likewise (x1 down, x4 up).
        add             x4,  x4,  x14, lsl #3
        add             x6,  x20, x14, lsl #3
        sub             x1,  x4,  #16
        sub             x3,  x6,  #16

        mov             x7,  #-16
        mov             x8,  x6
        mov             x0,  x3

        ld2             {v0.2s,v1.2s},   [x3], x7  // d0 =i1,r1 d1 =i0,r0
        ld2             {v20.2s,v21.2s}, [x6], #16 // d20=i2,r2 d21=i3,r3
        ld2             {v16.2s,v17.2s}, [x1], x7  // d16=c1,c0 d18=s1,s0
3:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s
        ld2             {v18.2s,v19.2s}, [x4], #16 // d17=c2,c3 d19=s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s
        fmul            v6.2s,  v21.2s, v19.2s
        fmul            v5.2s,  v20.2s, v19.2s
        fmul            v22.2s, v1.2s,  v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s,  v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s,  v7.2s,  v22.2s     // combine into rotated re/im for both halves
        fadd            v5.2s,  v5.2s,  v23.2s
        fsub            v4.2s,  v4.2s,  v24.2s
        fsub            v6.2s,  v6.2s,  v25.2s
        b.eq            4f                         // last iteration: fall through to drain
        ld2             {v0.2s,v1.2s},   [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7  // d16=c1,c0 d18=s1,s0
        rev64           v5.2s,  v5.2s              // downward-going half is stored reversed
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0], x7
        st2             {v6.2s,v7.2s},   [x8], #16
        b               3b
4:      // pipeline drain: store the final two pairs without advancing pointers
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0]
        st2             {v6.2s,v7.2s},   [x8]

        // restore callee-saved regs and pop the 32-byte frame
        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        add             sp,  sp,  #32

        ret
endfunc

/*
 * Full inverse MDCT.
 *
 * In:   x0 = MDCT context pointer (forwarded to ff_imdct_half_neon)
 *       x1 = output buffer of n = 1 << mdct_bits floats
 *       x2 = input coefficients (forwarded in x2)
 *
 * Computes the half transform into the second half of the output, then
 * expands it to the full window by mirroring: the first quarter is the
 * negated, order-reversed middle, and the last half is mirrored in place.
 */
function ff_imdct_calc_neon, export=1
        // 32-byte frame: x19/x20 at [sp], return address at [sp,#16]
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        ldr             w3,  [x0, #28]             // mdct_bits
        mov             x19, #1
        mov             x20, x1                    // keep output base across the call
        lsl             x19, x19, x3               // n = 1 << mdct_bits (bytes count below)
        add             x1,  x1,  x19              // half-transform lands mid-buffer

        bl              X(ff_imdct_half_neon)

        // x0: writes backwards from the end (mirror), x1: reads forwards from
        // the midpoint, x2: reads backwards, x20: writes forwards from start.
        add             x0,  x20, x19, lsl #2
        add             x1,  x20, x19, lsl #1
        sub             x0,  x0,  #8
        sub             x2,  x1,  #16
        mov             x3,  #-16
        mov             x6,  #-8
1:
        ld1             {v0.4s}, [x2], x3
        prfum           pldl1keep, [x0, #-16]      // prefetch the store-side cache line
        rev64           v0.4s,  v0.4s
        ld1             {v2.2s,v3.2s}, [x1], #16
        fneg            v4.4s,  v0.4s              // first quarter = -reversed(middle)
        prfum           pldl1keep, [x2, #-16]
        rev64           v2.2s,  v2.2s
        rev64           v3.2s,  v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8 // swap halves to finish the 4-lane reversal
        st1             {v2.2s}, [x0], x6          // mirrored tail, written backwards
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16
        subs            x19, x19, #16              // 16 bytes (4 floats) per iteration
        b.gt            1b

        // pop the 32-byte frame via two post-indexed loads
        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16

        ret
endfunc


/*
 * Forward MDCT.
 *
 * In:   x0 = MDCT context pointer (revtab at offset 8, mdct_bits at 28,
 *            tcos at 32 — same layout as ff_imdct_half_neon above)
 *       x1 = output buffer (complex pairs, scatter-stored via revtab)
 *       x2 = input samples, 2n floats
 *
 * Structure: fold the four input quarters and pre-rotate by the twiddles
 * (revtab reorder) -> complex FFT -> post-rotation producing the n/2
 * output coefficients.
 *
 * Clobbers: x0-x14 among the volatile GPRs, v0-v7 and v16-v31; x19/x20/x30
 * are saved in a 32-byte frame so they survive the nested bl.
 */
function ff_mdct_calc_neon, export=1
        // 32-byte frame: x19/x20 at [sp], return address at [sp,#16]
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]             // mdct_bits
        ldr             x4,  [x0, #32]             // tcos
        ldr             x3,  [x0, #8]              // revtab
        lsl             x14, x12, x14              // n = 1 << nbits
        // Four streams over the input: two ascending (in4u, in3u) and two
        // descending (in4d, in3d), folded pairwise below.
        add             x7,  x2,  x14              // in4u
        sub             x9,  x7,  #16              // in4d
        add             x2,  x7,  x14, lsl #1      // in3u
        add             x8,  x9,  x14, lsl #1      // in3d
        add             x5,  x4,  x14, lsl #1      // twiddles read from both ends
        sub             x5,  x5,  #16
        sub             x3,  x3,  #4               // pre-bias for the pre-indexed ldr below
        mov             x12, #-16
        lsr             x13, x14, #1               // offset of the second revtab half

        // Software-pipelined fold + pre-rotation: prime the first folded
        // values before the loop so loads overlap the arithmetic.
        ld2             {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16 // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s             // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s             // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16 // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s      // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s              // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s              // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s     // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s      // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s      // in2u+in1d     -I
1:
        fmul            v7.2s,  v0.2s,  v21.2s     //  I*s
        ldr             w10, [x3, x13]             // revtab entry, second half
        fmul            v6.2s,  v2.2s,  v20.2s     // -R*c
        ldr             w6,  [x3, #4]!             // revtab entry, first half (pre-indexed)
        fmul            v4.2s,  v2.2s,  v21.2s     // -R*s
        fmul            v5.2s,  v0.2s,  v20.2s     //  I*c
        fmul            v24.2s, v16.2s, v30.2s     //  R*c
        fmul            v25.2s, v18.2s, v31.2s     // -I*s
        fmul            v22.2s, v16.2s, v31.2s     //  R*s
        fmul            v23.2s, v18.2s, v30.2s     //  I*c
        subs            x14, x14, #16
        subs            x13, x13, #8
        fsub            v6.2s,  v6.2s,  v7.2s      // -R*c-I*s
        fadd            v7.2s,  v4.2s,  v5.2s      // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s     //  I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s     //  R*s-I*c
        b.eq            1f                         // last iteration: skip the next-load prefetch
        mov             x12, #-16
        ld2             {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s,  v7.2s              //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16 // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s             // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s             // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16 // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s      // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s              // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s              // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s     // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s      // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s      // in2u+in1d     -I
        // scatter the two rotated pairs of each half via the revtab indices
        ubfm            x12, x6, #16, #31
        ubfm            x6,  x6, #0,  #15
        add             x12, x1, x12, lsl #3       // scale by 8 = sizeof(complex float)
        add             x6,  x1, x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1, x6,  lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
1:      // pipeline drain: negate and scatter the final batch
        fneg            v7.2s,  v7.2s              //  R*s-I*c
        ubfm            x12, x6, #16, #31
        ubfm            x6,  x6, #0,  #15
        add             x12, x1, x12, lsl #3
        add             x6,  x1, x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1, x6,  lsl #3
        add             x10, x1, x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        // FFT in place on the folded data; keep ctx/output across the call
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]            // mdct_bits
        ldr             x4,  [x19, #32]            // tcos
        lsl             x12, x12, x14              // n  = 1 << nbits
        lsr             x14, x12, #3               // n8 = n >> 3 (post-rotation pair count)

        // Post-rotation walks from the midpoint outward, same pointer scheme
        // as the imdct_half post-rotation above.
        add             x4,  x4,  x14, lsl #3
        add             x6,  x20, x14, lsl #3
        sub             x1,  x4,  #16
        sub             x3,  x6,  #16

        mov             x7,  #-16
        mov             x8,  x6
        mov             x0,  x3

        ld2             {v0.2s,v1.2s},   [x3], x7  // d0 =r1,i1 d1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7  // c1,c0 s1,s0
1:
        subs            x14, x14, #2
        fmul            v7.2s,  v0.2s,  v17.2s     // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s     // i1*s1,i0*s0
        fmul            v6.2s,  v21.2s, v19.2s     // i2*s2,i3*s3
        fmul            v5.2s,  v20.2s, v19.2s     // r2*s2,r3*s3
        fmul            v24.2s, v0.2s,  v16.2s     // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s     // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s     // i2*c2,i3*c3
        fmul            v23.2s, v1.2s,  v16.2s     // i1*c1,i0*c0
        fadd            v4.2s,  v4.2s,  v24.2s     // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s,  v6.2s,  v25.2s     // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s,  v22.2s, v5.2s      // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s,  v23.2s, v7.2s      // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s,  v4.2s              // forward transform negates the real parts
        fneg            v6.2s,  v6.2s
        b.eq            1f                         // last iteration: fall through to drain
        ld2             {v0.2s, v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7  // c1,c0 s1,s0
        rev64           v5.2s,  v5.2s              // downward-going half is stored reversed
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0], x7
        st2             {v6.2s,v7.2s},   [x8], #16
        b               1b
1:      // pipeline drain: store the final two pairs without advancing pointers
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},   [x0]
        st2             {v6.2s,v7.2s},   [x8]

        // pop the 32-byte frame via two post-indexed loads
        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16
        ret
endfunc