/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Conventions shared by every routine in this file:
 *   - Arguments arrive in x0..x4 / w2..w4 in AAPCS64 order.
 *   - Only v0-v7 and v16-v24 are written, so the callee-saved low halves
 *     of v8-v15 never need to be spilled.
 *   - Every loop is bottom-tested (subs + b.gt): the body runs once before
 *     the counter is checked.
 *     NOTE(review): callers presumably always pass a positive count; confirm.
 */

/*
 * ff_ps_add_squares_neon
 *
 * In:     x0 = float array, accumulated in place
 *         x1 = float pairs, two floats consumed per x0 element
 *         w2 = element count
 * Effect: x0[i] += x1[2*i]^2 + x1[2*i + 1]^2
 *
 * Four elements are handled per iteration, so 16 bytes of x0 and 32 bytes
 * of x1 are touched each time round even if w2 is not a multiple of four.
 * Clobbers: v0-v3, x0, x1, w2, flags.
 */
function ff_ps_add_squares_neon, export=1
1:      ld1             {v0.4s, v1.4s}, [x1], #32      // four input pairs
        fmul            v0.4s, v0.4s, v0.4s            // square every lane
        fmul            v1.4s, v1.4s, v1.4s
        faddp           v2.4s, v0.4s, v1.4s            // sum adjacent lanes: one value per pair
        ld1             {v3.4s}, [x0]                  // current accumulator values
        fadd            v3.4s, v3.4s, v2.4s
        st1             {v3.4s}, [x0], #16
        subs            w2, w2, #4
        b.gt            1b
        ret
endfunc

/*
 * ff_ps_mul_pair_single_neon
 *
 * In:     x0 = destination float pairs
 *         x1 = source float pairs
 *         x2 = source floats, one per pair
 *         w3 = pair count
 * Effect: both floats of pair i from x1 are scaled by x2[i] and stored to x0.
 *
 * Four pairs are handled per iteration (32 bytes of x0/x1, 16 bytes of x2).
 * Clobbers: v0-v4, x0-x2, w3, flags.
 */
function ff_ps_mul_pair_single_neon, export=1
1:      ld1             {v0.4s, v1.4s}, [x1], #32      // four pairs
        ld1             {v2.4s}, [x2], #16             // four scale factors s0..s3
        zip1            v3.4s, v2.4s, v2.4s            // {s0, s0, s1, s1}
        zip2            v4.4s, v2.4s, v2.4s            // {s2, s2, s3, s3}
        fmul            v0.4s, v0.4s, v3.4s
        fmul            v1.4s, v1.4s, v4.4s
        st1             {v0.4s, v1.4s}, [x0], #32
        subs            w3, w3, #4
        b.gt            1b
        ret
endfunc

/*
 * ff_ps_stereo_interpolate_neon
 *
 * In:     x0 = first channel float pairs (l), updated in place
 *         x1 = second channel float pairs (r), updated in place
 *         x2 = four coefficients h0..h3 (only read)
 *         x3 = four per-sample coefficient increments (only read)
 *         w4 = pair count
 * Effect: per pair, the coefficients are first advanced by their increments,
 *         then
 *           l = l * h0 + r * h2
 *           r = l * h1 + r * h3
 *         is applied to both floats of the pair, using the old l and r on
 *         the right-hand side.
 * Clobbers: v0-v7, x0, x1, w4, flags.
 */
function ff_ps_stereo_interpolate_neon, export=1
        ld1             {v0.4s}, [x2]                  // h0..h3
        ld1             {v1.4s}, [x3]                  // increments
        zip1            v4.4s, v0.4s, v0.4s            // {h0, h0, h1, h1}
        zip2            v5.4s, v0.4s, v0.4s            // {h2, h2, h3, h3}
        zip1            v6.4s, v1.4s, v1.4s            // increments, same lane layout as v4
        zip2            v7.4s, v1.4s, v1.4s            // increments, same lane layout as v5
1:      ld1             {v2.2s}, [x0]                  // {l0, l1}
        ld1             {v3.2s}, [x1]                  // {r0, r1}
        fadd            v4.4s, v4.4s, v6.4s            // advance coefficients
        fadd            v5.4s, v5.4s, v7.4s
        mov             v2.d[1], v2.d[0]               // {l0, l1, l0, l1}
        mov             v3.d[1], v3.d[0]               // {r0, r1, r0, r1}
        fmul            v2.4s, v2.4s, v4.4s
        fmla            v2.4s, v3.4s, v5.4s
        st1             {v2.d}[0], [x0], #8            // low half  -> new l pair
        st1             {v2.d}[1], [x1], #8            // high half -> new r pair
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc

/*
 * ff_ps_stereo_interpolate_ipdopd_neon
 *
 * Same register interface as ff_ps_stereo_interpolate_neon, but x2 and x3
 * each point at eight floats (h0..h7 and their increments).
 *
 * Effect: per pair {x0, x1} of each channel, after advancing the
 *         coefficients, with old l and r on the right-hand side:
 *           l.x0 = l0*h0 + r0*h2 - l1*h4 - r1*h6
 *           l.x1 = l1*h0 + r1*h2 + l0*h4 + r0*h6
 *           r.x0 = l0*h1 + r0*h3 - l1*h5 - r1*h7
 *           r.x1 = l1*h1 + r1*h3 + l0*h5 + r0*h7
 * Clobbers: v0-v4, v6, v7, v16-v23, x0, x1, w4, flags.
 */
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
        ld1             {v0.4s, v1.4s}, [x2]           // v0 = h0..h3, v1 = h4..h7
        ld1             {v6.4s, v7.4s}, [x3]           // matching increments
        fneg            v2.4s, v1.4s                   // -h4..-h7
        fneg            v3.4s, v7.4s                   // negated increments for h4..h7
        zip1            v16.4s, v0.4s, v0.4s           // {h0, h0, h1, h1}
        zip2            v17.4s, v0.4s, v0.4s           // {h2, h2, h3, h3}
        zip1            v18.4s, v2.4s, v1.4s           // {-h4, h4, -h5, h5}
        zip2            v19.4s, v2.4s, v1.4s           // {-h6, h6, -h7, h7}
        zip1            v20.4s, v6.4s, v6.4s           // increments laid out like v16
        zip2            v21.4s, v6.4s, v6.4s           // increments laid out like v17
        zip1            v22.4s, v3.4s, v7.4s           // increments laid out like v18
        zip2            v23.4s, v3.4s, v7.4s           // increments laid out like v19
1:      ld1             {v2.2s}, [x0]                  // {l0, l1}
        ld1             {v3.2s}, [x1]                  // {r0, r1}
        fadd            v16.4s, v16.4s, v20.4s         // advance h0..h3
        fadd            v17.4s, v17.4s, v21.4s
        mov             v2.d[1], v2.d[0]               // {l0, l1, l0, l1}
        mov             v3.d[1], v3.d[0]               // {r0, r1, r0, r1}
        fmul            v4.4s, v2.4s, v16.4s
        fmla            v4.4s, v3.4s, v17.4s
        fadd            v18.4s, v18.4s, v22.4s         // advance the signed h4..h7 lanes
        fadd            v19.4s, v19.4s, v23.4s
        ext             v2.16b, v2.16b, v2.16b, #4     // rotate one lane: {l1, l0, l1, l0}
        ext             v3.16b, v3.16b, v3.16b, #4     // rotate one lane: {r1, r0, r1, r0}
        fmla            v4.4s, v2.4s, v18.4s
        fmla            v4.4s, v3.4s, v19.4s
        st1             {v4.d}[0], [x0], #8            // low half  -> new l pair
        st1             {v4.d}[1], [x1], #8            // high half -> new r pair
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc

/*
 * ff_ps_hybrid_analysis_neon
 *
 * In:     x0 = output {re, im} float pairs, one pair stored per iteration
 *         x1 = 13 input {re, im} float pairs (104 bytes are read)
 *         x2 = filter data; each iteration consumes 64 bytes, laid out as
 *              {re, im} pairs, of which the im part of pair 6 and all of
 *              the last 8 bytes are not used
 *         x3 = distance between consecutive outputs, in 8-byte units
 *         w4 = number of outputs
 * Effect: with in[k] = {re_k, im_k} and filter pair f[k] = {fr_k, fi_k}:
 *           out.re = fr_6 * re_6
 *                  + sum(k = 0..5) fr_k * (re_k + re_(12-k))
 *                                - fi_k * (im_k - im_(12-k))
 *           out.im = fr_6 * im_6
 *                  + sum(k = 0..5) fr_k * (im_k + im_(12-k))
 *                                + fi_k * (re_k - re_(12-k))
 *
 * The input sums and differences do not depend on the filter, so they are
 * formed once ahead of the loop and kept in v16-v21 and v24.
 * Clobbers: v0-v7, v16-v24, x0-x3, w4, flags.
 */
function ff_ps_hybrid_analysis_neon, export=1
        lsl             x3, x3, #3                     // stride: 8-byte units -> bytes
        ld2             {v0.4s, v1.4s}, [x1], #32      // v0 = re0..re3,  v1 = im0..im3
        ld2             {v2.2s, v3.2s}, [x1], #16      // v2 = re4,re5    v3 = im4,im5
        ld1             {v24.2s}, [x1], #8             // {re6, im6}: centre tap
        ld2             {v4.2s, v5.2s}, [x1], #16      // v4 = re7,re8    v5 = im7,im8
        ld2             {v6.4s, v7.4s}, [x1]           // v6 = re9..re12, v7 = im9..im12
        rev64           v6.4s, v6.4s                   // rev64 + ext #8 = full lane reversal:
        rev64           v7.4s, v7.4s
        ext             v6.16b, v6.16b, v6.16b, #8     // {re12, re11, re10, re9}
        ext             v7.16b, v7.16b, v7.16b, #8     // {im12, im11, im10, im9}
        rev64           v4.2s, v4.2s                   // {re8, re7}
        rev64           v5.2s, v5.2s                   // {im8, im7}
        mov             v2.d[1], v3.d[0]               // {re4, re5, im4, im5}
        mov             v4.d[1], v5.d[0]               // {re8, re7, im8, im7}
        mov             v5.d[1], v2.d[0]               // {im8, im7, re4, re5}
        mov             v3.d[1], v4.d[0]               // {im4, im5, re8, re7}
        fadd            v16.4s, v0.4s, v6.4s           // re_k + re_(12-k), k = 0..3
        fadd            v17.4s, v1.4s, v7.4s           // im_k + im_(12-k), k = 0..3
        fsub            v18.4s, v1.4s, v7.4s           // im_k - im_(12-k), k = 0..3
        fsub            v19.4s, v0.4s, v6.4s           // re_k - re_(12-k), k = 0..3
        fadd            v22.4s, v2.4s, v4.4s           // {re4+re8, re5+re7, im4+im8, im5+im7}
        fsub            v23.4s, v5.4s, v3.4s           // {im8-im4, im7-im5, re4-re8, re5-re7}
        trn1            v20.2d, v22.2d, v23.2d         // {re4+re8, re5+re7, im8-im4, im7-im5}
        trn2            v21.2d, v22.2d, v23.2d         // {im4+im8, im5+im7, re4-re8, re5-re7}
1:      ld2             {v2.4s, v3.4s}, [x2], #32      // v2 = fr0..fr3, v3 = fi0..fi3
        ld2             {v4.2s, v5.2s}, [x2], #16      // v4 = fr4,fr5   v5 = fi4,fi5
        ld1             {v6.2s}, [x2], #8              // {fr6, fi6}
        add             x2, x2, #8                     // skip the unused last 8 bytes
        mov             v4.d[1], v5.d[0]               // {fr4, fr5, fi4, fi5}
        mov             v6.s[1], v6.s[0]               // {fr6, fr6}
        fmul            v6.2s, v6.2s, v24.2s           // {fr6*re6, fr6*im6}
        fmul            v0.4s, v2.4s, v16.4s           // re partial products
        fmul            v1.4s, v2.4s, v17.4s           // im partial products
        fmls            v0.4s, v3.4s, v18.4s
        fmla            v1.4s, v3.4s, v19.4s
        fmla            v0.4s, v4.4s, v20.4s           // taps 4 and 5, re part
        fmla            v1.4s, v4.4s, v21.4s           // taps 4 and 5, im part
        faddp           v0.4s, v0.4s, v1.4s            // two pairwise adds reduce each
        faddp           v0.4s, v0.4s, v0.4s            // accumulator: low half = {re, im}
        fadd            v0.2s, v0.2s, v6.2s            // add the centre tap
        st1             {v0.2s}, [x0], x3              // store, then step by the byte stride
        subs            w4, w4, #1
        b.gt            1b
        ret
endfunc