/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Element-wise product of two float vectors.
 *
 *   x0 = destination (written, advanced)
 *   x1 = first source (read, advanced)
 *   x2 = second source (read, advanced)
 *   w3 = element count
 *
 * Handles 16 floats per iteration. The loop only terminates when the counter
 * hits exactly zero (b.ne), so the count has to be a non-zero multiple of 16.
 * NOTE(review): presumably the C prototype is (dst, src0, src1, len) - confirm
 * against the declaring header.
 * Uses v0-v7 and v16-v19 as scratch.
 */
function ff_vector_fmul_neon, export=1
1:      subs            w3,  w3,  #16
        ld1             {v0.4S, v1.4S}, [x1], #32
        ld1             {v2.4S, v3.4S}, [x1], #32
        ld1             {v4.4S, v5.4S}, [x2], #32
        ld1             {v6.4S, v7.4S}, [x2], #32
        fmul            v16.4S, v0.4S,  v4.4S
        fmul            v17.4S, v1.4S,  v5.4S
        fmul            v18.4S, v2.4S,  v6.4S
        fmul            v19.4S, v3.4S,  v7.4S
        st1             {v16.4S, v17.4S}, [x0], #32
        st1             {v18.4S, v19.4S}, [x0], #32
        b.ne            1b                      // flags still from the subs above
        ret
endfunc

/*
 * Multiply-accumulate with a scalar: each destination element gets the
 * matching source element times the scalar added to it.
 *
 *   x0      = destination (read, then written in place, advanced)
 *   x1      = source (read, advanced)
 *   v0.S[0] = scalar multiplier
 *   w2      = element count
 *
 * Handles 16 floats per iteration and exits only on an exactly-zero counter
 * (b.ne), so the count has to be a non-zero multiple of 16.
 * Uses x3, v4-v7 and v16-v19 as scratch.
 */
function ff_vector_fmac_scalar_neon, export=1
        mov             x3,  #-32               // post-index used to rewind x0 after the reads
1:      subs            w2,  w2,  #16
        ld1             {v16.4S, v17.4S}, [x0], #32
        ld1             {v18.4S, v19.4S}, [x0], x3      // x0 is back at the start of this 64-byte chunk
        ld1             {v4.4S,  v5.4S},  [x1], #32
        ld1             {v6.4S,  v7.4S},  [x1], #32
        fmla            v16.4S, v4.4S,  v0.S[0]
        fmla            v17.4S, v5.4S,  v0.S[0]
        fmla            v18.4S, v6.4S,  v0.S[0]
        fmla            v19.4S, v7.4S,  v0.S[0]
        st1             {v16.4S, v17.4S}, [x0], #32
        st1             {v18.4S, v19.4S}, [x0], #32
        b.ne            1b
        ret
endfunc

/*
 * Multiply a float vector by a scalar.
 *
 *   x0      = destination (written, advanced)
 *   x1      = source (read, advanced)
 *   v0.S[0] = scalar multiplier (broadcast into v16 before v0 is reused)
 *   w2      = element count
 *
 * The bulk (count rounded down to a multiple of 16) goes through a
 * software-pipelined loop of 16 floats per iteration; whatever is left
 * (count & 15) is handled four floats at a time. The main loop leaves
 * before prefetching the next chunk, so it never reads beyond the data it
 * processes. The tail loop body runs before its counter is tested.
 * Uses w3, w4, v0-v3 and v16 as scratch.
 */
function ff_vector_fmul_scalar_neon, export=1
        mov             w4,  #15
        bics            w3,  w2,  w4            // w3 = count & ~15
        dup             v16.4S, v0.S[0]
        b.eq            3f                      // fewer than 16 elements: tail loop only
        ld1             {v0.4S, v1.4S}, [x1], #32
1:      subs            w3,  w3,  #16
        fmul            v0.4S,  v0.4S,  v16.4S
        ld1             {v2.4S, v3.4S}, [x1], #32
        fmul            v1.4S,  v1.4S,  v16.4S
        fmul            v2.4S,  v2.4S,  v16.4S
        st1             {v0.4S, v1.4S}, [x0], #32
        fmul            v3.4S,  v3.4S,  v16.4S
        b.eq            2f                      // last full chunk: skip the prefetch
        ld1             {v0.4S, v1.4S}, [x1], #32
        st1             {v2.4S, v3.4S}, [x0], #32
        b               1b
2:      ands            w2,  w2,  #15           // w2 = leftover element count
        st1             {v2.4S, v3.4S}, [x0], #32
        b.eq            4f
3:      ld1             {v0.4S}, [x1], #16
        fmul            v0.4S,  v0.4S,  v16.4S
        st1             {v0.4S}, [x0], #16
        subs            w2,  w2,  #4
        b.gt            3b
4:      ret
endfunc

/*
 * Multiply a double vector by a scalar.
 *
 *   x0      = destination (written, advanced)
 *   x1      = source (read, advanced)
 *   v0.D[0] = scalar multiplier (broadcast into v16 before v0 is reused)
 *   w2      = element count
 *
 * Software-pipelined, 8 doubles per iteration; iterates while the remaining
 * count is positive. The first chunk is processed before the counter is
 * tested, so at least 8 elements are always read and written.
 *
 * The loop is left before the prefetch of the next chunk, so no load touches
 * memory beyond the last chunk that is actually processed.
 * Uses v0-v3 and v16 as scratch.
 */
function ff_vector_dmul_scalar_neon, export=1
        dup             v16.2D, v0.D[0]
        ld1             {v0.2D, v1.2D}, [x1], #32
1:      subs            w2,  w2,  #8
        fmul            v0.2D,  v0.2D,  v16.2D
        ld1             {v2.2D, v3.2D}, [x1], #32
        fmul            v1.2D,  v1.2D,  v16.2D
        fmul            v2.2D,  v2.2D,  v16.2D
        st1             {v0.2D, v1.2D}, [x0], #32
        fmul            v3.2D,  v3.2D,  v16.2D
        b.le            2f                      // nothing left: do not prefetch past the end
        ld1             {v0.2D, v1.2D}, [x1], #32
        st1             {v2.2D, v3.2D}, [x0], #32
        b               1b
2:      st1             {v2.2D, v3.2D}, [x0], #32      // second half of the final chunk
        ret
endfunc

/*
 * Windowed overlap of two float vectors.
 *
 *   x0 = dst  (2 * len floats are written: ascending from the start via x0,
 *              descending from the end via x5)
 *   x1 = src0 (read ascending)
 *   x2 = src1 (read descending, starting at its last four floats)
 *   x3 = win  (read ascending via x3 and descending from the end via x6)
 *   w4 = len  (sign-extended to 64 bits)
 *
 * Per group of four, with s0/wi taken from the ascending streams and s1/wj
 * from the descending ones:
 *   ascending output  = s0 * wj_r - s1_r * wi
 *   descending output = (s0 * wi)_rev + s1 * wj
 * where _r/_rev denotes the four lanes in reversed order.
 *
 * Four elements from each end per iteration; the loop exits only on an
 * exactly-zero counter (b.eq), so len has to be a non-zero multiple of 4.
 * Uses x5-x7, v0-v5, v16 and v17 as scratch.
 */
function ff_vector_fmul_window_neon, export=1
        sxtw            x4,  w4                 // len
        sub             x2,  x2,  #8
        sub             x5,  x4,  #2
        add             x2,  x2,  x5, lsl #2    // src1 + 4 * (len - 4)
        add             x6,  x3,  x5, lsl #3    // win  + 8 * (len - 2)
        add             x5,  x0,  x5, lsl #3    // dst  + 8 * (len - 2)
        mov             x7,  #-16               // step for the descending streams
        ld1             {v0.4S},  [x1], #16     // s0
        ld1             {v2.4S},  [x3], #16     // wi
        ld1             {v1.4S},  [x2], x7      // s1
1:      ld1             {v3.4S},  [x6], x7      // wj
        subs            x4,  x4,  #4
        fmul            v17.4S, v0.4S,  v2.4S   // s0 * wi
        rev64           v4.4S,  v1.4S
        rev64           v5.4S,  v3.4S
        rev64           v17.4S, v17.4S
        ext             v4.16B,  v4.16B,  v4.16B,  #8 // s1_r
        ext             v5.16B,  v5.16B,  v5.16B,  #8 // wj_r
        ext             v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
        fmul            v16.4S, v0.4S,  v5.4S   // s0 * wj_r
        fmla            v17.4S, v1.4S,  v3.4S   // (s0 * wi)_rev + s1 * wj
        b.eq            2f                      // last group: skip the prefetches
        ld1             {v0.4S},  [x1], #16
        fmls            v16.4S, v4.4S,  v2.4S   // s0 * wj_r - s1_r * wi
        st1             {v17.4S}, [x5], x7
        ld1             {v2.4S},  [x3], #16
        ld1             {v1.4S},  [x2], x7
        st1             {v16.4S}, [x0], #16
        b               1b
2:
        fmls            v16.4S, v4.4S,  v2.4S   // s0 * wj_r - s1_r * wi
        st1             {v17.4S}, [x5], x7
        st1             {v16.4S}, [x0], #16
        ret
endfunc

/*
 * Element-wise multiply-add of three float vectors: the product of the first
 * two sources is added to the third and stored to the destination.
 *
 *   x0 = destination (written, advanced)
 *   x1 = first factor (read, advanced)
 *   x2 = second factor (read, advanced)
 *   x3 = addend (read, advanced)
 *   w4 = element count
 *
 * Software-pipelined, 8 floats per iteration. The loop exits only on an
 * exactly-zero counter (b.eq) and does so before prefetching, so the count
 * has to be a non-zero multiple of 8.
 * Uses v0-v5 as scratch.
 */
function ff_vector_fmul_add_neon, export=1
        ld1             {v0.4S, v1.4S}, [x1], #32
        ld1             {v2.4S, v3.4S}, [x2], #32
        ld1             {v4.4S, v5.4S}, [x3], #32
1:      subs            w4,  w4,  #8
        fmla            v4.4S,  v0.4S,  v2.4S
        fmla            v5.4S,  v1.4S,  v3.4S
        b.eq            2f                      // last chunk: skip the prefetches
        ld1             {v0.4S, v1.4S}, [x1], #32
        ld1             {v2.4S, v3.4S}, [x2], #32
        st1             {v4.4S, v5.4S}, [x0], #32
        ld1             {v4.4S, v5.4S}, [x3], #32
        b               1b
2:      st1             {v4.4S, v5.4S}, [x0], #32
        ret
endfunc

/*
 * Element-wise product of one float vector with another read back to front:
 * output element i is the first source at i times the second source at
 * (count - 1 - i).
 *
 *   x0 = destination (written, advanced)
 *   x1 = forward source (read ascending)
 *   x2 = reversed source (read descending, starting at its last 8 floats)
 *   w3 = element count (sign-extended to 64 bits)
 *
 * Software-pipelined, 8 floats per iteration. The loop exits only on an
 * exactly-zero counter (b.eq) and does so before prefetching, so the count
 * has to be a non-zero multiple of 8.
 * Uses x4, v0-v3, v16 and v17 as scratch.
 */
function ff_vector_fmul_reverse_neon, export=1
        sxtw            x3,  w3
        add             x2,  x2,  x3,  lsl #2
        sub             x2,  x2,  #32           // last 8 floats of the reversed source
        mov             x4,  #-32               // step for the descending stream
        ld1             {v2.4S, v3.4S}, [x2], x4
        ld1             {v0.4S, v1.4S}, [x1], #32
1:      subs            x3,  x3,  #8
        rev64           v3.4S,  v3.4S           // rev64 + ext #8 = full four-lane reversal
        rev64           v2.4S,  v2.4S
        ext             v3.16B, v3.16B, v3.16B, #8
        ext             v2.16B, v2.16B, v2.16B, #8
        fmul            v16.4S, v0.4S,  v3.4S
        fmul            v17.4S, v1.4S,  v2.4S
        b.eq            2f                      // last chunk: skip the prefetches
        ld1             {v2.4S, v3.4S}, [x2], x4
        ld1             {v0.4S, v1.4S}, [x1], #32
        st1             {v16.4S, v17.4S}, [x0], #32
        b               1b
2:      st1             {v16.4S, v17.4S}, [x0], #32
        ret
endfunc

/*
 * In-place butterfly of two float vectors: the buffer at x0 receives the
 * element-wise sum, the buffer at x1 receives the element-wise difference
 * (x0 element minus x1 element).
 *
 *   x0 = first buffer (read, overwritten with sums, advanced)
 *   x1 = second buffer (read, overwritten with differences, advanced)
 *   w2 = element count
 *
 * Four floats per iteration; iterates while the remaining count is positive.
 * The body runs before the counter is tested, so at least four elements are
 * always processed.
 * Uses v0-v3 as scratch.
 */
function ff_butterflies_float_neon, export=1
1:      ld1             {v0.4S}, [x0]
        ld1             {v1.4S}, [x1]
        subs            w2,  w2,  #4
        fsub            v2.4S,   v0.4S,  v1.4S
        fadd            v3.4S,   v0.4S,  v1.4S
        st1             {v2.4S}, [x1],   #16
        st1             {v3.4S}, [x0],   #16
        b.gt            1b
        ret
endfunc

/*
 * Dot product of two float vectors.
 *
 *   x0 = first vector (read, advanced)
 *   x1 = second vector (read, advanced)
 *   w2 = element count
 *
 * Result is left in s0. Products are accumulated in four independent lanes
 * and reduced with two pairwise adds at the end. Four floats per iteration;
 * iterates while the remaining count is positive. The body runs before the
 * counter is tested, so at least four elements are always read.
 * Uses v0-v2 as scratch.
 */
function ff_scalarproduct_float_neon, export=1
        movi            v2.4S,  #0
1:      ld1             {v0.4S}, [x0],   #16
        ld1             {v1.4S}, [x1],   #16
        subs            w2,      w2,     #4
        fmla            v2.4S,   v0.4S,  v1.4S
        b.gt            1b
        faddp           v0.4S,   v2.4S,  v2.4S  // lanes 0,1 = partial sums of the four accumulators
        faddp           s0,      v0.2S          // s0 = lane 0 + lane 1
        ret
endfunc