/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/aarch64/asm.S"

/*
 * swri_oldapi_conv_flt_to_s16_neon
 *
 * Convert a single stream of 32-bit floats to signed 16-bit samples.
 *
 * In:    x0 = destination, written 8 x s16 at a time (st1 .8h)
 *        x1 = source, read 4 x float at a time (ld1 .4s)
 *        w2 = number of samples (32-bit; the sibling nch routine in this
 *             file consumes the same argument through w2)
 * Clobb: x0, x1, x2, x12, v0-v7, flags
 *
 * Each float is converted to a signed fixed-point value with 31 fractional
 * bits (fcvtzs, truncating toward zero) and then narrowed to 16 bits
 * with rounding and saturation (sqrshrn by 16).
 *
 * The first 8 samples are always loaded before the count is tested, the main
 * loop emits 16 samples per iteration and the two tails emit 16 or 8.
 * NOTE(review): there is no path for a count below 8 or for a count that is
 * not a multiple of 8 - presumably the caller guarantees both; confirm.
 */
function swri_oldapi_conv_flt_to_s16_neon, export=1
        // 32-bit subtract: bits 63:32 of x2 are unspecified for a 32-bit
        // argument. Writing w2 zero-extends into x2, so the 64-bit uses of
        // x2 below see a clean value.
        subs            w2, w2, #8                  // remaining = len - 8, flags used by b.eq below
        ld1             {v0.4s}, [x1], #16
        fcvtzs          v4.4s, v0.4s, #31
        ld1             {v1.4s}, [x1], #16
        fcvtzs          v5.4s, v1.4s, #31
        b.eq            3f                          // exactly 8 samples: narrow and store them
        ands            x12, x2, #~15               // x12 = remaining rounded down to a multiple of 16
        b.eq            2f                          // fewer than 16 left: single 16-sample tail
1:      subs            x12, x12, #16
        sqrshrn         v4.4h, v4.4s, #16
        ld1             {v2.4s}, [x1], #16
        fcvtzs          v6.4s, v2.4s, #31
        sqrshrn2        v4.8h, v5.4s, #16
        ld1             {v3.4s}, [x1], #16
        fcvtzs          v7.4s, v3.4s, #31
        sqrshrn         v6.4h, v6.4s, #16
        st1             {v4.8h}, [x0], #16
        sqrshrn2        v6.8h, v7.4s, #16
        ld1             {v0.4s}, [x1], #16          // preload the next 8 samples for the
        fcvtzs          v4.4s, v0.4s, #31           // following iteration or for a tail
        ld1             {v1.4s}, [x1], #16
        fcvtzs          v5.4s, v1.4s, #31
        st1             {v6.8h}, [x0], #16
        b.ne            1b                          // flags still come from the subs at 1:
        ands            x2, x2, #15
        b.eq            3f                          // only the 8 preloaded samples remain
2:      ld1             {v2.4s}, [x1], #16          // tail: 8 preloaded + 8 new samples
        sqrshrn         v4.4h, v4.4s, #16
        fcvtzs          v6.4s, v2.4s, #31
        ld1             {v3.4s}, [x1], #16
        sqrshrn2        v4.8h, v5.4s, #16
        fcvtzs          v7.4s, v3.4s, #31
        sqrshrn         v6.4h, v6.4s, #16
        st1             {v4.8h}, [x0], #16
        sqrshrn2        v6.8h, v7.4s, #16
        st1             {v6.8h}, [x0]
        ret
3:      sqrshrn         v4.4h, v4.4s, #16           // tail: the 8 preloaded samples
        sqrshrn2        v4.8h, v5.4s, #16
        st1             {v4.8h}, [x0]
        ret
endfunc

/*
 * swri_oldapi_conv_fltp_to_s16_2ch_neon
 *
 * Convert two separate float streams into one interleaved s16 stream.
 *
 * In:    x0 = destination (interleaved output)
 *        x1 = address of two consecutive source pointers (read with ldp)
 *        w2 = number of samples per channel (32-bit, see the note on the
 *             first instruction that touches it)
 * Clobb: x0, x2, x4, x5, x12, v0-v7, v16-v23, flags
 *
 * Floats are converted to fixed point with 31 fractional bits (fcvtzs).
 * sri by 16 then inserts the top 16 bits of the first-channel value into
 * the low half of the matching second-channel lane, so every 32-bit lane
 * holds one sample pair. The narrowing here keeps the top 16 bits as is
 * (no rounding step, unlike the sqrshrn used by the single-stream routine).
 *
 * Same blocking as the single-stream routine: 8 samples per channel are
 * preloaded, the loop handles 16 per iteration, tails handle 16 or 8.
 * NOTE(review): counts below 8 or not a multiple of 8 have no dedicated
 * path - presumably excluded by the caller; confirm.
 */
function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4, x5, [x1]                // x4 = first channel, x5 = second channel
        // 32-bit subtract so that unspecified bits 63:32 of the incoming
        // argument cannot reach the flags or the 64-bit loop count below.
        subs            w2, w2, #8
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v4.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v5.4s, v1.4s, #31
        ld1             {v2.4s}, [x5], #16
        fcvtzs          v6.4s, v2.4s, #31
        ld1             {v3.4s}, [x5], #16
        fcvtzs          v7.4s, v3.4s, #31
        b.eq            3f                          // exactly 8 samples per channel
        ands            x12, x2, #~15               // x12 = remaining rounded down to a multiple of 16
        b.eq            2f
1:      subs            x12, x12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s, v4.4s, #16           // pair up preloaded samples 0-3
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s, v5.4s, #16           // pair up preloaded samples 4-7
        st1             {v6.4s}, [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s}, [x0], #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s}, [x4], #16          // preload for the next iteration / tail
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v5.4s, v1.4s, #31
        ld1             {v2.4s}, [x5], #16
        fcvtzs          v6.4s, v2.4s, #31
        ld1             {v3.4s}, [x5], #16
        fcvtzs          v7.4s, v3.4s, #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b                          // flags still come from the subs at 1:
        ands            x2, x2, #15
        b.eq            3f
2:      sri             v6.4s, v4.4s, #16           // tail: 8 preloaded + 8 new samples per channel
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v0.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v1.4s, v1.4s, #31
        ld1             {v2.4s}, [x5], #16
        fcvtzs          v2.4s, v2.4s, #31
        sri             v7.4s, v5.4s, #16
        ld1             {v3.4s}, [x5], #16
        fcvtzs          v3.4s, v3.4s, #31
        sri             v2.4s, v0.4s, #16
        st1             {v6.4s,v7.4s}, [x0], #32
        sri             v3.4s, v1.4s, #16
        st1             {v2.4s,v3.4s}, [x0], #32
        ret
3:      sri             v6.4s, v4.4s, #16           // tail: the 8 preloaded samples per channel
        sri             v7.4s, v5.4s, #16
        st1             {v6.4s,v7.4s}, [x0]
        ret
endfunc

/*
 * swri_oldapi_conv_fltp_to_s16_nch_neon
 *
 * Convert several separate float streams into one interleaved s16 stream.
 *
 * In:    x0 = destination (interleaved output)
 *        x1 = address of an array of source pointers, one per channel
 *        w2 = number of samples per channel (32-bit)
 *        w3 = number of channels (32-bit)
 * Clobb: x0, x1, x3-x9, x12, v0-v7, v16-v19, flags (plus whatever the
 *        tail-called routines clobber)
 *
 * Dispatch:
 *   channels == 2 -> tail call to swri_oldapi_conv_fltp_to_s16_2ch_neon
 *   channels <  2 -> load the first source pointer and tail call
 *                    swri_oldapi_conv_flt_to_s16_neon
 *   channels >  2 -> handled here: channels are consumed in groups of 4
 *                    while at least 4 remain, then one group of 2, then a
 *                    single channel. x12 holds the output stride between
 *                    consecutive samples of one channel (channels * 2 bytes)
 *                    and x0 is advanced past each finished channel group.
 *
 * In every group the s16 value is the top 16 bits of the fcvtzs result
 * (sri by 16, or a store of the odd .h lanes), i.e. no rounding step.
 *
 * Numeric labels are reused below; every f/b reference binds to the nearest
 * definition in that direction.
 * NOTE(review): the sample loops only terminate through their b.eq exits,
 * which looks like it requires w2 to be a positive multiple of 8 - confirm
 * against the caller.
 */
function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
        cmp             w3, #2
        b.eq            X(swri_oldapi_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1, [x1]                    // fewer than 2 channels: use the first stream directly
        b               X(swri_oldapi_conv_flt_to_s16_neon)
1:
        cmp             w3, #4
        // Stride = channels * 2 bytes. Computed on the w registers because
        // bits 63:32 of x3 are unspecified for a 32-bit argument; the w12
        // write zero-extends into x12, which is used as post-index below.
        // lsl does not touch the flags set by the cmp above.
        lsl             w12, w3, #1
        b.lt            4f

5:      // 4 channels
        ldp             x4, x5, [x1], #16
        ldp             x6, x7, [x1], #16
        mov             w9, w2                      // w9 = samples left for this channel group
        mov             x8, x0                      // x8 = running output pointer
        ld1             {v4.4s}, [x4], #16
        fcvtzs          v4.4s, v4.4s, #31
        ld1             {v5.4s}, [x5], #16
        fcvtzs          v5.4s, v5.4s, #31
        ld1             {v6.4s}, [x6], #16
        fcvtzs          v6.4s, v6.4s, #31
        ld1             {v7.4s}, [x7], #16
        fcvtzs          v7.4s, v7.4s, #31
6:
        subs            w9, w9, #8
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v0.4s, v0.4s, #31
        sri             v5.4s, v4.4s, #16           // lanes = channel pair 0/1 for samples 0-3
        ld1             {v1.4s}, [x5], #16
        fcvtzs          v1.4s, v1.4s, #31
        sri             v7.4s, v6.4s, #16           // lanes = channel pair 2/3 for samples 0-3
        ld1             {v2.4s}, [x6], #16
        fcvtzs          v2.4s, v2.4s, #31
        zip1            v16.4s, v5.4s, v7.4s        // each 64-bit half = all 4 channels of one sample
        ld1             {v3.4s}, [x7], #16
        fcvtzs          v3.4s, v3.4s, #31
        zip2            v17.4s, v5.4s, v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s, v0.4s, #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s, v2.4s, #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s, v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s, v3.4s
        b.eq            7f                          // flags from the subs at 6:, no further preload
        ld1             {v4.4s}, [x4], #16
        fcvtzs          v4.4s, v4.4s, #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s}, [x5], #16
        fcvtzs          v5.4s, v5.4s, #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s}, [x6], #16
        fcvtzs          v6.4s, v6.4s, #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s}, [x7], #16
        fcvtzs          v7.4s, v7.4s, #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3, w3, #4                  // 4 channels done
        b.eq            end
        cmp             w3, #4
        add             x0, x0, #8                  // skip the 4 s16 slots just filled (add keeps flags)
        b.ge            5b

4:      // 2 channels
        cmp             w3, #2
        b.lt            4f
        ldp             x4, x5, [x1], #16
        mov             w9, w2
        mov             x8, x0
        tst             w9, #8                      // odd number of 8-sample groups?
        ld1             {v4.4s}, [x4], #16
        fcvtzs          v4.4s, v4.4s, #31
        ld1             {v5.4s}, [x5], #16
        fcvtzs          v5.4s, v5.4s, #31
        ld1             {v6.4s}, [x4], #16
        fcvtzs          v6.4s, v6.4s, #31
        ld1             {v7.4s}, [x5], #16
        fcvtzs          v7.4s, v7.4s, #31
        b.eq            6f                          // flags from the tst: bit 3 clear, go to the 16-sample loop
        subs            w9, w9, #8
        b.eq            7f                          // only these 8 samples exist
        sri             v5.4s, v4.4s, #16           // emit the leading 8 samples while reloading
        ld1             {v4.4s}, [x4], #16
        fcvtzs          v4.4s, v4.4s, #31
        st1             {v5.s}[0], [x8], x12
        sri             v7.4s, v6.4s, #16
        st1             {v5.s}[1], [x8], x12
        ld1             {v6.4s}, [x4], #16
        fcvtzs          v6.4s, v6.4s, #31
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        st1             {v7.s}[0], [x8], x12
        st1             {v7.s}[1], [x8], x12
        ld1             {v5.4s}, [x5], #16
        fcvtzs          v5.4s, v5.4s, #31
        st1             {v7.s}[2], [x8], x12
        st1             {v7.s}[3], [x8], x12
        ld1             {v7.4s}, [x5], #16
        fcvtzs          v7.4s, v7.4s, #31
6:
        subs            w9, w9, #16
        ld1             {v0.4s}, [x4], #16
        sri             v5.4s, v4.4s, #16
        fcvtzs          v0.4s, v0.4s, #31
        ld1             {v1.4s}, [x5], #16
        sri             v7.4s, v6.4s, #16
        st1             {v5.s}[0], [x8], x12
        st1             {v5.s}[1], [x8], x12
        fcvtzs          v1.4s, v1.4s, #31
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        ld1             {v2.4s}, [x4], #16
        st1             {v7.s}[0], [x8], x12
        fcvtzs          v2.4s, v2.4s, #31
        st1             {v7.s}[1], [x8], x12
        ld1             {v3.4s}, [x5], #16
        st1             {v7.s}[2], [x8], x12
        fcvtzs          v3.4s, v3.4s, #31
        st1             {v7.s}[3], [x8], x12
        sri             v1.4s, v0.4s, #16
        sri             v3.4s, v2.4s, #16
        b.eq            6f                          // last 16 samples: store without preloading
        ld1             {v4.4s}, [x4], #16
        st1             {v1.s}[0], [x8], x12
        fcvtzs          v4.4s, v4.4s, #31
        st1             {v1.s}[1], [x8], x12
        ld1             {v5.4s}, [x5], #16
        st1             {v1.s}[2], [x8], x12
        fcvtzs          v5.4s, v5.4s, #31
        st1             {v1.s}[3], [x8], x12
        ld1             {v6.4s}, [x4], #16
        st1             {v3.s}[0], [x8], x12
        fcvtzs          v6.4s, v6.4s, #31
        st1             {v3.s}[1], [x8], x12
        ld1             {v7.4s}, [x5], #16
        st1             {v3.s}[2], [x8], x12
        fcvtzs          v7.4s, v7.4s, #31
        st1             {v3.s}[3], [x8], x12
        b.gt            6b                          // flags still from the subs at the loop head
6:
        st1             {v1.s}[0], [x8], x12
        st1             {v1.s}[1], [x8], x12
        st1             {v1.s}[2], [x8], x12
        st1             {v1.s}[3], [x8], x12
        st1             {v3.s}[0], [x8], x12
        st1             {v3.s}[1], [x8], x12
        st1             {v3.s}[2], [x8], x12
        st1             {v3.s}[3], [x8], x12
        b               8f
7:
        sri             v5.4s, v4.4s, #16
        sri             v7.4s, v6.4s, #16
        st1             {v5.s}[0], [x8], x12
        st1             {v5.s}[1], [x8], x12
        st1             {v5.s}[2], [x8], x12
        st1             {v5.s}[3], [x8], x12
        st1             {v7.s}[0], [x8], x12
        st1             {v7.s}[1], [x8], x12
        st1             {v7.s}[2], [x8], x12
        st1             {v7.s}[3], [x8], x12
8:
        subs            w3, w3, #2                  // 2 channels done
        add             x0, x0, #4                  // skip the 2 s16 slots just filled (add keeps flags)
        b.eq            end

4:      // 1 channel
        ldr             x4, [x1]
        tst             w2, #8                      // odd number of 8-sample groups?
        mov             w9, w2
        mov             x5, x0                      // x5 = running output pointer
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v0.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v1.4s, v1.4s, #31
        b.ne            8f                          // flags from the tst: handle one group of 8 first
6:
        subs            w9, w9, #16
        ld1             {v2.4s}, [x4], #16
        fcvtzs          v2.4s, v2.4s, #31
        ld1             {v3.4s}, [x4], #16
        fcvtzs          v3.4s, v3.4s, #31
        st1             {v0.h}[1], [x5], x12        // odd .h lanes = upper halves of the 32-bit values
        st1             {v0.h}[3], [x5], x12
        st1             {v0.h}[5], [x5], x12
        st1             {v0.h}[7], [x5], x12
        st1             {v1.h}[1], [x5], x12
        st1             {v1.h}[3], [x5], x12
        st1             {v1.h}[5], [x5], x12
        st1             {v1.h}[7], [x5], x12
        b.eq            7f                          // last 16 samples: skip the preload
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v0.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v1.4s, v1.4s, #31
7:
        st1             {v2.h}[1], [x5], x12
        st1             {v2.h}[3], [x5], x12
        st1             {v2.h}[5], [x5], x12
        st1             {v2.h}[7], [x5], x12
        st1             {v3.h}[1], [x5], x12
        st1             {v3.h}[3], [x5], x12
        st1             {v3.h}[5], [x5], x12
        st1             {v3.h}[7], [x5], x12
        b.gt            6b                          // flags still from the subs at 6:
        ret
8:
        subs            w9, w9, #8
        st1             {v0.h}[1], [x5], x12
        st1             {v0.h}[3], [x5], x12
        st1             {v0.h}[5], [x5], x12
        st1             {v0.h}[7], [x5], x12
        st1             {v1.h}[1], [x5], x12
        st1             {v1.h}[3], [x5], x12
        st1             {v1.h}[5], [x5], x12
        st1             {v1.h}[7], [x5], x12
        b.eq            end
        ld1             {v0.4s}, [x4], #16
        fcvtzs          v0.4s, v0.4s, #31
        ld1             {v1.4s}, [x4], #16
        fcvtzs          v1.4s, v1.4s, #31
        b               6b
end:
        ret
endfunc