/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)

// byte-shuffle table for tbl: reverses the four 32-bit lanes of a 128-bit vector
const   tbl_rev128_s, align=4
        .byte           12, 13, 14, 15
        .byte            8,  9, 10, 11
        .byte            4,  5,  6,  7
        .byte            0,  1,  2,  3
endconst

// Emits ff_mpadsp_apply_window_\type\()_neon (type = fixed or float).
// Register roles, as used below:
//   x0 = synth_buf   (copied to synth_buf + 512 first, then walked as window input)
//   x1 = window coefficients
//   x2 = dither_state pointer — read (ld1r) and written back only in the
//        fixed-point variant
//   x3 = samples output; x4 = incr, scaled to a byte stride (x2 for 16-bit
//        fixed output, x4 for 32-bit float output)
//   x5 = samples2 (mirror output walked backwards with x13 = -incr)
//   v16..v19 = accumulators; v27 = lane-reverse shuffle; v29/v30/v31 =
//        zero / rounding masks for round_sample (fixed only)
// \st is the element size used when storing lanes (h for fixed, s for float).
// NOTE(review): exact C-prototype correspondence inferred from usage — confirm
// against the mpadsp_apply_window C reference implementation.
.macro apply_window type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        mov             x7,  x0
        add             x8,  x0,  #512<<2
        // mirror the first 16 vectors of synth_buf into synth_buf[512..]
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x7], #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x7], #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s}, [x8], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x8], #64
        movrel          x15, tbl_rev128_s
        ld1             {v27.4s}, [x15]
.ifc \type, fixed
        lsl             x4,  x4,  #1            // incr in bytes (int16_t output)
.else
        lsl             x4,  x4,  #2            // incr in bytes (float output)
.endif
        add             x10, x0,  #45<<2
        add             x0,  x0,  #16<<2
        add             x1,  x1,  #16<<2
        add             x5,  x3,  x4,  lsl #5
        sub             x5,  x5,  x4            // samples2
        neg             x13, x4                 // -incr
        mov             x9,  #64<<2             // row stride: 64 int32/float
.ifc \type, fixed
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d  // mask in high 64-bit lane only
        trn2            v30.2d, v30.2d, v29.2d  // mask in low 64-bit lane only
        trn1            v16.2d, v16.2d, v29.2d  // seed accumulator with dither
.else
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        mov             x14, #4                 // outer loop: 4 groups of columns
1:
        mov             x8,  x0
        sub             x7,  x1,  #3<<2
        sub             x6,  x1,  x14, lsl #4
        add             x7,  x7,  x14, lsl #4
        add             x11, x6,  #(32)<<2      // w  + 32
        add             x12, x7,  #(32)<<2      // w2 + 32
        mov             x15, #8                 // inner loop: 8 rows
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:
        subs            x15, x15, #1
        ld1             {v0.4s},  [x8],  x9
        ld1             {v1.4s},  [x10], x9
        ld1             {v2.4s},  [x6],  x9
        ld1             {v3.4s},  [x7],  x9
        tbl             v6.16b, {v0.16b}, v27.16b   // lane-reversed synth_buf
        tbl             v7.16b, {v1.16b}, v27.16b   // lane-reversed mirror row
        ld1             {v4.4s},  [x11], x9
        ld1             {v5.4s},  [x12], x9
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        cmp             x14, #4                 // first outer iteration? (flags
                                                // also steer the b.eq below)
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        // carry the sub-OUT_SHIFT fraction of v16 into the next iteration
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        b.eq            4f
        round_sample    v19, 1, 1
4:
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d, #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d, #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d, #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d, #OUT_SHIFT
        sqxtn           v16.4h, v16.4s          // saturate to int16
        sqxtn           v18.4h, v19.4s
.else
        ext             v18.16b, v18.16b, v18.16b, #8
.endif

        // interleave stores: forward into samples, backward into samples2;
        // lane [1] of v18 is skipped on the first outer iteration (b.eq)
        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        mov             v16.16b, v28.16b        // carried fraction seeds next pass

        subs            x14, x14, #1
        add             x0,  x0,  #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        add             x6,  x1,  #32<<2
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
.rept 3
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        ld1             {v0.2s}, [x6], x9
        ld1             {v1.2s}, [x0], x9
        MLS             v16, v2, v3
.endr
        ld1             {v2.2s}, [x6], x9
        ld1             {v3.2s}, [x0], x9
        MLS             v16, v0, v1
        MLS             v16, v2, v3

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        shrn            v20.2s, v16.2d, #OUT_SHIFT
        xtn             v28.2s, v28.2d
        sqxtn           v20.4h, v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
// drop the type-specific helper macros so the next instantiation can redefine them
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm

// Fixed-point helpers: 32x32->64-bit widening multiply-accumulate, plus
// round_sample, which folds the accumulated rounding error (v28) back in
// and re-extracts the selected 64-bit lane's fraction for the next sample.
.macro round_sample r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx == 0
        and             v28.16b,  \r\().16b, v30.16b
.else // \idx == 1
        and             v28.16b,  \r\().16b, v31.16b
.endif
.if \idx != \next
  .if \next == 0
        ext             v28.16b, v28.16b, v29.16b, #8
  .else
        ext             v28.16b, v29.16b, v28.16b, #8
  .endif
.endif
.endm
.macro MLA d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro MLA2 d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro MLS d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro MLS2 d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm
apply_window fixed, h


// nothing to do for round_sample and ML{A,S}2
.macro round_sample r, idx, next
.endm
.macro MLA2 d, s1, s2
.endm
.macro MLS2 d, s1, s2
.endm
// Float helpers: fused multiply-accumulate on full 4-lane vectors, so the
// "2" (high-half) variants above are intentionally empty.
.macro MLA d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro MLS d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm
apply_window float, s