/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "asm-offsets.h"

.macro resample_one     fmt, es=2
.ifnc \fmt, dbl
    // formats other than dbl either need only one accumulator or handle
    // the second one inside M_MUL/M_MLA, so stub out the *2 helpers
    .macro M_MUL2 x:vararg
    .endm
    .macro M_MLA2 x:vararg
    .endm
.endif
function ff_resample_one_\fmt\()_neon, export=1
        sxtw            x2,  w2                         // dst_index
        ldr             x9,  [x0, #FILTER_BANK]
        ldr             w6,  [x0, #FILTER_LENGTH]
        ldp             w7,  w8,  [x0, #PHASE_SHIFT]    // phase_shift and phase_mask
        lsr             x10, x4,  x7                    // sample_index
        and             x4,  x4,  x8                    // phase
        lsl             x11, x6,  #\es                  // filter_length * elem_size
        add             x3,  x3,  x10, lsl #\es         // src[sample_index]
        madd            x9,  x11, x4,  x9               // filter
        cmp             w6,  #16
        b.lt            5f
8:                                                      // remaining filter_length at least 16
        subs            w6,  w6,  #16
        LOAD8           v4,  v5,  v6,  v7,  x3
        LOAD8           v16, v17, v18, v19, x9
        M_MUL           v0,  v4,  v16, v1
        M_MUL2          v1,  v6,  v18
7:
        LOAD8           v20, v21, v22, v23, x3
        M_MLA           v0,  v5,  v17, v1
        M_MLA2          v1,  v7,  v19
        LOAD8           v24, v25, v26, v27, x9
        M_MLA           v0,  v20, v24, v1
        M_MLA2          v1,  v22, v26
        b.eq            6f
        cmp             w6,  #16
        M_MLA           v0,  v21, v25, v1
        M_MLA2          v1,  v23, v27
        b.lt            4f
        subs            w6,  w6,  #16
        LOAD8           v4,  v5,  v6,  v7,  x3
        LOAD8           v16, v17, v18, v19, x9
        M_MLA           v0,  v4,  v16, v1
        M_MLA2          v1,  v6,  v18
        b               7b
6:
        M_MLA           v0,  v21, v25, v1
        M_MLA2          v1,  v23, v27
        STORE_ONE       0,   x1,  x2,  v1
        ret
5:
        movi            v0.16b, #0
        movi            v1.16b, #0
4:                                                      // remaining filter_length 1-15
        cmp             w6,  #4
        b.lt            2f
        subs            w6,  w6,  #4
        LOAD4           v4,  v5,  x3
        LOAD4           v6,  v7,  x9
        M_MLA           v0,  v4,  v6,  v1
        M_MLA2          v1,  v5,  v7
        b.eq            0f
        b               4b
2:                                                      // remaining filter_length 1-3
        cmp             w6,  #2
        b.lt            1f
        LOAD2           2,   x3
        LOAD2           3,   x9
        subs            w6,  w6,  #2
        M_MLA           v0,  v2,  v3
        b.eq            0f
1:                                                      // remaining filter_length 1
        LOAD1           6,   x3
        LOAD1           7,   x9
        M_MLA           v0,  v6,  v7
0:
        STORE_ONE       0,   x1,  x2,  v1
        ret
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem LOAD8
.purgem M_MLA
.purgem M_MLA2
.purgem M_MUL
.purgem M_MUL2
.purgem STORE_ONE
.endm


// double: 2 elements per vector register, partial sums kept in both v0 and v1
.macro LOAD1 d1, addr
        ldr             d\d1, [\addr], #8
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2d}, [\addr], #16
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().2d,\d2\().2d}, [\addr], #32
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().2d,\d2\().2d,\d3\().2d,\d4\().2d}, [\addr], #64
.endm
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().2d, \r0\().2d, \r1\().2d
.endm
.macro M_MLA2 second:vararg
        M_MLA           \second
.endm
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().2d, \r0\().2d, \r1\().2d
.endm
.macro M_MUL2 second:vararg
        M_MUL           \second
.endm
.macro STORE_ONE rn, addr, idx, d2
        fadd            v\rn\().2d, v\rn\().2d, \d2\().2d       // merge the two accumulators
        faddp           d\rn\(),    v\rn\().2d                  // horizontal sum
        str             d\rn\(),    [\addr, \idx, lsl #3]       // dst[dst_index]
.endm

resample_one    dbl, 3


// float: 4 elements per vector register, a single accumulator (v0) suffices
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
.macro M_MLA d, r0, r1, d2:vararg
        fmla            \d\().4s, \r0\().4s, \r1\().4s
.endm
.macro M_MUL d, r0, r1, d2:vararg
        fmul            \d\().4s, \r0\().4s, \r1\().4s
.endm
.macro STORE_ONE rn, addr, idx, d2
        faddp           v\rn\().4s, v\rn\().4s, v\rn\().4s      // horizontal sum, step 1
        faddp           s\rn\(),    v\rn\().2s                  // horizontal sum, step 2
        str             s\rn\(),    [\addr, \idx, lsl #2]
.endm

resample_one    flt


// s16: widening multiply-accumulate into a 32 bit accumulator, narrowed
// back to 16 bit with rounding and saturation on store
.macro LOAD1 d1, addr
        ldr             h\d1, [\addr], #2
.endm
.macro LOAD2 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4h}, [\addr], #8
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4h,\d2\().4h}, [\addr], #16
.endm
.macro M_MLA d, r0, r1, d2:vararg
        smlal           \d\().4s, \r0\().4h, \r1\().4h
.endm
.macro M_MUL d, r0, r1, d2:vararg
        smull           \d\().4s, \r0\().4h, \r1\().4h
.endm
.macro STORE_ONE rn, addr, idx, d2
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s
        addp            v\rn\().4s, v\rn\().4s, v\rn\().4s
        sqrshrn         v\rn\().4h, v\rn\().4s, #15
        str             h\rn\(),    [\addr, \idx, lsl #1]
.endm

resample_one    s16, 1


// s32: widening multiply-accumulate into two 64 bit accumulators
// (low/high vector halves), narrowed back to 32 bit on store
.macro LOAD1 d1, addr
        ldr             s\d1, [\addr], #4
.endm
.macro LOAD2 d1, addr
        ld1             {v\d1\().2s}, [\addr], #8
.endm
.macro LOAD4 d1, d2, addr
        ld1             {\d1\().4s}, [\addr], #16
.endm
.macro LOAD8 d1, d2, d3, d4, addr
        ld1             {\d1\().4s,\d2\().4s}, [\addr], #32
.endm
.macro M_MLA d1, r0, r1, d2:vararg
        smlal           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smlal2          \d2\().2d, \r0\().4s, \r1\().4s
.endif
.endm
.macro M_MUL d1, r0, r1, d2:vararg
        smull           \d1\().2d, \r0\().2s, \r1\().2s
.ifnb \d2
        smull2          \d2\().2d, \r0\().4s, \r1\().4s
.endif
.endm
.macro STORE_ONE rn, addr, idx, d2
        add             v\rn\().2d, v\rn\().2d, \d2\().2d       // merge the two accumulators
        addp            d\rn\(),    v\rn\().2d                  // horizontal sum
        sqrshrn         v\rn\().2s, v\rn\().2d, #30
        str             s\rn\(),    [\addr, \idx, lsl #2]
.endm

resample_one    s32