/*
 * Copyright (c) 2014 Peter Meerwald <pmeerw@pmeerw.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#include "asm-offsets.h"

/*
 * Argument registers, as consumed below:
 *   r0:   ResampleContext *c (field offsets from asm-offsets.h)
 *   r1:   dst
 *   r2:   destination index
 *   r3:   src
 *   [sp]: index ([sp, #8] below, after the push)
 */
.macro resample_one fmt, es=2
function ff_resample_one_\fmt\()_neon, export=1
    push {r4, r5}
    add r1, r1, r2, lsl #\es        /* dst = &dst[dst_index] */

    ldr r2, [r0, #PHASE_SHIFT+4]    /* phase_mask */
    ldr ip, [sp, #8]                /* index */
    ldr r5, [r0, #FILTER_LENGTH]
    and r2, ip, r2                  /* (index & phase_mask) */
    ldr r4, [r0, #PHASE_SHIFT]
    lsr r4, ip, r4                  /* compute sample_index */
    mul r2, r2, r5

    ldr ip, [r0, #FILTER_BANK]
    add r3, r3, r4, lsl #\es        /* &src[sample_index] */

    cmp r5, #8
    add r0, ip, r2, lsl #\es        /* filter = &filter_bank[...] */

    blt 5f
8:  /* main loop: 8 taps per iteration, 4 per LOAD4/MLA4 pair */
    subs r5, r5, #8
    LOAD4
    MUL4
7:
    LOAD4
    beq 6f
    cmp r5, #8
    MLA4
    blt 4f
    subs r5, r5, #8
    LOAD4
    MLA4
    b 7b
6:
    MLA4
    STORE
    pop {r4, r5}
    bx lr
5:
    INIT4
4:  /* remaining filter_length 1 to 7 */
    cmp r5, #4
    blt 2f
    subs r5, r5, #4
    LOAD4
    MLA4
    beq 0f
2:  /* remaining filter_length 1 to 3 */
    cmp r5, #2
    blt 1f
    subs r5, r5, #2
    LOAD2
    MLA2
    beq 0f
1:  /* remaining filter_length 1 */
    LOAD1
    MLA1
0:
    STORE
    pop {r4, r5}
    bx lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm


/* float32 */
.macro LOAD1
    veor.32 d0, d0
    vld1.32 {d0[0]}, [r0]!          /* load filter */
    vld1.32 {d4[0]}, [r3]!          /* load src */
.endm
.macro LOAD2
    vld1.32 {d0}, [r0]!             /* load filter */
    vld1.32 {d4}, [r3]!             /* load src */
.endm
.macro LOAD4
    vld1.32 {d0,d1}, [r0]!          /* load filter */
    vld1.32 {d4,d5}, [r3]!          /* load src */
.endm
.macro MLA1
    vmla.f32 d16, d0, d4[0]
.endm
.macro MLA2
    vmla.f32 d16, d0, d4
.endm
.macro MLA4
    vmla.f32 d16, d0, d4
    vmla.f32 d17, d1, d5
.endm
.macro MUL4
    vmul.f32 d16, d0, d4
    vmul.f32 d17, d1, d5
.endm
.macro INIT4
    veor.f32 q8, q8
.endm
.macro STORE
    vpadd.f32 d16, d16, d17         /* horizontal sum of the partial sums */
    vpadd.f32 d16, d16, d16
    vst1.32 {d16[0]}, [r1]
.endm

resample_one flt, 2


/* s32 */
.macro LOAD1
    veor.32 d0, d0
    vld1.32 {d0[0]}, [r0]!          /* load filter */
    vld1.32 {d4[0]}, [r3]!          /* load src */
.endm
.macro LOAD2
    vld1.32 {d0}, [r0]!             /* load filter */
    vld1.32 {d4}, [r3]!             /* load src */
.endm
.macro LOAD4
    vld1.32 {d0,d1}, [r0]!          /* load filter */
    vld1.32 {d4,d5}, [r3]!          /* load src */
.endm
.macro MLA1
    vmlal.s32 q8, d0, d4[0]
.endm
.macro MLA2
    vmlal.s32 q8, d0, d4
.endm
.macro MLA4
    vmlal.s32 q8, d0, d4
    vmlal.s32 q9, d1, d5
.endm
.macro MUL4
    vmull.s32 q8, d0, d4
    vmull.s32 q9, d1, d5
.endm
.macro INIT4
    veor.s64 q8, q8
    veor.s64 q9, q9
.endm
.macro STORE
    vadd.s64 q8, q8, q9             /* reduce the s64 partial sums */
    vadd.s64 d16, d16, d17
    vqrshrn.s64 d16, q8, #30        /* round, shift right by 30, saturate */
    vst1.32 {d16[0]}, [r1]
.endm

resample_one s32, 2


/* s16 */
.macro LOAD1
    veor.16 d0, d0
    vld1.16 {d0[0]}, [r0]!          /* load filter */
    vld1.16 {d4[0]}, [r3]!          /* load src */
.endm
.macro LOAD2
    veor.16 d0, d0
    vld1.32 {d0[0]}, [r0]!          /* load filter */
    veor.16 d4, d4
    vld1.32 {d4[0]}, [r3]!          /* load src */
.endm
.macro LOAD4
    vld1.16 {d0}, [r0]!             /* load filter */
    vld1.16 {d4}, [r3]!             /* load src */
.endm
.macro MLA1
    vmlal.s16 q8, d0, d4[0]
.endm
.macro MLA2
    vmlal.s16 q8, d0, d4
.endm
.macro MLA4
    vmlal.s16 q8, d0, d4
.endm
.macro MUL4
    vmull.s16 q8, d0, d4
.endm
.macro INIT4
    veor.s32 q8, q8
.endm
.macro STORE
    vpadd.s32 d16, d16, d17
    vpadd.s32 d16, d16, d16
    vqrshrn.s32 d16, q8, #15        /* round, shift right by 15, saturate */
    vst1.16 {d16[0]}, [r1]
.endm

resample_one s16, 1


/*
 * Same argument layout as resample_one above; additionally reads frac
 * from the stack ([sp, #12] after the push) and c->src_incr. The second
 * filter starts c->filter_length entries after the first; q9/d18
 * accumulate val (filter), q8/d16 accumulate v2 (filter2).
 */
.macro resample_linear fmt, es=2
function ff_resample_linear_\fmt\()_neon, export=1
    push {r4, r5}
    add r1, r1, r2, lsl #\es        /* dst = &dst[dst_index] */

    ldr r2, [r0, #PHASE_SHIFT+4]    /* phase_mask */
    ldr ip, [sp, #8]                /* index */
    ldr r5, [r0, #FILTER_LENGTH]
    and r2, ip, r2                  /* (index & phase_mask) */
    ldr r4, [r0, #PHASE_SHIFT]
    lsr r4, ip, r4                  /* compute sample_index */
    mul r2, r2, r5

    ldr ip, [r0, #FILTER_BANK]
    add r3, r3, r4, lsl #\es        /* &src[sample_index] */

    cmp r5, #8
    ldr r4, [r0, #SRC_INCR]
    add r0, ip, r2, lsl #\es        /* filter = &filter_bank[...] */
    add r2, r0, r5, lsl #\es        /* filter[... + c->filter_length] */

    blt 5f
8:  /* main loop: 8 taps per iteration, 4 per LOAD4/MLA4 pair */
    subs r5, r5, #8
    LOAD4
    MUL4
7:
    LOAD4
    beq 6f
    cmp r5, #8
    MLA4
    blt 4f
    subs r5, r5, #8
    LOAD4
    MLA4
    b 7b
6:
    MLA4
    STORE
    pop {r4, r5}
    bx lr
5:
    INIT4
4:  /* remaining filter_length 1 to 7 */
    cmp r5, #4
    blt 2f
    subs r5, r5, #4
    LOAD4
    MLA4
    beq 0f
2:  /* remaining filter_length 1 to 3 */
    cmp r5, #2
    blt 1f
    subs r5, r5, #2
    LOAD2
    MLA2
    beq 0f
1:  /* remaining filter_length 1 */
    LOAD1
    MLA1
0:
    STORE
    pop {r4, r5}
    bx lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm


/* float32 linear */
.macro LOAD1
    veor.32 d0, d0
    veor.32 d2, d2
    vld1.32 {d0[0]}, [r0]!          /* load filter */
    vld1.32 {d2[0]}, [r2]!          /* load filter2 */
    vld1.32 {d4[0]}, [r3]!          /* load src */
.endm
.macro LOAD2
    vld1.32 {d0}, [r0]!             /* load filter */
    vld1.32 {d2}, [r2]!             /* load filter2 */
    vld1.32 {d4}, [r3]!             /* load src */
.endm
.macro LOAD4
    vld1.32 {d0,d1}, [r0]!          /* load filter */
    vld1.32 {d2,d3}, [r2]!          /* load filter2 */
    vld1.32 {d4,d5}, [r3]!          /* load src */
.endm
.macro MLA1
    vmla.f32 d18, d0, d4[0]
    vmla.f32 d16, d2, d4[0]
.endm
.macro MLA2
    vmla.f32 d18, d0, d4
    vmla.f32 d16, d2, d4
.endm
.macro MLA4
    vmla.f32 q9, q0, q2
    vmla.f32 q8, q1, q2
.endm
.macro MUL4
    vmul.f32 q9, q0, q2
    vmul.f32 q8, q1, q2
.endm
.macro INIT4
    veor.f32 q9, q9
    veor.f32 q8, q8
.endm
.macro STORE
    vldr s0, [sp, #12]              /* frac */
    vmov s1, r4                     /* src_incr */
    vcvt.f32.s32 d0, d0             /* int -> float for frac and src_incr */

    vsub.f32 q8, q8, q9             /* v2 - val */
    vpadd.f32 d18, d18, d19
    vpadd.f32 d16, d16, d17
    vpadd.f32 d2, d18, d18          /* s4 = val */
    vpadd.f32 d1, d16, d16          /* s2 = v2 - val */

    vmul.f32 s2, s2, s0             /* (v2 - val) * frac */
    vdiv.f32 s2, s2, s1             /* / c->src_incr */
    vadd.f32 s4, s4, s2             /* val += ... */

    vstr s4, [r1]
.endm

resample_linear flt, 2
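
/*
 * For reference, a scalar C model of the two entry points above,
 * reconstructed from the register usage in this file. This is a sketch,
 * not FFmpeg's actual implementation -- the C templates live in
 * libswresample/resample_template.c and may differ in rounding and
 * clipping details; FELEM/FELEM2 stand for the per-format sample and
 * accumulator types:
 *
 *   int sample_index = index >> c->phase_shift;
 *   const FELEM *filter =
 *       &c->filter_bank[(index & c->phase_mask) * c->filter_length];
 *   FELEM2 val = 0, v2 = 0;
 *
 *   // ff_resample_one_<fmt>_neon: dot product over the filter taps
 *   for (int i = 0; i < c->filter_length; i++)
 *       val += (FELEM2)src[sample_index + i] * filter[i];
 *   dst[dst_index] = val;  // s16/s32: rounded shift by 15/30, saturated
 *
 *   // ff_resample_linear_flt_neon: second dot product over the next
 *   // phase's filter, then linear interpolation between the two results
 *   for (int i = 0; i < c->filter_length; i++) {
 *       val += (FELEM2)src[sample_index + i] * filter[i];
 *       v2  += (FELEM2)src[sample_index + i] * filter[i + c->filter_length];
 *   }
 *   dst[dst_index] = val + (v2 - val) * (FELEM2)frac / c->src_incr;
 */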