/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * NEON YUV -> RGBA conversion (ARMv7, 32-bit).
 *
 * The declare_rgb_funcs invocations at the bottom expand, for each input
 * format (nv12, nv21, yuv420p, yuv422p) and each output byte order
 * (argb, rgba, abgr, bgra), one exported function:
 *
 *     ff_<ifmt>_to_<ofmt>_neon(r0 = width, r1 = height,
 *                              r2 = dst,   r3 = linesize,
 *                              stack: srcY, linesizeY, <chroma args>,
 *                                     table, y_offset, y_coeff)
 *
 * 16 output pixels are produced per inner-loop iteration; the 4:2:0
 * variants convert two rows per outer-loop pass (they share one chroma
 * line), the 4:2:2 variant one row per pass.
 *
 * Fixed register / lane assignment while the loops run:
 *     d0      = y_coeff (also duplicated into q13)
 *     d1      = the four int16 coefficients loaded from *table:
 *               d1[0] = v2r, d1[1] = u2g, d1[2] = v2g, d1[3] = u2b
 *     q11     = 128 << 3 (chroma bias)
 *     q12     = y_offset (duplicated into every lane)
 *     q7      = 16 luma samples, de-interleaved by vld2:
 *               d14 = even-indexed Y, d15 = odd-indexed Y
 *     q8/q9/q10 = per-pixel R/G/B chroma contributions
 *     q14/q15 = scratch (widened chroma, then scaled luma)
 *
 * All fixed-point products use vqdmulh, i.e. (a * b * 2) >> 16, so the
 * table coefficients and y_coeff are presumably pre-scaled for that on
 * the C side -- NOTE(review): confirm against the table setup code.
 */

/* Pre-multiply the bias-removed chroma with the color-matrix
 * coefficients.  Inputs: q14 = U << 3, q15 = V << 3 (from load_chroma_*).
 * Outputs: q8 = red term, q9 = green term, q10 = blue term. */
.macro compute_premult
    vsub.u16            q14, q11                      @ q14 = (U - 128) << 3
    vsub.u16            q15, q11                      @ q15 = (V - 128) << 3
    vqdmulh.s16         q8,  q15, d1[0]               @ q8  = V * v2r             (red)
    vqdmulh.s16         q9,  q14, d1[1]               @ q9  = U * u2g
    vqdmulh.s16         q5,  q15, d1[2]               @ q5  = V * v2g
    vadd.s16            q9,  q5                       @ q9  = U * u2g + V * v2g   (green)
    vqdmulh.s16         q10, q14, d1[3]               @ q10 = U * u2b             (blue)
.endm

/* Add the scaled luma (q14 = even pixels, q15 = odd pixels) to one
 * chroma term \pre, then narrow to u8 with rounding and saturation
 * (vqrshrun ..., #1 = rounding right shift by 1, clamp to 0..255). */
.macro compute_color dst_comp1 dst_comp2 pre
    vadd.s16            q1, q14, \pre
    vadd.s16            q2, q15, \pre
    vqrshrun.s16        \dst_comp1, q1, #1
    vqrshrun.s16        \dst_comp2, q2, #1
.endm

/* Produce the four components for 16 pixels: {r,g,b,a}1 receive the
 * even-indexed pixels, {r,g,b,a}2 the odd-indexed ones (luma was
 * de-interleaved by vld2).  Alpha is forced to fully opaque. */
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
    compute_color       \r1, \r2, q8
    compute_color       \g1, \g2, q9
    compute_color       \b1, \b2, q10
    vmov.u8             \a1, #255
    vmov.u8             \a2, #255
.endm

/* Convert 16 luma samples (d14/d15) plus the current chroma terms into
 * 16 RGBA pixels and store them at \dst (post-incremented).  \ofmt
 * selects which of d6..d9 / d10..d13 holds R, G, B and A so that the
 * byte-interleaving vst4.8 emits the requested component order. */
.macro compute dst ofmt
    vshll.u8            q14, d14, #3                  @ q14 = Y << 3 (even pixels)
    vshll.u8            q15, d15, #3                  @ q15 = Y << 3 (odd pixels)
    vsub.s16            q14, q12                      @ q14 = Y - y_offset
    vsub.s16            q15, q12                      @ q15 = Y - y_offset
    vqdmulh.s16         q14, q13                      @ q14 = (Y - y_offset) * y_coeff
    vqdmulh.s16         q15, q13                      @ q15 = (Y - y_offset) * y_coeff

.ifc \ofmt,argb
    compute_rgba        d7, d8, d9, d6, d11, d12, d13, d10
.endif

.ifc \ofmt,rgba
    compute_rgba        d6, d7, d8, d9, d10, d11, d12, d13
.endif

.ifc \ofmt,abgr
    compute_rgba        d9, d8, d7, d6, d13, d12, d11, d10
.endif

.ifc \ofmt,bgra
    compute_rgba        d8, d7, d6, d9, d12, d11, d10, d13
.endif

    /* Re-interleave even/odd pixels into natural order
     * (register labels below describe the rgba case). */
    vzip.8              d6, d10                       @ d6 = R1..R8   d10 = R9..R16
    vzip.8              d7, d11                       @ d7 = G1..G8   d11 = G9..G16
    vzip.8              d8, d12                       @ d8 = B1..B8   d12 = B9..B16
    vzip.8              d9, d13                       @ d9 = A1..A8   d13 = A9..A16
    vst4.8              {q3, q4}, [\dst,:128]!        @ store pixels 1..8  (dst 16-byte aligned)
    vst4.8              {q5, q6}, [\dst,:128]!        @ store pixels 9..16
.endm

/* Load 16 luma bytes from \src (de-interleaved into d14/d15 by vld2)
 * and emit 16 converted pixels at \dst. */
.macro process_1l_internal dst src ofmt
    vld2.8              {d14, d15}, [\src]!           @ q7 = Y: even samples in d14, odd in d15
    compute             \dst, \ofmt
.endm

/* One output row per chroma line: used by the 4:2:2 path. */
.macro process_1l ofmt
    compute_premult
    process_1l_internal r2, r4, \ofmt
.endm

/* Two output rows sharing one chroma line: used by the 4:2:0 paths.
 * r2/r4 = dst/srcY of the first row, r11/r12 = of the second row. */
.macro process_2l ofmt
    compute_premult
    process_1l_internal r2,  r4,  \ofmt
    process_1l_internal r11, r12, \ofmt
.endm

/* Prologue for the NV12/NV21 entry points: save registers, fetch the
 * stack arguments and precompute the end-of-row paddings.
 * Stack offset #104 = 10 pushed core regs (40) + q4-q7 (64) bytes. */
.macro load_args_nv12
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4,  [sp, #104]               @ r4  = srcY
    ldr                 r5,  [sp, #108]               @ r5  = linesizeY
    ldr                 r6,  [sp, #112]               @ r6  = srcC
    ldr                 r7,  [sp, #116]               @ r7  = linesizeC
    ldr                 r8,  [sp, #120]               @ r8  = table
    ldr                 r9,  [sp, #124]               @ r9  = y_offset
    ldr                 r10, [sp, #128]               @ r10 = y_coeff
    vdup.16             d0, r10                       @ d0  = y_coeff
    vld1.16             {d1}, [r8]                    @ d1  = *table (v2r, u2g, v2g, u2b)
    add                 r11, r2, r3                   @ r11 = dst  + linesize  (dst2)
    add                 r12, r4, r5                   @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    sub                 r3, r3, r0, lsl #2            @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                    @ r5 = linesizeY * 2 - width     (paddingY)
    sub                 r7, r7, r0                    @ r7 = linesizeC     - width     (paddingC)
.endm

/* NV21 takes the same arguments as NV12; only the in-memory chroma
 * byte order differs (handled in load_chroma_nv21). */
.macro load_args_nv21
    load_args_nv12
.endm

/* Prologue for the planar 4:2:0 entry points.  linesizeU/linesizeV are
 * left on the stack and re-read every row pair in
 * increment_and_test_yuv420p -- no free register to cache them. */
.macro load_args_yuv420p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4,  [sp, #104]               @ r4  = srcY
    ldr                 r5,  [sp, #108]               @ r5  = linesizeY
    ldr                 r6,  [sp, #112]               @ r6  = srcU
    ldr                 r8,  [sp, #128]               @ r8  = table
    ldr                 r9,  [sp, #132]               @ r9  = y_offset
    ldr                 r10, [sp, #136]               @ r10 = y_coeff
    vdup.16             d0, r10                       @ d0  = y_coeff
    vld1.16             {d1}, [r8]                    @ d1  = *table (v2r, u2g, v2g, u2b)
    add                 r11, r2, r3                   @ r11 = dst  + linesize  (dst2)
    add                 r12, r4, r5                   @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    sub                 r3, r3, r0, lsl #2            @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                    @ r5 = linesizeY * 2 - width     (paddingY)
    ldr                 r10, [sp, #120]               @ r10 = srcV (y_coeff already consumed)
.endm

/* Prologue for the planar 4:2:2 entry points: one chroma line per luma
 * line, so no second-row pointers and no doubled strides. */
.macro load_args_yuv422p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4,  [sp, #104]               @ r4  = srcY
    ldr                 r5,  [sp, #108]               @ r5  = linesizeY
    ldr                 r6,  [sp, #112]               @ r6  = srcU
    ldr                 r7,  [sp, #116]               @ r7  = linesizeU
    ldr                 r12, [sp, #124]               @ r12 = linesizeV
    ldr                 r8,  [sp, #128]               @ r8  = table
    ldr                 r9,  [sp, #132]               @ r9  = y_offset
    ldr                 r10, [sp, #136]               @ r10 = y_coeff
    vdup.16             d0, r10                       @ d0  = y_coeff
    vld1.16             {d1}, [r8]                    @ d1  = *table (v2r, u2g, v2g, u2b)
    sub                 r3, r3, r0, lsl #2            @ r3  = linesize  - width * 4 (padding)
    sub                 r5, r5, r0                    @ r5  = linesizeY - width     (paddingY)
    sub                 r7, r7, r0, lsr #1            @ r7  = linesizeU - width / 2 (paddingU)
    sub                 r12, r12, r0, lsr #1          @ r12 = linesizeV - width / 2 (paddingV)
    ldr                 r10, [sp, #120]               @ r10 = srcV (y_coeff already consumed)
.endm

/* Load 8 interleaved U/V pairs (chroma for 16 pixels) and widen to
 * 16 bit, pre-shifted left by 3 for the fixed-point math. */
.macro load_chroma_nv12
    pld [r12, #64*3]                                  @ prefetch second luma row

    vld2.8              {d2, d3}, [r6]!               @ NV12 order UVUV...: d2 = U, d3 = V
    vshll.u8            q14, d2, #3                   @ q14 = U << 3
    vshll.u8            q15, d3, #3                   @ q15 = V << 3
.endm

/* Same as load_chroma_nv12 with the chroma bytes swapped (VUVU...). */
.macro load_chroma_nv21
    pld [r12, #64*3]                                  @ prefetch second luma row

    vld2.8              {d2, d3}, [r6]!               @ NV21 order VUVU...: d2 = V, d3 = U
    vshll.u8            q14, d3, #3                   @ q14 = U << 3
    vshll.u8            q15, d2, #3                   @ q15 = V << 3
.endm

/* Planar chroma load for 4:2:0: 8 U bytes from r6, 8 V bytes from r10. */
.macro load_chroma_yuv420p
    pld [r10, #64*3]                                  @ prefetch V plane
    pld [r12, #64*3]                                  @ prefetch second luma row

    vld1.8              d2, [r6]!                     @ d2 = U line
    vld1.8              d3, [r10]!                    @ d3 = V line
    vshll.u8            q14, d2, #3                   @ q14 = U << 3
    vshll.u8            q15, d3, #3                   @ q15 = V << 3
.endm

/* Planar chroma load for 4:2:2 (no second luma row to prefetch). */
.macro load_chroma_yuv422p
    pld [r10, #64*3]                                  @ prefetch V plane

    vld1.8              d2, [r6]!                     @ d2 = U line
    vld1.8              d3, [r10]!                    @ d3 = V line
    vshll.u8            q14, d2, #3                   @ q14 = U << 3
    vshll.u8            q15, d3, #3                   @ q15 = V << 3
.endm

/* End-of-row-pair bookkeeping for NV12: skip the paddings and count
 * down the rows.  The subs sets the flags tested by the outer bgt. */
.macro increment_and_test_nv12
    add                 r11, r11, r3                  @ dst2  += padding
    add                 r12, r12, r5                  @ srcY2 += paddingY
    add                 r6, r6, r7                    @ srcC  += paddingC
    subs                r1, r1, #2                    @ height -= 2 (two rows per pass)
.endm

/* NV21 shares the NV12 pointer layout, hence the same bookkeeping. */
.macro increment_and_test_nv21
    increment_and_test_nv12
.endm

/* 4:2:0 planar row-pair bookkeeping: chroma paddings are recomputed
 * from the stacked linesizes each time, using r7 as scratch. */
.macro increment_and_test_yuv420p
    add                 r11, r11, r3                  @ dst2  += padding
    add                 r12, r12, r5                  @ srcY2 += paddingY
    ldr                 r7, [sp, #116]                @ r7 = linesizeU
    sub                 r7, r7, r0, lsr #1            @ r7 = linesizeU - width / 2 (paddingU)
    add                 r6, r6, r7                    @ srcU += paddingU
    ldr                 r7, [sp, #124]                @ r7 = linesizeV
    sub                 r7, r7, r0, lsr #1            @ r7 = linesizeV - width / 2 (paddingV)
    add                 r10, r10, r7                  @ srcV += paddingV
    subs                r1, r1, #2                    @ height -= 2 (two rows per pass)
.endm

/* 4:2:2 row bookkeeping: paddings were precomputed in load_args_yuv422p. */
.macro increment_and_test_yuv422p
    add                 r6, r6, r7                    @ srcU += paddingU
    add                 r10, r10, r12                 @ srcV += paddingV
    subs                r1, r1, #1                    @ height -= 1 (one row per pass)
.endm

/* Per-format row body: 4:2:0 formats emit two rows per chroma line. */
.macro process_nv12 ofmt
    process_2l          \ofmt
.endm

.macro process_nv21 ofmt
    process_2l          \ofmt
.endm

.macro process_yuv420p ofmt
    process_2l          \ofmt
.endm

/* 4:2:2 has full vertical chroma resolution: one row per chroma line. */
.macro process_yuv422p ofmt
    process_1l          \ofmt
.endm

/* Emit one ff_<ifmt>_to_<ofmt>_neon function.  Outer loop (1:) walks
 * the rows, inner loop (2:) converts 16 pixels per iteration, so the
 * caller presumably guarantees a compatible width -- no tail handling
 * here; confirm against the C-side dispatch. */
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    load_args_\ifmt
    vmov.u16            q11, #1024                    @ q11 = 128 << 3 (chroma bias)
    vdup.16             q12, r9                       @ q12 = y_offset
    vmov                d26, d0                       @ q13 = y_coeff...
    vmov                d27, d0                       @ ...copied into both halves
1:
    mov                 r8, r0                        @ r8 = width (inner-loop counter)
2:
    pld [r6, #64*3]                                   @ prefetch chroma
    pld [r4, #64*3]                                   @ prefetch luma
    vmov.i8             d10, #128                     @ NOTE(review): d10 (q5 low half) is
                                                      @ rewritten by compute_premult before
                                                      @ any read -- looks redundant; confirm
                                                      @ before removing
    load_chroma_\ifmt
    process_\ifmt       \ofmt
    subs                r8, r8, #16                   @ width -= 16
    bgt                 2b
    add                 r2, r2, r3                    @ dst  += padding
    add                 r4, r4, r5                    @ srcY += paddingY
    increment_and_test_\ifmt
    bgt                 1b                            @ branches on the flags set by subs above
    vpop                {q4-q7}
    pop                 {r4-r12, lr}
    mov                 pc, lr                        @ return
endfunc
.endm

/* Instantiate the four output-order variants for one input format. */
.macro declare_rgb_funcs ifmt
    declare_func        \ifmt, argb
    declare_func        \ifmt, rgba
    declare_func        \ifmt, abgr
    declare_func        \ifmt, bgra
.endm

declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p