;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text


%macro MV0_PIXELS_MC8 0
    lea           r4, [r2*3   ]
    lea           r5, [r2*4   ]
.next4rows:
    movu          m0, [r1     ]
    movu          m1, [r1+r2  ]
    CHROMAMC_AVG  m0, [r0     ]
    CHROMAMC_AVG  m1, [r0+r2  ]
    mova  [r0     ], m0
    mova  [r0+r2  ], m1
    movu          m0, [r1+r2*2]
    movu          m1, [r1+r4  ]
    CHROMAMC_AVG  m0, [r0+r2*2]
    CHROMAMC_AVG  m1, [r0+r4  ]
    mova  [r0+r2*2], m0
    mova  [r0+r4  ], m1
    add           r1, r5
    add           r0, r5
    sub          r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc8(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
cglobal %1_h264_chroma_mc8_10, 6,7,8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
    REP_RET

.at_least_one_non_zero:
    mov          r6d, 2
    test         r5d, r5d
    je .x_interpolation
    mov           r6, r2        ; dxy = x ? 1 : stride
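    ; my != 0 at this point, so the second filter tap sits one row below
    ; (r6 = stride); the branch above kept r6 = 2 (one 10-bit pixel) for the
    ; horizontal-only case. The test below checks whether mx is also nonzero,
    ; i.e. whether the full 2-D (bilinear) filter is required.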
    test         r4d, r4d
    jne .xy_interpolation
.x_interpolation:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y
    movd          m5, r4d
    mova          m4, [pw_8]
    mova          m6, [pw_4]    ; mm6 = rnd >> 3
    SPLATW        m5, m5        ; mm5 = B = x
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movu          m0, [r1   ]   ; mm0 = src[0..7]
    movu          m2, [r1+r6]   ; mm2 = src[1..8]

    pmullw        m0, m4        ; mm0 = A * src[0..7]
    pmullw        m2, m5        ; mm2 = B * src[1..8]

    paddw         m0, m6
    paddw         m0, m2
    psrlw         m0, 3
    CHROMAMC_AVG  m0, [r0]
    mova        [r0], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add           r0, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.xy_interpolation: ; general case, bilinear
    movd          m4, r4m       ; x
    movd          m6, r5m       ; y

    SPLATW        m4, m4        ; mm4 = x words
    SPLATW        m6, m6        ; mm6 = y words
    psllw         m5, m4, 3     ; mm5 = 8x
    pmullw        m4, m6        ; mm4 = x * y
    psllw         m6, 3         ; mm6 = 8y
    paddw         m1, m5, m6    ; mm1 = 8x+8y
    mova          m7, m4        ; DD = x * y
    psubw         m5, m4        ; mm5 = B = 8x - xy
    psubw         m6, m4        ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m1        ; mm4 = A = xy - (8x+8y) + 64

    movu          m0, [r1  ]    ; mm0 = src[0..7]
    movu          m1, [r1+2]    ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    pmullw        m2, m0, m4
    pmullw        m1, m5
    paddw         m2, m1        ; mm2 = A * src[0..7] + B * src[1..8]

    movu          m0, [r1]
    movu          m1, [r1+2]
    pmullw        m3, m0, m6
    paddw         m2, m3        ; mm2 += C * src[0..7+stride]
    pmullw        m3, m1, m7
    paddw         m2, m3        ; mm2 += D * src[1..8+stride]

    paddw         m2, [pw_32]
    psrlw         m2, 6
    CHROMAMC_AVG  m2, [r0]
    mova        [r0], m2        ; dst[0..7] = (mm2 + 32) >> 6

    add           r0, r2
    dec          r3d
    jne .next2drow
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc4(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
    movq          %1, [r1  ]
    movq          m1, [r1+2]
    add           r1, r2
    pmullw        %1, m4
    pmullw        m1, m2
    paddw         m1, %1
    mova          %1, m1

    pmullw        %2, m5
    pmullw        m1, m3
    paddw         %2, [pw_32]
    paddw         m1, %2
    psrlw         m1, 6
    CHROMAMC_AVG  m1, %2, [r0]
    movq        [r0], m1
    add           r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
    movd          m2, r4m       ; x
    movd          m3, r5m       ; y
    mova          m4, [pw_8]
    mova          m5, m4
    SPLATW        m2, m2
    SPLATW        m3, m3
    psubw         m4, m2
    psubw         m5, m3

    movq          m0, [r1  ]
    movq          m6, [r1+2]
    add           r1, r2
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    MC4_OP m0, m6
    MC4_OP m6, m0
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_put/avg_h264_chroma_mc2(pixel *dst, pixel *src, ptrdiff_t stride,
;                                 int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d       ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5        ; mm5 = {A,B,A,B}
    punpckldq     m6, m6        ; mm6 = {C,D,C,D}
    pxor          m7, m7
    pshufw        m2, [r1], 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
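    ; (m1 held the upper source row packed as words {src[0],src[1],src[1],src[2]},
    ;  carried over from the previous iteration, so the single pmaddwd above
    ;  yields the top-row terms of both output pixels, A*src[0]+B*src[1] and
    ;  A*src[1]+B*src[2], as two dwords; the row below is handled the same way
    ;  with m6 = {C,D,C,D})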
    pshufw        m0, [r1], 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [pw_32]
    paddw         m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    CHROMAMC_AVG  m1, m3, [r0]
    movd        [r0], m1
    add           r0, r2
    dec          r3d
    jnz .nextrow
    REP_RET
%endmacro

%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
    movq          %2, %3
%endif
    pavgw         %1, %2
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
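
; For reference: with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and D = x*y,
; the kernels above all compute the usual H.264 chroma interpolation
;   dst[i] = (A*src[i] + B*src[i+1] + C*src_below[i] + D*src_below[i+1] + 32) >> 6
; on 10-bit samples stored as 16-bit words, where src_below is the next source
; row. The mc8 special cases reduce this to a plain copy (mx == my == 0) or a
; single 1-D pass, dst[i] = ((8-k)*src[i] + k*src[i+step] + 4) >> 3, with k the
; nonzero fraction and step either one pixel or one row. The avg variants
; additionally average the result with the existing dst contents via pavgw.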