;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; 128-bit constant whose low qword is 1; added to a shift-count register to
; turn log2_denom into log2_denom+1.
sq_1: dq 1
      dq 0

cextern pw_1
cextern pw_1023
%define pw_pixel_max pw_1023            ; upper clamp applied to every output word

SECTION .text

;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;
; The same argument layout is used by the 8- and 4-pixel wide variants below.
; Register roles after WEIGHT_PROLOGUE + WEIGHT_SETUP:
;   r0 = dst, r1 = stride (sign-extended), r2d = row counter
;   m0 = 1<<log2_denom in every word
;   m2 = log2_denom+1 (shift count)
;   m3 = per-dword pair { weight*2, (offset<<3)+1 }
;   m4 = pw_pixel_max, m7 = 0 (non-SSE4 builds only)
;-----------------------------------------------------------------------------

; Load the arguments that are not already in their registers.
; log2_denom (argument 3) is deliberately left alone: WEIGHT_SETUP reads it
; through r3m.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn     r0, r0mp
    movifnidn    r1d, r1m
    movsxdifnidn  r1, r1d   ; stride is a C int but is added to a pointer:
                            ; do not rely on the upper register half
    movifnidn    r2d, r2m
    movifnidn    r4d, r4m
    movifnidn    r5d, r5m
%endmacro

; Build the loop-invariant vectors. Clobbers r4 and r5.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r5, 19       ; *8, move to upper half of dword
    add       r4d, r4d      ; weight*2
    movzx     r4d, r4w      ; keep only the low word: a sign-extended negative
                            ; weight would otherwise borrow 1 from the upper
                            ; half and drop the rounding term
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; low word: weight<<1, high word: 1+(offset<<3)
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower bound handed to CLIPW
%endif
%endmacro

; Weight eight pixels and leave the clipped words in m5.
;   WEIGHT_OP off        : eight pixels from [r0+off]
;   WEIGHT_OP off0, off1 : four pixels from [r0+off0] (low half of m5) and
;                          four from [r0+off1] (high half of m5)
; Each pixel word is interleaved with the 1<<log2_denom word from m0, so the
; pmaddwd against m3 yields pixel*weight*2 + ((offset<<3)+1)<<log2_denom.
; Clobbers m5 and m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6      ; unsigned saturation, so only the upper bound
    pminsw      m5, m4      ; still has to be enforced
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4  ; bounds passed: m7 = 0, m4 = pw_pixel_max
%endif
%endmacro

; 16 pixels wide: two 16-byte stores per row.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; 8 pixels wide: one 16-byte store per row.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova     [r0], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; 4 pixels wide: two rows are processed per iteration, so the row counter is
; halved and r3 holds 2*stride. r3 may only be overwritten after WEIGHT_SETUP
; has read log2_denom through r3m.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1
    WEIGHT_SETUP
    lea         r3, [r1*2]
.nextrow:
    WEIGHT_OP    0, r1
    movh      [r0], m5
    movhps [r0+r1], m5
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;
; The same argument layout is used by the 8- and 4-pixel wide variants below.
; Register roles after BIWEIGHT_PROLOGUE + BIWEIGHT_SETUP:
;   r0 = dst, r1 = src, r2 = stride (sign-extended), r3d = row counter
;   m3 = pw_pixel_max
;   m4 = per-dword pair { weightd, weights }
;   m5 = (((offset<<2)+1)|1)<<log2_denom in every dword
;   m6 = log2_denom+1 (shift count)
;   m7 = 0 (non-SSE4 builds only)
;-----------------------------------------------------------------------------
; Scratch register that receives `offset`. On x86-32 it aliases r3, which is
; why BIWEIGHT_SETUP reloads the height into r3d only after t0 has been
; consumed.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; Load the arguments that are not already in their registers.
; height (argument 3) and log2_denom (argument 4) are picked up later by
; BIWEIGHT_SETUP.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn     r0, r0mp
    movifnidn     r1, r1mp
    movifnidn    r2d, r2m
    movsxdifnidn  r2, r2d   ; stride is a C int but is added to pointers:
                            ; do not rely on the upper register half
    movifnidn    r5d, r5m
    movifnidn    r6d, r6m
    movifnidn    t0d, r7m
%endmacro

; Build the loop-invariant vectors. Clobbers r5, r6 and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1
    shl        r6, 16
    movzx     r5d, r5w      ; drop the sign-extension bits of weightd so they
                            ; cannot overwrite weights in the upper word
    or         r5, r6
    movd       m4, r5d      ; low word: weightd, high word: weights
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; safe to load now that t0 is dead
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower bound handed to CLIPW
%endif
%endmacro

; Blend eight dst/src pixel pairs and leave the clipped words in m0.
;   BIWEIGHT off        : eight pixels from [r0+off] and [r1+off]
;   BIWEIGHT off0, off1 : four pixels at off0 (low half of m0) and four at
;                         off1 (high half of m0)
; dst and src words are interleaved so that one pmaddwd against m4 yields
; dst*weightd + src*weights per pixel.
; Clobbers m0, m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova       m0, [r0+%1]
    mova       m1, [r1+%1]
    punpckhwd  m2, m0, m1
    punpcklwd  m0, m1
%else
    movq       m0, [r0+%1]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m2, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%endif
    pmaddwd    m0, m4
    pmaddwd    m2, m4
    paddd      m0, m5
    paddd      m2, m5
    psrad      m0, m6
    psrad      m2, m6
%if cpuflag(sse4)
    packusdw   m0, m2       ; unsigned saturation, so only the upper bound
    pminsw     m0, m3       ; still has to be enforced
%else
    packssdw   m0, m2
    CLIPW      m0, m7, m3   ; bounds passed: m7 = 0, m3 = pw_pixel_max
%endif
%endmacro

; 16 pixels wide: two 16-byte stores per row.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova [r0   ], m0
    BIWEIGHT 16
    mova [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; 8 pixels wide: one 16-byte store per row.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova [r0   ], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; 4 pixels wide: two rows are processed per iteration, so the row counter is
; halved and r4 holds 2*stride. r4 may only be overwritten after
; BIWEIGHT_SETUP has read log2_denom through r4m.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar        r3d, 1
    lea        r4, [r2*2]
.nextrow:
    BIWEIGHT    0, r2
    movh   [r0   ], m0
    movhps [r0+r2], m0
    add         r0, r4
    add         r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF