;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;
; Register roles, following the argument order of the prototypes above:
;   weight:   r0 = dst, r1 = stride, r2 = height, r3 = log2_denom,
;             r4 = weight, r5 = offset
;   biweight: r0 = dst, r1 = src, r2 = stride, r3 = height,
;             r4 = log2_denom, r5 = weightd, r6 = weights, r7m = offset
;
; NOTE(review): stride is used as a full-width register in address arithmetic
; (add r0, r1 / lea r3, [r1*2]); this relies on the caller passing it with
; clean upper bits on x86-64 -- confirm against the C prototype in use.
;-----------------------------------------------------------------------------

; WEIGHT_SETUP
; Prepares the constants used by WEIGHT_OP:
;   m3 = weight, broadcast to every 16-bit lane
;   m5 = ((2*offset + 1) << log2_denom) >> 1, low word broadcast to every lane
;   m6 = log2_denom (shift count for psraw)
;   m7 = 0 (used to zero-extend bytes to words)
; Clobbers r5.
%macro WEIGHT_SETUP 0
    add        r5, r5               ; r5 = 2*offset
    inc        r5                   ; r5 = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

; WEIGHT_OP off_a, off_b
; Loads two half-register groups of pixels from [r0+off_a] and [r0+off_b],
; widens them to words, and computes per pixel
;   (pixel * m3 + m5) >> m6
; with a saturating add, then packs both groups to unsigned bytes in m0
; (group A in the low half, group B in the high half).
; Expects the constants set up by WEIGHT_SETUP. Clobbers m0, m1.
%macro WEIGHT_OP 2
    movh          m0, [r0+%1]
    movh          m1, [r0+%2]
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m3
    pmullw        m1, m3
    paddsw        m0, m5
    paddsw        m1, m5
    psraw         m0, m6
    psraw         m1, m6
    packuswb      m0, m1
%endmacro

; h264_weight_16, MMX version: a 16-pixel row is handled as two 8-byte stores,
; each produced from two WEIGHT_OP groups. One row per iteration, r2d rows.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0,  4
    mova     [r0  ], m0
    WEIGHT_OP 8, 12
    mova     [r0+8], m0
    add        r0, r1               ; next row
    dec        r2d
    jnz .nextrow
    REP_RET

; WEIGHT_FUNC_MM width, xmm_regs
; Emits h264_weight_<width> where one full vector register covers one row:
; WEIGHT_OP produces mmsize bytes, stored with a single mova per row.
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1               ; next row
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; WEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_weight_<width> where a row is only half a vector register wide.
; Two rows are processed per iteration: row n ends up in the low half of m0
; and row n+1 in the high half. The loop counter is therefore height/2 and
; the pointer advances by 2*stride (kept in r3, which no longer holds
; log2_denom after WEIGHT_SETUP has copied it to m6).
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1                ; iterations = height / 2
    lea        r3, [r1*2]           ; r3 = 2 * stride
.nextrow:
    WEIGHT_OP 0, r1
    movh     [r0], m0               ; row n   (low half)
%if mmsize == 16
    movhps   [r0+r1], m0            ; row n+1 (high half)
%else
    psrlq      m0, 32               ; bring the high half down
    movh     [r0+r1], m0            ; row n+1
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SETUP
; Prepares the constants for the bi-weighted kernels.
;
; Scalar part:
;   off       = (offset + 1) | 1
;   log2_denom += 1
;   if weightd == 128 or weights == 128:
;       both weights and off are halved and the log2_denom increment is undone
;       (presumably so the weights fit in a signed byte for pmaddubsw in the
;        SSSE3 variant -- NOTE(review): rationale not stated in this file; the
;        halving is applied in every variant).
;
; Vector part:
;   m5 = (off << m6) >> 1, low word broadcast to every 16-bit lane
;   m6 = adjusted log2_denom (shift count for psraw)
;   SSSE3:      m4 = byte pairs (weightd, weights) broadcast to every word
;   otherwise:  m3 = weightd, m4 = weights, each broadcast to every word;
;               m7 = 0
;
; The offset is loaded from r7m into off_regd, which is r7d on x86-64 and r3d
; on x86-32; on x86-32 this overwrites r3d, so callers reload height (r3m)
; after invoking this macro.
; Clobbers r4d, r5d, r6d, off_regd and (SSSE3) m0.
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov  off_regd, r7m
    add  off_regd, 1
    or   off_regd, 1
    add       r4d, 1
    cmp       r6d, 128
    je .nonnormal
    cmp       r5d, 128
    jne .normal
.nonnormal:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    sub       r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; low word = (weights << 8) | weightd
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT_STEPA acc, tmp, off
; m<acc> = dst[off..] * m3 + src[off..] * m4 (16-bit lanes, saturating add),
; for one half-register group of pixels. Clobbers m<tmp>.
; Expects m3/m4/m7 from the non-SSSE3 branch of BIWEIGHT_SETUP.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; BIWEIGHT_STEPB
; Finishes two BIWEIGHT_STEPA accumulators held in m0 and m1: adds the
; rounding/offset term m5, shifts right by m6 and packs both to unsigned
; bytes in m0.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; h264_biweight_16, MMX version: each 16-pixel row is produced as two 8-byte
; stores. One row per iteration, r3d rows.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3d may have been off_regd
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    add        r0, r2               ; next row (dst)
    add        r1, r2               ; next row (src)
    dec        r3d
    jnz .nextrow
    REP_RET

; BIWEIGHT_FUNC_MM width, xmm_regs
; Emits h264_biweight_<width> where one full vector register covers one row.
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3d may have been off_regd
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2               ; next row (dst)
    add        r1, r2               ; next row (src)
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
; Emits h264_biweight_<width> where a row is only half a vector register wide.
; Two rows are processed per iteration (row n in the low half of m0, row n+1
; in the high half), so the loop runs height/2 times and both pointers
; advance by 2*stride (kept in r4, free after BIWEIGHT_SETUP moved
; log2_denom into m6).
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3d may have been off_regd
    ; height is a 32-bit int and the loop counts on r3d, so shift only the low
    ; 32 bits: a 64-bit shift would pull bit 32 (unspecified on x86-64) into
    ; bit 31 of the counter. Matches "sar r2d, 1" in WEIGHT_FUNC_HALF_MM.
    sar       r3d, 1                ; iterations = height / 2
    lea        r4, [r2*2]           ; r4 = 2 * stride
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh       [r0], m0             ; row n   (low half)
%if mmsize == 16
    movhps     [r0+r2], m0          ; row n+1 (high half)
%else
    psrlq      m0, 32               ; bring the high half down
    movh       [r0+r2], m0          ; row n+1
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SSSE3_OP
; m0 and m2 each hold interleaved (dst, src) pixel bytes. pmaddubsw multiplies
; them (as unsigned bytes) by the interleaved signed (weightd, weights) bytes
; in m4 and sums each pair, yielding dst*weightd + src*weights per pixel in
; 16-bit lanes. The rounding/offset term m5 is then added, the result shifted
; right by m6, and both registers are packed to unsigned bytes in m0.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; h264_biweight_16, SSSE3 version: one 16-pixel row per iteration.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3d may have been off_regd

.nextrow:
    movh       m0, [r0]             ; dst[0..7]
    movh       m2, [r0+8]           ; dst[8..15]
    movh       m3, [r1+8]           ; src[8..15]
    ; NOTE(review): memory operand on a legacy-SSE instruction -- requires
    ; [r1] to be 16-byte aligned; confirm callers guarantee this for src.
    punpcklbw  m0, [r1]             ; interleave dst/src, pixels 0..7
    punpcklbw  m2, m3               ; interleave dst/src, pixels 8..15
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2               ; next row (dst)
    add        r1, r2               ; next row (src)
    dec        r3d
    jnz .nextrow
    REP_RET

; h264_biweight_8, SSSE3 version: two 8-pixel rows per iteration (row n via
; m0, row n+1 via m2), so the loop runs height/2 times and both pointers
; advance by 2*stride (r4).
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m              ; (re)load height; r3d may have been off_regd
    ; 32-bit shift: height is an int and the loop counts on r3d; a 64-bit
    ; shift could move an unspecified bit 32 into the counter on x86-64.
    sar       r3d, 1                ; iterations = height / 2
    lea        r4, [r2*2]           ; r4 = 2 * stride

.nextrow:
    movh       m0, [r0]             ; dst, row n
    movh       m1, [r1]             ; src, row n
    movh       m2, [r0+r2]          ; dst, row n+1
    movh       m3, [r1+r2]          ; src, row n+1
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0             ; row n   (low half)
    movhps     [r0+r2], m0          ; row n+1 (high half)
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET