;******************************************************************************
;* VC1 motion compensation optimizations
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_9
cextern pw_128

SECTION .text

%if HAVE_MMX_INLINE

; XXX some of these macros are not used right now, but they will be in the
; future when more functions are ported.

%macro OP_PUT 2 ; dst, src
%endmacro

%macro OP_AVG 2 ; dst, src
    pavgb          %1, %2
%endmacro

%macro NORMALIZE_MMX 1 ; shift
    paddw          m3, m7 ; +bias-r
    paddw          m4, m7 ; +bias-r
    psraw          m3, %1
    psraw          m4, %1
%endmacro

%macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb       m3, m4
    %1             m3, [%2]
    mova         [%2], m3
%endmacro

%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1             m3, [%2]
    %1             m4, [%2 + mmsize]
    mova         [%2], m3
    mova [mmsize + %2], m4
%endmacro

; see MSPEL_FILTER13_CORE for use as UNPACK macro
%macro DO_UNPACK 1 ; reg
    punpcklbw      %1, m0
%endmacro
%macro DONT_UNPACK 1 ; reg
%endmacro

; Compute the rounder 32-r or 8-r and unpack it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
    movd           m7, %1
    punpcklwd      m7, m7
    punpckldq      m7, m7
%endmacro

; Filter one output row: compute 9*(b+c) - (a+d) for the vertical
; (-1, 9, 9, -1) taps, add the rounder, shift, and store the 16-bit
; result at dstq + off.
%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw         m%3, m%4
    movh          m%2, [srcq + stride_neg2]
    pmullw        m%3, m6
    punpcklbw     m%2, m0
    movh          m%5, [srcq + strideq]
    psubw         m%3, m%2
    punpcklbw     m%5, m0
    paddw         m%3, m7
    psubw         m%3, m%5
    psraw         m%3, shift
    movu  [dstq + %1], m%3
    add          srcq, strideq
%endmacro

INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Sacrificing m6 makes it possible to pipeline loads from src
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP 3, 4, 5
    %define rnd r3mp
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP 4, 5, 6
    %define rnd r3d
    ; We need shift either in memory or in an mm reg as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    mova shift, r4q
%endif ; WIN64
%endif ; X86_32
%define stride_neg2 t0q
%define stride_9minus4 t1q
%define i t2q
    mov   stride_neg2, strideq
    neg   stride_neg2
    add   stride_neg2, stride_neg2
    lea stride_9minus4, [strideq * 9 - 4]
    mov             i, 3
    LOAD_ROUNDER_MMX rnd
    mova           m6, [pw_9]
    pxor           m0, m0
.loop:
    movh           m2, [srcq]
    add          srcq, strideq
    movh           m3, [srcq]
    punpcklbw      m2, m0
    punpcklbw      m3, m0
    SHIFT2_LINE     0, 1, 2, 3, 4
    SHIFT2_LINE    24, 2, 3, 4, 1
    SHIFT2_LINE    48, 3, 4, 1, 2
    SHIFT2_LINE    72, 4, 1, 2, 3
    SHIFT2_LINE    96, 1, 2, 3, 4
    SHIFT2_LINE   120, 2, 3, 4, 1
    SHIFT2_LINE   144, 3, 4, 1, 2
    SHIFT2_LINE   168, 4, 1, 2, 3
    sub          srcq, stride_9minus4
    add          dstq, 8
    dec             i
    jnz .loop
    REP_RET
%undef rnd
%undef shift
%undef stride_neg2
%undef stride_9minus4
%undef i

; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
;                                  const int16_t *src, int rnd);
; The data is already unpacked, so some operations can be done directly from
; memory.
%macro HOR_16B_SHIFT2 2 ; op, opname
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    mov            hq, 8
    sub          srcq, 2
    sub          rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    LOAD_ROUNDER_MMX rndd
    mova           m5, [pw_9]
    mova           m6, [pw_128]
    pxor           m0, m0

.loop:
    mova           m1, [srcq + 2 * 0]
    mova           m2, [srcq + 2 * 0 + mmsize]
    mova           m3, [srcq + 2 * 1]
    mova           m4, [srcq + 2 * 1 + mmsize]
    paddw          m3, [srcq + 2 * 2]
    paddw          m4, [srcq + 2 * 2 + mmsize]
    paddw          m1, [srcq + 2 * 3]
    paddw          m2, [srcq + 2 * 3 + mmsize]
    pmullw         m3, m5
    pmullw         m4, m5
    psubw          m3, m1
    psubw          m4, m2
    NORMALIZE_MMX  7
    ; remove bias
    paddw          m3, m6
    paddw          m4, m6
    TRANSFER_DO_PACK %1, dstq
    add          srcq, 24
    add          dstq, strideq
    dec            hq
    jnz .loop

    RET
%endmacro

INIT_MMX mmx
HOR_16B_SHIFT2 OP_PUT, put

INIT_MMX mmxext
HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE

%macro INV_TRANS_INIT 0
    movsxdifnidn linesizeq, linesized
    movd           m0, blockd
    SPLATW         m0, m0
    pxor           m1, m1
    psubw          m1, m0
    packuswb       m0, m0
    packuswb       m1, m1

    DEFINE_ARGS dest, linesize, linesize3
    lea    linesize3q, [linesizeq*3]
%endmacro

%macro INV_TRANS_PROCESS 1
    mov%1          m2, [destq+linesizeq*0]
    mov%1          m3, [destq+linesizeq*1]
    mov%1          m4, [destq+linesizeq*2]
    mov%1          m5, [destq+linesize3q]
    paddusb        m2, m0
    paddusb        m3, m0
    paddusb        m4, m0
    paddusb        m5, m0
    psubusb        m2, m1
    psubusb        m3, m1
    psubusb        m4, m1
    psubusb        m5, m1
    mov%1 [linesizeq*0+destq], m2
    mov%1 [linesizeq*1+destq], m3
    mov%1 [linesizeq*2+destq], m4
    mov%1 [linesize3q +destq], m5
%endmacro

; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
INIT_MMX mmxext
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    movsx         r3d, WORD [blockq]
    mov        blockd, r3d               ; dc
    shl        blockd, 4                 ; 16 * dc
    lea        blockd, [blockq+r3+4]     ; 17 * dc + 4
    sar        blockd, 3                 ; >> 3
    mov           r3d, blockd            ; dc
    shl        blockd, 4                 ; 16 * dc
    lea        blockd, [blockq+r3+64]    ; 17 * dc + 64
    sar        blockd, 7                 ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    movsx         r3d, WORD [blockq]
    mov        blockd, r3d               ; dc
    shl        blockd, 4                 ; 16 * dc
    lea        blockd, [blockq+r3+4]     ; 17 * dc + 4
    sar        blockd, 3                 ; >> 3
    shl        blockd, 2                 ;  4 * dc
    lea        blockd, [blockq*3+64]     ; 12 * dc + 64
    sar        blockd, 7                 ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    movsx      blockd, WORD [blockq]     ; dc
    lea        blockd, [blockq*3+1]      ;  3 * dc + 1
    sar        blockd, 1                 ; >> 1
    mov           r3d, blockd            ; dc
    shl        blockd, 4                 ; 16 * dc
    lea        blockd, [blockq+r3+64]    ; 17 * dc + 64
    sar        blockd, 7                 ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    movsx      blockd, WORD [blockq]     ; dc
    lea        blockd, [blockq*3+1]      ;  3 * dc + 1
    sar        blockd, 1                 ; >> 1
    lea        blockd, [blockq*3+16]     ;  3 * dc + 16
    sar        blockd, 5                 ; >> 5

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS a
    RET