; XVID MPEG-4 VIDEO CODEC
;
; Conversion from gcc syntax to x264asm syntax with modifications
; by Christophe Gisquet <christophe.gisquet@gmail.com>
;
; =========== SSE2 inverse discrete cosine transform ===========
;
; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
;
; Conversion to gcc syntax with modifications
; by Alexander Strange <astrange@ithinksw.com>
;
; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
;
; Vertical pass is an implementation of the scheme:
; Loeffler C., Ligtenberg A., and Moschytz C.S.:
; Practical Fast 1D DCT Algorithm with Eleven Multiplications,
; Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication
; (see also Intel's Application Note 922:
; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; Copyright (C) 1999 Intel Corporation)
;
; More details at http://skal.planet-d.net/coding/dct.html
;
; ======= MMX and XMM forward discrete cosine transform =======
;
; Copyright(C) 2001 Peter Ross <pross@xvid.org>
;
; Originally provided by Intel at AP-922
; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
; (see more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm),
; though only in a limited form.  A new macro implements the column
; part for a precise iDCT; the routine's precision now satisfies
; IEEE standard 1180-1990.
;
; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
;
; http://www.elecard.com/peter/idct.html
; http://www.linuxvideo.org/mpeg2dec/
;
; These examples contain code fragments for the first stage iDCT 8x8
; (for rows) and the first stage DCT 8x8 (for columns).
;
; Conversion to gcc syntax by Michael Niedermayer
;
; ======================================================================
;
; This file is part of FFmpeg.
;
; FFmpeg is free software; you can redistribute it and/or
; modify it under the terms of the GNU Lesser General Public
; License as published by the Free Software Foundation; either
; version 2.1 of the License, or (at your option) any later version.
;
; FFmpeg is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Lesser General Public License for more details.
;
; You should have received a copy of the GNU Lesser General Public License
; along with FFmpeg; if not, write to the Free Software Foundation,
; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Similar to tg_1_16 etc. in the MMX code
tan1:  times 8 dw 13036 ; tan(1*pi/16) * 2^16
tan2:  times 8 dw 27146 ; tan(2*pi/16) * 2^16
tan3:  times 8 dw 43790 ; tan(3*pi/16) * 2^16
sqrt2: times 8 dw 23170 ; (cos(4*pi/16)/2) * 2^16, "ocos_4_16"
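; The four constants above are 2^16-scaled fixed-point values used with
; pmulhw, which keeps the high word of a signed 16x16 product, i.e.
; computes (a*c) >> 16.  tan3 (43790) does not fit a signed word: it is
; read back as 43790 - 65536 = -21746, i.e. (tan(3*pi/16) - 1) * 2^16,
; so the code adds the input back in after the multiply.  A scalar model
; of this trick (added for illustration, not part of the original
; source):
;
;     static int16_t mulhw(int16_t a, int16_t c)   /* pmulhw */
;     {
;         return (int16_t)(((int32_t)a * c) >> 16);
;     }
;     /* x * tan(3*pi/16), with (int16_t)43790 == -21746: */
;     int16_t t = mulhw(x, (int16_t)43790) + x;    /* x*(tan3-1) + x */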

; SSE2 tables
iTab1:  dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
        dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
        dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
        dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
iTab2:  dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
        dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
        dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
        dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
iTab3:  dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
        dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
        dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
        dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
        dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
        dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
        dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e

%if ARCH_X86_32
; -----------------------------------------------------------------------------
;
; The first stage iDCT 8x8 - inverse DCTs of rows
;
; -----------------------------------------------------------------------------
; The 8-point inverse DCT direct algorithm
; -----------------------------------------------------------------------------
;
; static const short w[32] = {
;     FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
;     FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
;     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
;     FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
;     FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
;     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
;     FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
;     FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
;
; #define DCT_8_INV_ROW(x, y)
; {
;     int a0, a1, a2, a3, b0, b1, b2, b3;
;
;     a0 = x[0] * w[0]  + x[2] * w[1]  + x[4] * w[2]  + x[6] * w[3];
;     a1 = x[0] * w[4]  + x[2] * w[5]  + x[4] * w[6]  + x[6] * w[7];
;     a2 = x[0] * w[8]  + x[2] * w[9]  + x[4] * w[10] + x[6] * w[11];
;     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
;     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
;     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
;     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
;     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
;
;     y[0] = SHIFT_ROUND(a0 + b0);
;     y[1] = SHIFT_ROUND(a1 + b1);
;     y[2] = SHIFT_ROUND(a2 + b2);
;     y[3] = SHIFT_ROUND(a3 + b3);
;     y[4] = SHIFT_ROUND(a3 - b3);
;     y[5] = SHIFT_ROUND(a2 - b2);
;     y[6] = SHIFT_ROUND(a1 - b1);
;     y[7] = SHIFT_ROUND(a0 - b0);
; }
;
; -----------------------------------------------------------------------------
;
; In this implementation the outputs of the iDCT-1D are multiplied
;     for rows 0,4 - by cos_4_16,
;     for rows 1,7 - by cos_1_16,
;     for rows 2,6 - by cos_2_16,
;     for rows 3,5 - by cos_3_16
; and are shifted to the left for better accuracy.
;
; For the constants used,
;     FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
;
; -----------------------------------------------------------------------------
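; Worked example of the FIX() scaling (added for illustration): with
; cos_4_16 = cos(4*pi/16) = 0.70711 and cos_1_16 = cos(pi/16) = 0.98079,
; the row tables store FIX(row_cos * coefficient_cos):
;     rows 0,4:  FIX(0.70711 * 0.70711) = FIX(0.5)     = 16384
;     rows 1,7:  FIX(0.98079 * 0.70711) = FIX(0.69352) = 22725
; These are exactly the leading constants of the tables: 16384 and 22725
; in the MMX tables below, 0x4000 in iTab1 and 0x58c5 in iTab2 above.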

; -----------------------------------------------------------------------------
; Tables for mmx processors
; -----------------------------------------------------------------------------

; Table for rows 0,4 - constants are multiplied by cos_4_16
tab_i_04_mmx: dw  16384,  16384,  16384, -16384 ; movq-> w06 w04 w02 w00
              dw  21407,   8867,   8867, -21407 ; w07 w05 w03 w01
              dw  16384, -16384,  16384,  16384 ; w14 w12 w10 w08
              dw  -8867,  21407, -21407,  -8867 ; w15 w13 w11 w09
              dw  22725,  12873,  19266, -22725 ; w22 w20 w18 w16
              dw  19266,   4520,  -4520, -12873 ; w23 w21 w19 w17
              dw  12873,   4520,   4520,  19266 ; w30 w28 w26 w24
              dw -22725,  19266, -12873, -22725 ; w31 w29 w27 w25
; Table for rows 1,7 - constants are multiplied by cos_1_16
              dw  22725,  22725,  22725, -22725 ; movq-> w06 w04 w02 w00
              dw  29692,  12299,  12299, -29692 ; w07 w05 w03 w01
              dw  22725, -22725,  22725,  22725 ; w14 w12 w10 w08
              dw -12299,  29692, -29692, -12299 ; w15 w13 w11 w09
              dw  31521,  17855,  26722, -31521 ; w22 w20 w18 w16
              dw  26722,   6270,  -6270, -17855 ; w23 w21 w19 w17
              dw  17855,   6270,   6270,  26722 ; w30 w28 w26 w24
              dw -31521,  26722, -17855, -31521 ; w31 w29 w27 w25
; Table for rows 2,6 - constants are multiplied by cos_2_16
              dw  21407,  21407,  21407, -21407 ; movq-> w06 w04 w02 w00
              dw  27969,  11585,  11585, -27969 ; w07 w05 w03 w01
              dw  21407, -21407,  21407,  21407 ; w14 w12 w10 w08
              dw -11585,  27969, -27969, -11585 ; w15 w13 w11 w09
              dw  29692,  16819,  25172, -29692 ; w22 w20 w18 w16
              dw  25172,   5906,  -5906, -16819 ; w23 w21 w19 w17
              dw  16819,   5906,   5906,  25172 ; w30 w28 w26 w24
              dw -29692,  25172, -16819, -29692 ; w31 w29 w27 w25
; Table for rows 3,5 - constants are multiplied by cos_3_16
              dw  19266,  19266,  19266, -19266 ; movq-> w06 w04 w02 w00
              dw  25172,  10426,  10426, -25172 ; w07 w05 w03 w01
              dw  19266, -19266,  19266,  19266 ; w14 w12 w10 w08
              dw -10426,  25172, -25172, -10426 ; w15 w13 w11 w09
              dw  26722,  15137,  22654, -26722 ; w22 w20 w18 w16
              dw  22654,   5315,  -5315, -15137 ; w23 w21 w19 w17
              dw  15137,   5315,   5315,  22654 ; w30 w28 w26 w24
              dw -26722,  22654, -15137, -26722 ; w31 w29 w27 w25

; -----------------------------------------------------------------------------
; Tables for xmm processors
; -----------------------------------------------------------------------------

; Table for rows 0,4 - constants are multiplied by cos_4_16
tab_i_04_xmm: dw  16384,  21407,  16384,   8867 ; movq-> w05 w04 w01 w00
              dw  16384,   8867, -16384, -21407 ; w07 w06 w03 w02
              dw  16384,  -8867,  16384, -21407 ; w13 w12 w09 w08
              dw -16384,  21407,  16384,  -8867 ; w15 w14 w11 w10
              dw  22725,  19266,  19266,  -4520 ; w21 w20 w17 w16
              dw  12873,   4520, -22725, -12873 ; w23 w22 w19 w18
              dw  12873, -22725,   4520, -12873 ; w29 w28 w25 w24
              dw   4520,  19266,  19266, -22725 ; w31 w30 w27 w26
; Table for rows 1,7 - constants are multiplied by cos_1_16
              dw  22725,  29692,  22725,  12299 ; movq-> w05 w04 w01 w00
              dw  22725,  12299, -22725, -29692 ; w07 w06 w03 w02
              dw  22725, -12299,  22725, -29692 ; w13 w12 w09 w08
              dw -22725,  29692,  22725, -12299 ; w15 w14 w11 w10
              dw  31521,  26722,  26722,  -6270 ; w21 w20 w17 w16
              dw  17855,   6270, -31521, -17855 ; w23 w22 w19 w18
              dw  17855, -31521,   6270, -17855 ; w29 w28 w25 w24
              dw   6270,  26722,  26722, -31521 ; w31 w30 w27 w26
; Table for rows 2,6 - constants are multiplied by cos_2_16
              dw  21407,  27969,  21407,  11585 ; movq-> w05 w04 w01 w00
              dw  21407,  11585, -21407, -27969 ; w07 w06 w03 w02
              dw  21407, -11585,  21407, -27969 ; w13 w12 w09 w08
              dw -21407,  27969,  21407, -11585 ; w15 w14 w11 w10
              dw  29692,  25172,  25172,  -5906 ; w21 w20 w17 w16
              dw  16819,   5906, -29692, -16819 ; w23 w22 w19 w18
              dw  16819, -29692,   5906, -16819 ; w29 w28 w25 w24
              dw   5906,  25172,  25172, -29692 ; w31 w30 w27 w26
; Table for rows 3,5 - constants are multiplied by cos_3_16
              dw  19266,  25172,  19266,  10426 ; movq-> w05 w04 w01 w00
              dw  19266,  10426, -19266, -25172 ; w07 w06 w03 w02
              dw  19266, -10426,  19266, -25172 ; w13 w12 w09 w08
              dw -19266,  25172,  19266, -10426 ; w15 w14 w11 w10
              dw  26722,  22654,  22654,  -5315 ; w21 w20 w17 w16
              dw  15137,   5315, -26722, -15137 ; w23 w22 w19 w18
              dw  15137, -26722,   5315, -15137 ; w29 w28 w25 w24
              dw   5315,  22654,  22654, -26722 ; w31 w30 w27 w26
%endif ; ~ARCH_X86_32

; Similar to rounder_0 in MMX code; the first four entries correspond to
; rounder_0..rounder_3 there, but the rest are laid out differently: the
; rounder for row 4 is at offset 6*16, row 5's is at 4*16, and rows 6/7
; share the entry at 5*16 (see the DCT_8_INV_ROW invocations below).
walkenIdctRounders: times 4 dd 65536
                    times 4 dd  3597
                    times 4 dd  2260
                    times 4 dd  1203
                    times 4 dd   120
                    times 4 dd   512
                    times 2 dd     0

; Used by the zero-row tests below
pb_127: times 8 db 127

SECTION .text

; Temporary storage before the column pass
%define ROW1 xmm6
%define ROW3 xmm4
%define ROW5 xmm5
%define ROW7 xmm7

%macro CLEAR_ODD 1
    pxor %1, %1
%endmacro
%macro PUT_ODD 1
    pshufhw %1, xmm2, 0x1B
%endmacro

%macro MOV32 2
%if ARCH_X86_32
    movdqa %2, %1
%endif
%endmacro

%macro CLEAR_EVEN 1
%if ARCH_X86_64
    CLEAR_ODD %1
%endif
%endmacro

%macro PUT_EVEN 1
%if ARCH_X86_64
    PUT_ODD %1
%else
    pshufhw xmm2, xmm2, 0x1B
    movdqa  %1, xmm2
%endif
%endmacro

%if ARCH_X86_64
%define ROW0  xmm8
%define REG0  ROW0
%define ROW2  xmm9
%define REG2  ROW2
%define ROW4  xmm10
%define REG4  ROW4
%define ROW6  xmm11
%define REG6  ROW6
%define XMMS  xmm12
%define SREG2 REG2
%define TAN3  xmm13
%define TAN1  xmm14
%else
%define ROW0  [BLOCK + 0*16]
%define REG0  xmm4
%define ROW2  [BLOCK + 2*16]
%define REG2  xmm4
%define ROW4  [BLOCK + 4*16]
%define REG4  xmm6
%define ROW6  [BLOCK + 6*16]
%define REG6  xmm6
%define XMMS  xmm2
%define SREG2 xmm7
%define TAN3  xmm0
%define TAN1  xmm2
%endif

%macro JZ 2
    test %1, %1
    jz   .%2
%endmacro

%macro JNZ 2
    test %1, %1
    jnz  .%2
%endmacro

%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
    %3       %4
    movq     mm1, [%1]
    por      mm1, [%1 + 8]
    paddusb  mm1, mm0
    pmovmskb %2, mm1
%endmacro

; row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro TEST_TWO_ROWS 8
    %5       %6
    %7       %8
    movq     mm1, [%1 + 0]
    por      mm1, [%1 + 8]
    movq     mm2, [%2 + 0]
    por      mm2, [%2 + 8]
    paddusb  mm1, mm0
    paddusb  mm2, mm0
    pmovmskb %3, mm1
    pmovmskb %4, mm2
%endmacro

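; Scalar model of the zero-row tests above (added for illustration, not
; part of the original source): the two quadwords of a row are ORed
; together, 127 is added to every byte with unsigned saturation, and the
; byte sign bits are collected; the mask is nonzero iff the row contains
; a nonzero coefficient.
;
;     #include <stdint.h>
;     static int test_one_row(const uint8_t row[16]) /* 8 int16_t coeffs */
;     {
;         int mask = 0;
;         for (int i = 0; i < 8; i++) {
;             unsigned v = (unsigned)(row[i] | row[i + 8]) + 127;
;             v = v > 255 ? 255 : v;             /* por + paddusb pb_127 */
;             mask |= (v >> 7) << i;             /* pmovmskb */
;         }
;         return mask;
;     }
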
; IDCT pass on rows.
%macro iMTX_MULT 4-5 ; src, table, put, arg, rounder
    movdqa     xmm3, [%1]
    movdqa     xmm0, xmm3
    pshufd     xmm1, xmm3, 0x11 ; 4602
    punpcklqdq xmm0, xmm0       ; 0246
    pmaddwd    xmm0, [%2]
    pmaddwd    xmm1, [%2+16]
    pshufd     xmm2, xmm3, 0xBB ; 5713
    punpckhqdq xmm3, xmm3       ; 1357
    pmaddwd    xmm2, [%2+32]
    pmaddwd    xmm3, [%2+48]
    paddd      xmm0, xmm1       ; even sums a0..a3
    paddd      xmm2, xmm3       ; odd sums  b0..b3
%if %0 == 5
    paddd      xmm0, [walkenIdctRounders+%5]
%endif
    movdqa     xmm3, xmm2
    paddd      xmm2, xmm0       ; a + b (+ rounder)
    psubd      xmm0, xmm3       ; a - b (+ rounder)
    psrad      xmm2, 11
    psrad      xmm0, 11
    packssdw   xmm2, xmm0       ; y3 y2 y1 y0 | y7 y6 y5 y4
    %3         %4
%endmacro
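
; In DCT_8_INV_ROW terms (see the pseudocode above), iMTX_MULT computes a
; whole row at once: xmm0 gathers the even sums a0..a3, xmm2 the odd sums
; b0..b3, and the final pack produces
;     y[i]     = (a[i] + b[i] + rnd) >> 11,  i = 0..3
;     y[7 - i] = (a[i] - b[i] + rnd) >> 11
; so SHIFT_ROUND(x) is effectively (x + rnd) >> 11 with rnd taken from
; walkenIdctRounders; PUT_ODD/PUT_EVEN then reverse the high quadword so
; the second half is stored as y4 y5 y6 y7.  (Summary added for clarity.)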

%macro iLLM_HEAD 0
    movdqa TAN3, [tan3]
    movdqa TAN1, [tan1]
%endmacro

%macro FIRST_HALF 2 ; %1=dct %2=type(0=normal, 1=put, 2=add)
    psraw  xmm5, 6       ; y6
    psraw  REG0, 6       ; y5
    psraw  TAN3, 6       ; y1
    psraw  xmm3, 6       ; y2
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa [%1+1*16], TAN3
    movdqa [%1+2*16], xmm3
    movdqa [%1+5*16], REG0
    movdqa [%1+6*16], xmm5
%else
    ; Must now load args as gprs are no longer used for masks
    ; DEST is set to where address of dest was loaded
    %if ARCH_X86_32
        %if %2 == 2 ; Not enough xmms, store
    movdqa [%1+1*16], TAN3
    movdqa [%1+2*16], xmm3
    movdqa [%1+5*16], REG0
    movdqa [%1+6*16], xmm5
        %endif
        %xdefine DEST r2q ; BLOCK is r0, stride r1
        movifnidn DEST, destm
        movifnidn strideq, stridem
    %else
        %xdefine DEST r0q
    %endif
    lea    r3q, [3*strideq]
    %if %2 == 1
    packuswb TAN3, xmm3  ; y1 | y2
    packuswb xmm5, REG0  ; y6 | y5
    movq   [DEST + strideq], TAN3
    movhps [DEST + 2*strideq], TAN3
    ; REG0 and TAN3 are now available (and likely used in second half)
    %endif
%endif
%endmacro

%macro SECOND_HALF 6 ; %1=dct %2=type(0=normal, 1=put, 2=add), %3-%6: xmms (y0, y7, y3, y4)
    psraw  %3, 6         ; y0
    psraw  %4, 6         ; y7
    psraw  %5, 6         ; y3
    psraw  %6, 6         ; y4
    ; dct coeffs must still be written for AC prediction
%if %2 == 0
    movdqa [%1+0*16], %3
    movdqa [%1+3*16], %5
    movdqa [%1+4*16], %6
    movdqa [%1+7*16], %4
%elif %2 == 1
    packuswb %3, %5      ; y0 | y3
    packuswb %6, %4      ; y4 | y7
    ; address of dest may have been loaded
    movq   [DEST], %3
    movhps [DEST + r3q], %3
    lea    DEST, [DEST + 4*strideq]
    movq   [DEST], %6
    movhps [DEST + r3q], %6
    ; and now write the remainder of the first half
    movq   [DEST + 2*strideq], xmm5
    movhps [DEST + strideq], xmm5
%elif %2 == 2
    pxor   xmm0, xmm0
    %if ARCH_X86_32
    ; free: m3 REG0=m4 m5
    ; input: m1, m7, m2, m6
    movq      xmm3, [DEST+0*strideq]
    movq      xmm4, [DEST+1*strideq]
    punpcklbw xmm3, xmm0
    punpcklbw xmm4, xmm0
    paddsw    xmm3, %3
    paddsw    xmm4, [%1 + 1*16]
    movq      %3, [DEST+2*strideq]
    movq      xmm5, [DEST+ r3q]
    punpcklbw %3, xmm0
    punpcklbw xmm5, xmm0
    paddsw    %3, [%1 + 2*16]
    paddsw    xmm5, %5
    packuswb  xmm3, xmm4
    packuswb  %3, xmm5
    movq      [DEST+0*strideq], xmm3
    movhps    [DEST+1*strideq], xmm3
    movq      [DEST+2*strideq], %3
    movhps    [DEST+ r3q], %3
    lea       DEST, [DEST+4*strideq]
    movq      xmm3, [DEST+0*strideq]
    movq      xmm4, [DEST+1*strideq]
    movq      %3, [DEST+2*strideq]
    movq      xmm5, [DEST+ r3q]
    punpcklbw xmm3, xmm0
    punpcklbw xmm4, xmm0
    punpcklbw %3, xmm0
    punpcklbw xmm5, xmm0
    paddsw    xmm3, %6
    paddsw    xmm4, [%1 + 5*16]
    paddsw    %3, [%1 + 6*16]
    paddsw    xmm5, %4
    packuswb  xmm3, xmm4
    packuswb  %3, xmm5
    movq      [DEST+0*strideq], xmm3
    movhps    [DEST+1*strideq], xmm3
    movq      [DEST+2*strideq], %3
    movhps    [DEST+ r3q], %3
    %else
    ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
    movq      xmm2, [DEST+0*strideq]
    movq      xmm4, [DEST+1*strideq]
    movq      xmm12, [DEST+2*strideq]
    movq      xmm11, [DEST+ r3q]
    punpcklbw xmm2, xmm0
    punpcklbw xmm4, xmm0
    punpcklbw xmm12, xmm0
    punpcklbw xmm11, xmm0
    paddsw    xmm2, %3
    paddsw    xmm4, TAN3
    paddsw    xmm12, xmm3
    paddsw    xmm11, %5
    packuswb  xmm2, xmm4
    packuswb  xmm12, xmm11
    movq      [DEST+0*strideq], xmm2
    movhps    [DEST+1*strideq], xmm2
    movq      [DEST+2*strideq], xmm12
    movhps    [DEST+ r3q], xmm12
    lea       DEST, [DEST+4*strideq]
    movq      xmm2, [DEST+0*strideq]
    movq      xmm4, [DEST+1*strideq]
    movq      xmm12, [DEST+2*strideq]
    movq      xmm11, [DEST+ r3q]
    punpcklbw xmm2, xmm0
    punpcklbw xmm4, xmm0
    punpcklbw xmm12, xmm0
    punpcklbw xmm11, xmm0
    paddsw    xmm2, %6
    paddsw    xmm4, REG0
    paddsw    xmm12, xmm5
    paddsw    xmm11, %4
    packuswb  xmm2, xmm4
    packuswb  xmm12, xmm11
    movq      [DEST+0*strideq], xmm2
    movhps    [DEST+1*strideq], xmm2
    movq      [DEST+2*strideq], xmm12
    movhps    [DEST+ r3q], xmm12
    %endif
%endif
%endmacro

; IDCT pass on columns.
%macro iLLM_PASS 2 ; %1=dct %2=type(0=normal, 1=put, 2=add)
    movdqa xmm1, TAN3
    movdqa xmm3, TAN1
    pmulhw TAN3, xmm4    ; x3*(tg_3_16-1)
    pmulhw xmm1, xmm5    ; x5*(tg_3_16-1)
    paddsw TAN3, xmm4    ; x3*tg_3_16
    paddsw xmm1, xmm5    ; x5*tg_3_16
    psubsw TAN3, xmm5    ; x3*tg_3_16-x5 = tm35
    paddsw xmm1, xmm4    ; x3+x5*tg_3_16 = tp35
    pmulhw xmm3, xmm7    ; x7*tg_1_16
    pmulhw TAN1, xmm6    ; x1*tg_1_16
    paddsw xmm3, xmm6    ; x1+x7*tg_1_16 = tp17
    psubsw TAN1, xmm7    ; x1*tg_1_16-x7 = tm17
    movdqa xmm7, xmm3    ; tp17
    movdqa xmm6, TAN1    ; tm17
    psubsw xmm3, xmm1    ; tp17-tp35 = t1
    psubsw TAN1, TAN3    ; tm17-tm35 = b3
    paddsw xmm1, xmm7    ; tp17+tp35 = b0
    paddsw TAN3, xmm6    ; tm17+tm35 = t2
    movdqa xmm6, xmm3    ; t1
    psubsw xmm3, TAN3    ; t1-t2
    paddsw TAN3, xmm6    ; t1+t2
    movdqa xmm4, [sqrt2]
    pmulhw xmm3, xmm4    ; ocos_4_16*(t1-t2) = b2/2
    pmulhw TAN3, xmm4    ; ocos_4_16*(t1+t2) = b1/2
    paddsw TAN3, TAN3    ; b1
    paddsw xmm3, xmm3    ; b2
    movdqa xmm7, [tan2]
    MOV32  ROW2, REG2
    MOV32  ROW6, REG6
    movdqa xmm5, xmm7
    pmulhw xmm7, REG6    ; x6*tg_2_16
    pmulhw xmm5, REG2    ; x2*tg_2_16
    paddsw xmm7, REG2    ; x2+x6*tg_2_16 = tp26
    psubsw xmm5, REG6    ; x2*tg_2_16-x6 = tm26
    MOV32  ROW0, REG0
    MOV32  ROW4, REG4
    MOV32  TAN1, [BLOCK] ; x86_32: spill b3 (TAN1 aliases XMMS)
    movdqa XMMS, REG0
    psubsw REG0, REG4    ; x0-x4 = tm04
    paddsw REG4, XMMS    ; x0+x4 = tp04
    movdqa XMMS, REG4
    psubsw REG4, xmm7    ; tp04-tp26 = a3
    paddsw xmm7, XMMS    ; tp04+tp26 = a0
    movdqa XMMS, REG0
    psubsw REG0, xmm5    ; tm04-tm26 = a2
    paddsw xmm5, XMMS    ; tm04+tm26 = a1
    movdqa XMMS, xmm5
    psubsw xmm5, TAN3    ; a1-b1
    paddsw TAN3, XMMS    ; a1+b1
    movdqa XMMS, REG0
    psubsw REG0, xmm3    ; a2-b2
    paddsw xmm3, XMMS    ; a2+b2
    MOV32  [BLOCK], TAN1 ; x86_32: reload b3

    FIRST_HALF %1, %2

    movdqa xmm0, xmm7    ; a0
    movdqa xmm4, REG4    ; a3
    psubsw xmm7, xmm1    ; a0-b0
    psubsw REG4, TAN1    ; a3-b3
    paddsw xmm1, xmm0    ; a0+b0
    paddsw TAN1, xmm4    ; a3+b3

    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro

; IDCT pass on columns, assuming rows 4-7 are zero
%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(0=normal, 1=put, 2=add)
    pmulhw TAN3, xmm4    ; x3*(tg_3_16-1)
    paddsw TAN3, xmm4    ; x3*tg_3_16 = tm35 (x5 = 0)
    movdqa xmm3, xmm6    ; x1 = tp17
    pmulhw TAN1, xmm6    ; x1*tg_1_16 = tm17 (x7 = 0)
    movdqa xmm1, xmm4    ; x3 = tp35
    psubsw xmm3, xmm1    ; tp17-tp35 = t1
    paddsw xmm1, xmm6    ; tp17+tp35 = b0
    movdqa xmm6, TAN1    ; tm17
    psubsw TAN1, TAN3    ; tm17-tm35 = b3
    paddsw TAN3, xmm6    ; tm17+tm35 = t2
    movdqa xmm6, xmm3    ; t1
    psubsw xmm3, TAN3    ; t1-t2
    paddsw TAN3, xmm6    ; t1+t2
    movdqa xmm4, [sqrt2]
    pmulhw xmm3, xmm4    ; ocos_4_16*(t1-t2) = b2/2
    pmulhw TAN3, xmm4    ; ocos_4_16*(t1+t2) = b1/2
    paddsw TAN3, TAN3    ; b1
    paddsw xmm3, xmm3    ; b2
    movdqa xmm5, [tan2]
    MOV32  ROW2, SREG2
    pmulhw xmm5, SREG2   ; x2*tg_2_16 = tm26 (x6 = 0)
    MOV32  ROW0, REG0
    movdqa xmm6, REG0    ; x0 = tp04 = tm04 (x4 = 0)
    psubsw xmm6, SREG2   ; x0-x2 = a3 (tp26 = x2)
    paddsw SREG2, REG0   ; x0+x2 = a0
    MOV32  TAN1, [BLOCK] ; x86_32: spill b3 (TAN1 aliases XMMS)
    movdqa XMMS, REG0
    psubsw REG0, xmm5    ; tm04-tm26 = a2
    paddsw xmm5, XMMS    ; tm04+tm26 = a1
    movdqa XMMS, xmm5
    psubsw xmm5, TAN3    ; a1-b1
    paddsw TAN3, XMMS    ; a1+b1
    movdqa XMMS, REG0
    psubsw REG0, xmm3    ; a2-b2
    paddsw xmm3, XMMS    ; a2+b2
    MOV32  [BLOCK], TAN1 ; x86_32: reload b3

    FIRST_HALF %1, %2

    movdqa xmm0, SREG2   ; a0
    movdqa xmm4, xmm6    ; a3
    psubsw SREG2, xmm1   ; a0-b0
    psubsw xmm6, TAN1    ; a3-b3
    paddsw xmm1, xmm0    ; a0+b0
    paddsw TAN1, xmm4    ; a3+b3

    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro

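; Scalar sketch of the column butterfly implemented by iLLM_PASS above
; (and, in MMX form, by DCT_8_INV_COL further down), assembled from the
; register comments and added for illustration only.  MULH(a, c) models
; pmulhw, i.e. ((int32_t)a * c) >> 16 with c = round(konst * 65536):
;
;     tp35 = x3 + MULH(x5, tan3);   tm35 = MULH(x3, tan3) - x5;
;     tp17 = x1 + MULH(x7, tan1);   tm17 = MULH(x1, tan1) - x7;
;     b0 = tp17 + tp35;             b3 = tm17 - tm35;
;     t1 = tp17 - tp35;             t2 = tm17 + tm35;
;     b1 = 2 * MULH(t1 + t2, sqrt2);     /* (t1+t2) * cos_4_16 */
;     b2 = 2 * MULH(t1 - t2, sqrt2);     /* (t1-t2) * cos_4_16 */
;     tp26 = x2 + MULH(x6, tan2);   tm26 = MULH(x2, tan2) - x6;
;     tp04 = x0 + x4;               tm04 = x0 - x4;
;     a0 = tp04 + tp26;             a3 = tp04 - tp26;
;     a1 = tm04 + tm26;             a2 = tm04 - tm26;
;     y0 = (a0 + b0) >> 6;          y7 = (a0 - b0) >> 6;
;     y1 = (a1 + b1) >> 6;          y6 = (a1 - b1) >> 6;
;     y2 = (a2 + b2) >> 6;          y5 = (a2 - b2) >> 6;
;     y3 = (a3 + b3) >> 6;          y4 = (a3 - b3) >> 6;
;
; iLLM_PASS_SPARSE relies on x4 = x5 = x6 = x7 = 0, which collapses the
; rotations to tp35 = x3, tm35 = MULH(x3, tan3), tp17 = x1,
; tm17 = MULH(x1, tan1), tp26 = x2, tm26 = MULH(x2, tan2) and
; tp04 = tm04 = x0.
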
%macro IDCT_SSE2 1 ; %1=type: 0=normal 1=put 2=add
%if %1 == 0 || ARCH_X86_32
    %define GPR0 r1d
    %define GPR1 r2d
    %define GPR2 r3d
    %define GPR3 r4d
    %define NUM_GPRS 5
%else
    %define GPR0 r3d
    %define GPR1 r4d
    %define GPR2 r5d
    %define GPR3 r6d
    %define NUM_GPRS 7
%endif
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
    %if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
    %endif
    %if ARCH_X86_64
    %xdefine BLOCK blockq
    %else
    mov  r0q, blockm
    %xdefine BLOCK r0q
    %endif
%endif
    movq mm0, [pb_127]
    iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
    iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD,  ROW1, 1*16
    iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16

    TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; GPR0=row 3, GPR1=row 4
    JZ   GPR0, col1
    iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1:
    TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; GPR0=row 5, GPR2=row 6
    TEST_ONE_ROW  BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7                                       ; GPR3=row 7

    iLLM_HEAD
    JNZ  GPR1, 2 ; row 4 nonzero
    JNZ  GPR0, 3 ; row 5 nonzero
    JNZ  GPR2, 4 ; row 6 nonzero
    JNZ  GPR3, 5 ; row 7 nonzero
    iLLM_PASS_SPARSE BLOCK, %1 ; rows 4-7 are all zero
    jmp .6
.2:
    iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
    iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
    JZ   GPR2, col2
.4:
    iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
    JZ   GPR3, col3
.5:
    iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3:
%if ARCH_X86_32
    iLLM_HEAD
%endif
    iLLM_PASS BLOCK, %1
.6:
    RET
%endmacro

INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
IDCT_SSE2 2

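; Overall flow of the functions generated above (summary added for
; clarity): rows 0-2 are always transformed; row 3 only if its test
; finds a nonzero coefficient; if rows 4-7 all test zero, the cheaper
; iLLM_PASS_SPARSE column pass runs.  Otherwise the JNZ chain enters the
; row-transform sequence at the first nonzero row; a zero row reached by
; fall-through is still transformed, which is harmless since its small
; rounder vanishes in the >> 11.
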
%if ARCH_X86_32

; %1 = row offset, %2 = row table, %3 = offset into walkenIdctRounders
; (rows 4-7 use the rounders at 6*16, 4*16, 5*16 and 5*16 respectively)
%macro DCT_8_INV_ROW 3
    movq       mm0, [r0+16*%1+0]  ; 0 ; x3 x2 x1 x0
    movq       mm1, [r0+16*%1+8]  ; 1 ; x7 x6 x5 x4
    movq       mm2, mm0           ; 2 ; x3 x2 x1 x0
    movq       mm3, [%2+ 0]       ; 3 ; w06 w04 w02 w00
%if cpuflag(mmxext)
    pshufw     mm0, mm0, 0x88     ; x2 x0 x2 x0
    movq       mm4, [%2+ 8]       ; 4 ; w07 w06 w03 w02
    movq       mm5, mm1           ; 5 ; x7 x6 x5 x4
    pmaddwd    mm3, mm0           ; x2*w05+x0*w04 x2*w01+x0*w00
    movq       mm6, [%2+32]       ; 6 ; w21 w20 w17 w16
    pshufw     mm1, mm1, 0x88     ; x6 x4 x6 x4
    pmaddwd    mm4, mm1           ; x6*w07+x4*w06 x6*w03+x4*w02
    movq       mm7, [%2+40]       ; 7 ; w23 w22 w19 w18
    pshufw     mm2, mm2, 0xdd     ; x3 x1 x3 x1
    pmaddwd    mm6, mm2           ; x3*w21+x1*w20 x3*w17+x1*w16
    pshufw     mm5, mm5, 0xdd     ; x7 x5 x7 x5
    pmaddwd    mm7, mm5           ; x7*w23+x5*w22 x7*w19+x5*w18
    paddd      mm3, [walkenIdctRounders + %3] ; + rounder
    pmaddwd    mm0, [%2+16]       ; x2*w13+x0*w12 x2*w09+x0*w08
    paddd      mm3, mm4           ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd    mm1, [%2+24]       ; x6*w15+x4*w14 x6*w11+x4*w10
    movq       mm4, mm3           ; 4 ; a1 a0
    pmaddwd    mm2, [%2+48]       ; x3*w29+x1*w28 x3*w25+x1*w24
    paddd      mm6, mm7           ; 7 ; b1=sum(odd1) b0=sum(odd0)
    pmaddwd    mm5, [%2+56]       ; x7*w31+x5*w30 x7*w27+x5*w26
    paddd      mm3, mm6           ; a1+b1 a0+b0
    paddd      mm0, [walkenIdctRounders + %3] ; + rounder
    psrad      mm3, 11            ; y1=a1+b1 y0=a0+b0
    paddd      mm0, mm1           ; 1 ; a3=sum(even3) a2=sum(even2)
    psubd      mm4, mm6           ; 6 ; a1-b1 a0-b0
    movq       mm7, mm0           ; 7 ; a3 a2
    paddd      mm2, mm5           ; 5 ; b3=sum(odd3) b2=sum(odd2)
    paddd      mm0, mm2           ; a3+b3 a2+b2
    psrad      mm4, 11            ; y6=a1-b1 y7=a0-b0
    psubd      mm7, mm2           ; 2 ; a3-b3 a2-b2
    psrad      mm0, 11            ; y3=a3+b3 y2=a2+b2
    psrad      mm7, 11            ; y4=a3-b3 y5=a2-b2
    packssdw   mm3, mm0           ; 0 ; y3 y2 y1 y0
    packssdw   mm7, mm4           ; 4 ; y6 y7 y4 y5
    movq       [r0+16*%1+0], mm3  ; 3 ; save y3 y2 y1 y0
    pshufw     mm7, mm7, 0xb1     ; y7 y6 y5 y4
%else
    punpcklwd  mm0, mm1           ; x5 x1 x4 x0
    movq       mm5, mm0           ; 5 ; x5 x1 x4 x0
    punpckldq  mm0, mm0           ; x4 x0 x4 x0
    movq       mm4, [%2+ 8]       ; 4 ; w07 w05 w03 w01
    punpckhwd  mm2, mm1           ; 1 ; x7 x3 x6 x2
    pmaddwd    mm3, mm0           ; x4*w06+x0*w04 x4*w02+x0*w00
    movq       mm6, mm2           ; 6 ; x7 x3 x6 x2
    movq       mm1, [%2+32]       ; 1 ; w22 w20 w18 w16
    punpckldq  mm2, mm2           ; x6 x2 x6 x2
    pmaddwd    mm4, mm2           ; x6*w07+x2*w05 x6*w03+x2*w01
    punpckhdq  mm5, mm5           ; x5 x1 x5 x1
    pmaddwd    mm0, [%2+16]       ; x4*w14+x0*w12 x4*w10+x0*w08
    punpckhdq  mm6, mm6           ; x7 x3 x7 x3
    movq       mm7, [%2+40]       ; 7 ; w23 w21 w19 w17
    pmaddwd    mm1, mm5           ; x5*w22+x1*w20 x5*w18+x1*w16
    paddd      mm3, [walkenIdctRounders + %3] ; + rounder
    pmaddwd    mm7, mm6           ; x7*w23+x3*w21 x7*w19+x3*w17
    pmaddwd    mm2, [%2+24]       ; x6*w15+x2*w13 x6*w11+x2*w09
    paddd      mm3, mm4           ; 4 ; a1=sum(even1) a0=sum(even0)
    pmaddwd    mm5, [%2+48]       ; x5*w30+x1*w28 x5*w26+x1*w24
    movq       mm4, mm3           ; 4 ; a1 a0
    pmaddwd    mm6, [%2+56]       ; x7*w31+x3*w29 x7*w27+x3*w25
    paddd      mm1, mm7           ; 7 ; b1=sum(odd1) b0=sum(odd0)
    paddd      mm0, [walkenIdctRounders + %3] ; + rounder
    psubd      mm3, mm1           ; a1-b1 a0-b0
    psrad      mm3, 11            ; y6=a1-b1 y7=a0-b0
    paddd      mm1, mm4           ; 4 ; a1+b1 a0+b0
    paddd      mm0, mm2           ; 2 ; a3=sum(even3) a2=sum(even2)
    psrad      mm1, 11            ; y1=a1+b1 y0=a0+b0
    paddd      mm5, mm6           ; 6 ; b3=sum(odd3) b2=sum(odd2)
    movq       mm4, mm0           ; 4 ; a3 a2
    paddd      mm0, mm5           ; a3+b3 a2+b2
    psubd      mm4, mm5           ; 5 ; a3-b3 a2-b2
    psrad      mm0, 11            ; y3=a3+b3 y2=a2+b2
    psrad      mm4, 11            ; y4=a3-b3 y5=a2-b2
    packssdw   mm1, mm0           ; 0 ; y3 y2 y1 y0
    packssdw   mm4, mm3           ; 3 ; y6 y7 y4 y5
    movq       mm7, mm4           ; 7 ; y6 y7 y4 y5
    psrld      mm4, 16            ; 0 y6 0 y4
    pslld      mm7, 16            ; y7 0 y5 0
    movq       [r0+16*%1+0], mm1  ; 1 ; save y3 y2 y1 y0
    por        mm7, mm4           ; 4 ; y7 y6 y5 y4
%endif
    movq       [r0+16*%1+8], mm7  ; 7 ; save y7 y6 y5 y4
%endmacro

; -----------------------------------------------------------------------------
;
; The first stage DCT 8x8 - forward DCTs of columns
;
; The outputs are multiplied
;     for rows 0,4 - by cos_4_16,
;     for rows 1,7 - by cos_1_16,
;     for rows 2,6 - by cos_2_16,
;     for rows 3,5 - by cos_3_16
; and are shifted to the left for better accuracy.
;
; -----------------------------------------------------------------------------
;
; The 8-point scaled forward DCT algorithm (26a8m)
;
; -----------------------------------------------------------------------------
;
; #define DCT_8_FRW_COL(x, y)
; {
;     short t0, t1, t2, t3, t4, t5, t6, t7;
;     short tp03, tm03, tp12, tm12, tp65, tm65;
;     short tp465, tm465, tp765, tm765;
;
;     t0 = LEFT_SHIFT(x[0] + x[7]);
;     t1 = LEFT_SHIFT(x[1] + x[6]);
;     t2 = LEFT_SHIFT(x[2] + x[5]);
;     t3 = LEFT_SHIFT(x[3] + x[4]);
;     t4 = LEFT_SHIFT(x[3] - x[4]);
;     t5 = LEFT_SHIFT(x[2] - x[5]);
;     t6 = LEFT_SHIFT(x[1] - x[6]);
;     t7 = LEFT_SHIFT(x[0] - x[7]);
;
;     tp03 = t0 + t3;
;     tm03 = t0 - t3;
;     tp12 = t1 + t2;
;     tm12 = t1 - t2;
;
;     y[0] = tp03 + tp12;
;     y[4] = tp03 - tp12;
;
;     y[2] = tm03 + tm12 * tg_2_16;
;     y[6] = tm03 * tg_2_16 - tm12;
;
;     tp65 = (t6 + t5) * cos_4_16;
;     tm65 = (t6 - t5) * cos_4_16;
;
;     tp765 = t7 + tp65;
;     tm765 = t7 - tp65;
;     tp465 = t4 + tm65;
;     tm465 = t4 - tm65;
;
;     y[1] = tp765 + tp465 * tg_1_16;
;     y[7] = tp765 * tg_1_16 - tp465;
;     y[5] = tm765 * tg_3_16 + tm465;
;     y[3] = tm765 - tm465 * tg_3_16;
; }
;
; -----------------------------------------------------------------------------

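; Note (added for clarity): the pseudocode above documents the *forward*
; scaled DCT kept from the AP-922 material; DCT_8_INV_COL below is the
; matching inverse column transform, and its register comments follow
; the same tp/tm butterfly naming convention.
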
; -----------------------------------------------------------------------------
; DCT_8_INV_COL %1=src/dst - inverse DCT of 4 columns, in place
; -----------------------------------------------------------------------------
%macro DCT_8_INV_COL 1
    movq   mm0, [tan3]
    movq   mm3, [%1+16*3]
    movq   mm1, mm0           ; tg_3_16
    movq   mm5, [%1+16*5]
    pmulhw mm0, mm3           ; x3*(tg_3_16-1)
    movq   mm4, [tan1]
    pmulhw mm1, mm5           ; x5*(tg_3_16-1)
    movq   mm7, [%1+16*7]
    movq   mm2, mm4           ; tg_1_16
    movq   mm6, [%1+16*1]
    pmulhw mm4, mm7           ; x7*tg_1_16
    paddsw mm0, mm3           ; x3*tg_3_16
    pmulhw mm2, mm6           ; x1*tg_1_16
    paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
    psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
    movq   mm3, [sqrt2]
    paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
    paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
    psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
    movq   mm5, mm4           ; tp17
    movq   mm6, mm2           ; tm17
    paddsw mm5, mm1           ; tp17+tp35 = b0
    psubsw mm6, mm0           ; tm17-tm35 = b3
    psubsw mm4, mm1           ; tp17-tp35 = t1
    paddsw mm2, mm0           ; tm17+tm35 = t2
    movq   mm7, [tan2]
    movq   mm1, mm4           ; t1
    movq   [%1+3*16], mm5     ; save b0
    paddsw mm1, mm2           ; t1+t2
    movq   [%1+5*16], mm6     ; save b3
    psubsw mm4, mm2           ; t1-t2
    movq   mm5, [%1+2*16]
    movq   mm0, mm7           ; tg_2_16
    movq   mm6, [%1+6*16]
    pmulhw mm0, mm5           ; x2*tg_2_16
    pmulhw mm7, mm6           ; x6*tg_2_16
    pmulhw mm1, mm3           ; ocos_4_16*(t1+t2) = b1/2
    movq   mm2, [%1+0*16]
    pmulhw mm4, mm3           ; ocos_4_16*(t1-t2) = b2/2
    psubsw mm0, mm6           ; x2*tg_2_16-x6 = tm26
    movq   mm3, mm2           ; x0
    movq   mm6, [%1+4*16]
    paddsw mm7, mm5           ; x2+x6*tg_2_16 = tp26
    paddsw mm2, mm6           ; x0+x4 = tp04
    psubsw mm3, mm6           ; x0-x4 = tm04
    movq   mm5, mm2           ; tp04
    movq   mm6, mm3           ; tm04
    psubsw mm2, mm7           ; tp04-tp26 = a3
    paddsw mm3, mm0           ; tm04+tm26 = a1
    paddsw mm1, mm1           ; b1
    paddsw mm4, mm4           ; b2
    paddsw mm5, mm7           ; tp04+tp26 = a0
    psubsw mm6, mm0           ; tm04-tm26 = a2
    movq   mm7, mm3           ; a1
    movq   mm0, mm6           ; a2
    paddsw mm3, mm1           ; a1+b1
    paddsw mm6, mm4           ; a2+b2
    psraw  mm3, 6             ; dst1
    psubsw mm7, mm1           ; a1-b1
    psraw  mm6, 6             ; dst2
    psubsw mm0, mm4           ; a2-b2
    movq   mm1, [%1+3*16]     ; load b0
    psraw  mm7, 6             ; dst6
    movq   mm4, mm5           ; a0
    psraw  mm0, 6             ; dst5
    movq   [%1+1*16], mm3
    paddsw mm5, mm1           ; a0+b0
    movq   [%1+2*16], mm6
    psubsw mm4, mm1           ; a0-b0
    movq   mm3, [%1+5*16]     ; load b3
    psraw  mm5, 6             ; dst0
    movq   mm6, mm2           ; a3
    psraw  mm4, 6             ; dst7
    movq   [%1+5*16], mm0
    paddsw mm2, mm3           ; a3+b3
    movq   [%1+6*16], mm7
    psubsw mm6, mm3           ; a3-b3
    movq   [%1+0*16], mm5
    psraw  mm2, 6             ; dst3
    movq   [%1+7*16], mm4
    psraw  mm6, 6             ; dst4
    movq   [%1+3*16], mm2
    movq   [%1+4*16], mm6
%endmacro

%macro XVID_IDCT_MMX 0
cglobal xvid_idct, 1, 1, 0, block
%if cpuflag(mmxext)
%define TAB tab_i_04_xmm
%else
%define TAB tab_i_04_mmx
%endif
    ; Process each row - beware of the rounder offsets
    DCT_8_INV_ROW 0, TAB + 64 * 0, 0*16
    DCT_8_INV_ROW 1, TAB + 64 * 1, 1*16
    DCT_8_INV_ROW 2, TAB + 64 * 2, 2*16
    DCT_8_INV_ROW 3, TAB + 64 * 3, 3*16
    DCT_8_INV_ROW 4, TAB + 64 * 0, 6*16
    DCT_8_INV_ROW 5, TAB + 64 * 3, 4*16
    DCT_8_INV_ROW 6, TAB + 64 * 2, 5*16
    DCT_8_INV_ROW 7, TAB + 64 * 1, 5*16

    ; Process the columns (4 at a time)
    DCT_8_INV_COL r0+0
    DCT_8_INV_COL r0+8

    RET
%endmacro

INIT_MMX mmx
XVID_IDCT_MMX
INIT_MMX mmxext
XVID_IDCT_MMX

%endif ; ~ARCH_X86_32
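
; Linkage note (an assumption about x86inc's cglobal name mangling, not
; part of the original file): the cglobal lines above should be emitted
; with FFmpeg's ff_ prefix plus the INIT_* suffix, i.e. ff_xvid_idct_mmx,
; ff_xvid_idct_mmxext, ff_xvid_idct_sse2, ff_xvid_idct_put_sse2 and
; ff_xvid_idct_add_sse2, with C prototypes along the lines of
;     void ff_xvid_idct_sse2(int16_t *block);
;     void ff_xvid_idct_put_sse2(uint8_t *dest, ptrdiff_t stride,
;                                int16_t *block);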