;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
30;* 31;* 32;* dct.asm 33;* 34;* History 35;* 8/4/2009 Created 36;* 37;* 38;*************************************************************************/ 39 40%include "asm_inc.asm" 41 42%macro LOAD_3_PARA_TO_5_PARA_IDCT 0 43%ifdef X86_32 44 push r3 45 push r4 46 %assign push_num push_num+2 47 mov r0, [esp + push_num*4 + 4] 48 mov r1, [esp + push_num*4 + 8] 49 mov r4, [esp + push_num*4 + 12] 50%else 51 mov r4, r2 52%endif 53 mov r2, r0 54 mov r3, r1 55%endmacro 56 57%ifdef PREFIX 58 %define prefixed(a) _ %+ a 59%else 60 %define prefixed(a) a 61%endif 62 63%ifdef X86_32_PICASM 64SECTION .text align=32 65%else 66SECTION .rodata align=32 67%endif 68 69;*********************************************************************** 70; Constant 71;*********************************************************************** 72 73align 32 74wels_shufb0312_movzxw_128: 75 db 0, 80h, 3, 80h, 1, 80h, 2, 80h, 4, 80h, 7, 80h, 5, 80h, 6, 80h 76wels_shufb2301_128: 77 db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 78wels_shufb0231_128: 79 db 0, 2, 3, 1, 4, 6, 7, 5, 8, 10, 11, 9, 12, 14, 15, 13 80wels_dw32_128: 81 times 8 dw 32 82wels_p1m1p1m1w_256: 83 times 8 dw 1, -1 84wels_p1p2m1m2w_256: 85 times 4 dw 1, 2, -1, -2 86wels_p1p1m1m1w_256: 87 times 4 dw 1, 1, -1, -1 88wels_8xp1w_8xm1w: 89 times 8 dw 1 90 times 8 dw -1 91wels_4xp1w_4xm1w_256: 92 times 4 dw 1 93 times 4 dw -1 94 times 4 dw 1 95 times 4 dw -1 96wels_4xp1w_4xp2w_4xm1w_4xm2w: 97 times 4 dw 1 98 times 4 dw 2 99 times 4 dw -1 100 times 4 dw -2 101 102align 16 103wels_p1m1p1m1w_128: 104 times 4 dw 1, -1 105wels_p1p2p1p2w_128: 106 times 4 dw 1, 2 107wels_p1m1m1p1w_128: 108 times 2 dw 1, -1, -1, 1 109wels_p0m8000p0m8000w_128: 110 times 4 dw 0, -8000h 111wels_p1p1m1m1w_128: 112 times 2 dw 1, 1, -1, -1 113wels_4xp1w_4xp2w: 114 times 4 dw 1 115 times 4 dw 2 116wels_4xp0w_4xm8000w: 117 times 4 dw 0 118 times 4 dw -8000h 119 120SECTION .text 121 122;*********************************************************************** 123; MMX 
functions 124;*********************************************************************** 125 126%macro MMX_LoadDiff4P 5 127 movd %1, [%3] 128 movd %2, [%4] 129 punpcklbw %1, %5 130 punpcklbw %2, %5 131 psubw %1, %2 132%endmacro 133 134%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm) 135 MMX_LoadDiff4P %1, %9, %5, %7, %10 136 MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10 137 lea %5, [%5+2*%6] 138 lea %7, [%7+2*%8] 139 MMX_LoadDiff4P %3, %9, %5, %7, %10 140 MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10 141%endmacro 142 143%macro MMX_SumSubMul2 3 144 movq %3, %1 145 psllw %1, $01 146 paddw %1, %2 147 psllw %2, $01 148 psubw %3, %2 149%endmacro 150 151%macro MMX_SumSubDiv2 3 152 movq %3, %2 153 psraw %3, $01 154 paddw %3, %1 155 psraw %1, $01 156 psubw %1, %2 157%endmacro 158 159%macro MMX_SumSub 3 160 movq %3, %2 161 psubw %2, %1 162 paddw %1, %3 163%endmacro 164 165%macro MMX_DCT 6 166 MMX_SumSub %4, %1, %6 167 MMX_SumSub %3, %2, %6 168 MMX_SumSub %3, %4, %6 169 MMX_SumSubMul2 %1, %2, %5 170%endmacro 171 172%macro MMX_IDCT 6 173 MMX_SumSub %4, %5, %6 174 MMX_SumSubDiv2 %3, %2, %1 175 MMX_SumSub %1, %4, %6 176 MMX_SumSub %3, %5, %6 177%endmacro 178 179%macro MMX_StoreDiff4P 6 180 movd %2, %6 181 punpcklbw %2, %4 182 paddw %1, %3 183 psraw %1, $06 184 paddsw %1, %2 185 packuswb %1, %2 186 movd %5, %1 187%endmacro 188 189;*********************************************************************** 190; void WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 ) 191;*********************************************************************** 192WELS_EXTERN WelsDctT4_mmx 193 %assign push_num 0 194 LOAD_5_PARA 195 SIGN_EXTENSION r2, r2d 196 SIGN_EXTENSION r4, r4d 197 WELS_Zero mm7 198 199 MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7 200 201 MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6 202 MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2 203 204 MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6 205 MMX_Trans4x4W 
mm2, mm3, mm4, mm1, mm5 206 207 movq [r0+ 0], mm2 208 movq [r0+ 8], mm1 209 movq [r0+16], mm5 210 movq [r0+24], mm4 211 WELSEMMS 212 LOAD_5_PARA_POP 213 ret 214 215;*********************************************************************** 216; void IdctResAddPred_mmx(uint8_t* pPred, int32_t iStride, int16_t* pDct); 217;*********************************************************************** 218WELS_EXTERN IdctResAddPred_mmx 219 %assign push_num 0 220 LOAD_3_PARA_TO_5_PARA_IDCT 221 jmp prefixed(WelsIDctT4Rec_mmx.begin) 222 223;*********************************************************************** 224; void WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs) 225;*********************************************************************** 226WELS_EXTERN WelsIDctT4Rec_mmx 227 %assign push_num 0 228 LOAD_5_PARA 229.begin: 230 SIGN_EXTENSION r1, r1d 231 SIGN_EXTENSION r3, r3d 232 movq mm0, [r4+ 0] 233 movq mm1, [r4+ 8] 234 movq mm2, [r4+16] 235 movq mm3, [r4+24] 236 237 MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4 238 MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6 239 MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2 240 MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6 241 242 WELS_Zero mm7 243 WELS_DW32 mm6 244 245 MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2] 246 MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3] 247 lea r0, [r0+2*r1] 248 lea r2, [r2+2*r3] 249 MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2] 250 MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3] 251 252 WELSEMMS 253 LOAD_5_PARA_POP 254 ret 255 256 257;*********************************************************************** 258; SSE2 functions 259;*********************************************************************** 260 261%macro SSE2_Store4x8p 6 262 movlps [%1+0x00], %2 263 movhps [%1+0x20], %2 264 movlps [%1+0x08], %3 265 movhps [%1+0x28], %3 266 movlps [%1+0x10], %4 267 movhps [%1+0x30], %4 268 movlps [%1+0x18], %5 269 movhps [%1+0x38], %5 270%endmacro 271 272%macro SSE2_Load4x8p 6 273 MOVDQ %2, 
[%1+0x00] 274 MOVDQ %4, [%1+0x10] 275 MOVDQ %6, [%1+0x20] 276 MOVDQ %3, [%1+0x30] 277 SSE2_XSawp qdq, %4, %3, %5 278 SSE2_XSawp qdq, %2, %6, %3 279%endmacro 280 281%macro SSE2_SumSubMul2 3 282 movdqa %3, %1 283 psllw %1, 1 284 paddw %1, %2 285 psllw %2, 1 286 psubw %3, %2 287%endmacro 288 289%macro SSE2_SumSubDiv2 4 290 movdqa %4, %1 291 movdqa %3, %2 292 psraw %2, $01 293 psraw %4, $01 294 paddw %1, %2 295 psubw %4, %3 296%endmacro 297 298%macro SSE2_StoreDiff16p 9 299 paddw %1, %4 300 psraw %1, $06 301 movq %3, %7 302 punpcklbw %3, %5 303 paddsw %1, %3 304 paddw %2, %4 305 psraw %2, $06 306 movq %3, %9 307 punpcklbw %3, %5 308 paddsw %2, %3 309 packuswb %1, %2 310 movlps %6, %1 311 movhps %8, %1 312%endmacro 313 314%macro SSE2_StoreDiff8p 5 315 movq %2, %5 316 punpcklbw %2, %3 317 paddsw %2, %1 318 packuswb %2, %2 319 movq %4, %2 320%endmacro 321 322%macro SSE2_Load2x4P 2 323 MOVDQ %1, [%2] 324%endmacro 325 326%macro SSE2_Store2x4P 2 327 MOVDQ [%1], %2 328%endmacro 329 330; out=%1 pPixel1Line1=%2 pPixel1Line2=%3 pPixel2Line1=%4 pPixel2Line2=%5 zero=%6 clobber=%7,%8 331%macro SSE2_LoadDiff2x4P 8 332 movd %1, [%2] 333 movd %7, [%3] 334 punpckldq %1, %7 335 punpcklbw %1, %6 336 movd %7, [%4] 337 movd %8, [%5] 338 punpckldq %7, %8 339 punpcklbw %7, %6 340 psubw %1, %7 341%endmacro 342 343; pRec1=%1 pRec2=%2 data=%3 pPred1=%4 pPred2=%5 dw32=%6 zero=%7 clobber=%8,%9 344%macro SSE2_StoreDiff2x4P 9 345 paddw %3, %6 346 psraw %3, 6 347 movd %8, [%4] 348 movd %9, [%5] 349 punpckldq %8, %9 350 punpcklbw %8, %7 351 paddsw %3, %8 352 packuswb %3, %3 353 movd [%1], %3 354 psrlq %3, 32 355 movd [%2], %3 356%endmacro 357 358%macro SSE2_Load8DC 6 359 movdqa %1, %6 ; %1 = dc0 dc1 360 paddw %1, %5 361 psraw %1, $06 ; (dc + 32) >> 6 362 363 movdqa %2, %1 364 psrldq %2, 4 365 punpcklwd %2, %2 366 punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 367 368 movdqa %3, %1 369 psrldq %3, 8 370 punpcklwd %3, %3 371 punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5 372 373 movdqa 
%4, %1 374 psrldq %4, 12 375 punpcklwd %4, %4 376 punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7 377 378 punpcklwd %1, %1 379 punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 380%endmacro 381 382%macro SSE2_DCT 6 383 SSE2_SumSub %6, %3, %5 384 SSE2_SumSub %1, %2, %5 385 SSE2_SumSub %3, %2, %5 386 SSE2_SumSubMul2 %6, %1, %4 387%endmacro 388 389%macro SSE2_IDCT 7 390 SSE2_SumSub %7, %2, %6 391 SSE2_SumSubDiv2 %1, %3, %5, %4 392 SSE2_SumSub %2, %1, %5 393 SSE2_SumSub %7, %4, %5 394%endmacro 395 396; Do 2 horizontal 4-pt DCTs in parallel packed as 8 words in an xmm register. 397; out=%1 in=%1 clobber=%2 398%macro SSE2_DCT_HORIZONTAL 2 399 pshuflw %2, %1, 1bh ; [x[3],x[2],x[1],x[0]] low qw 400 pmullw %1, [pic(wels_p1m1p1m1w_128)] ; [x[0],-x[1],x[2],-x[3], ...] 401 pshufhw %2, %2, 1bh ; [x[3],x[2],x[1],x[0]] high qw 402 paddw %1, %2 ; s = [x[0]+x[3],-x[1]+x[2],x[2]+x[1],-x[3]+x[0], ...] 403 pshufd %2, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...] 404 pmullw %1, [pic(wels_p1m1m1p1w_128)] ; [s[0],-s[1],-s[2],s[3], ...] 405 pmullw %2, [pic(wels_p1p2p1p2w_128)] ; [s[2],2*s[3],s[0],2*s[1], ...]] 406 paddw %1, %2 ; y = [s[0]+s[2],-s[1]+2*s[3],-s[2]+s[0],s[3]+2*s[1], ...] 407%endmacro 408 409; Do 2 horizontal 4-pt IDCTs in parallel packed as 8 words in an xmm register. 410; 411; Use a multiply by reciprocal to get -x>>1, and x+=-x>>1 to get x>>1, which 412; avoids a cumbersome blend with SSE2 to get a vector with right-shifted odd 413; elements. 414; 415; out=%1 in=%1 wels_p1m1m1p1w_128=%2 clobber=%3,%4 416%macro SSE2_IDCT_HORIZONTAL 4 417 movdqa %3, [pic(wels_p0m8000p0m8000w_128)] 418 pmulhw %3, %1 ; x[0:7] * [0,-8000h,0,-8000h, ...] >> 16 419 pshufd %4, %1, 0b1h ; [x[2],x[3],x[0],x[1], ...] 420 pmullw %4, %2 ; [x[2],-x[3],-x[0],x[1], ...] 421 paddw %1, %3 ; [x[0]+0,x[1]+(-x[1]>>1),x[2]+0,x[3]+(-x[3]>>1), ...] 422 paddw %1, %4 ; s = [x[0]+x[2],(x[1]>>1)-x[3],x[2]-x[0],(x[3]>>1)+x[1], ...] 
423 pshuflw %3, %1, 1bh ; [s[3],s[2],s[1],s[0]] low qw 424 pmullw %1, [pic(wels_p1p1m1m1w_128)] ; [s[0],s[1],-s[2],-s[3], ...] 425 pshufhw %3, %3, 1bh ; [s[3],s[2],s[1],s[0]] high qw 426 pmullw %3, %2 ; [s[3],-s[2],-s[1],s[0], ...] 427 paddw %1, %3 ; y = [s[0]+s[3],s[1]-s[2],-s[2]-s[1],-s[3]+s[0], ...] 428%endmacro 429 430; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in 2 xmm registers. 431; Uses scrambled input to save a negation. 432; [y0,y1]=%1 [y2,y3]=%2 [x1,x0]=%1 [x2,x3]=%2 clobber=%3 433%macro SSE2_DCT_4x4P 3 434 movdqa %3, %1 435 psubw %1, %2 ; [x1-x2,x0-x3] 436 paddw %2, %3 ; [x1+x2,x0+x3] 437 movdqa %3, %2 438 punpckhqdq %2, %1 ; s03 = [x0+x3,x0-x3] 439 punpcklqdq %3, %1 ; s12 = [x1+x2,x1-x2] 440 movdqa %1, %2 441 pmullw %1, [pic(wels_4xp1w_4xp2w)] ; [s03[0],2*s03[1]] 442 paddw %1, %3 ; [y0,y1] = [s03[0]+s12[0],2*s03[1]+s12[1]] 443 pmullw %3, [pic(wels_4xp1w_4xp2w)] ; [s12[0],2*s12[1]] 444 psubw %2, %3 ; [y2,y3] = [s03[0]-s12[0],s03[1]-2*s12[1]] 445%endmacro 446 447; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in 2 xmm registers. 448; Output is scrambled to save a negation. 
449; [y1,y0]=%1 [y2,y3]=%2 [x0,x1]=%1 [x2,x3]=%2 clobber=%3,%4 450%macro SSE2_IDCT_4x4P 4 451 movdqa %4, [pic(wels_4xp0w_4xm8000w)] 452 movdqa %3, %1 453 pmulhw %3, %4 ; x[0:1] * [0,-8000h] >> 16 454 pmulhw %4, %2 ; x[2:3] * [0,-8000h] >> 16 455 paddw %3, %1 ; [x[0],x[1]>>1] 456 paddw %4, %2 ; [x[2],x[3]>>1] 457 psubw %3, %2 ; [x[0]-x[2],(x[1]>>1)-x[3]] 458 paddw %1, %4 ; [x[2]+x[0],(x[3]>>1)+x[1]] 459 movdqa %2, %3 460 punpckhqdq %3, %1 ; s13 = [(x[1]>>1)-x[3],(x[3]>>1)+x[1]] 461 punpcklqdq %2, %1 ; s02 = [x[0]-x[2], x[2]+x[0]] 462 movdqa %1, %2 463 paddw %1, %3 ; [y1,y0] = [s02[0]+s13[0],s02[1]+s13[1]] 464 psubw %2, %3 ; [y2,y3] = [s02[0]-s13[0],s02[1]-s13[1]] 465%endmacro 466 467;*********************************************************************** 468; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 ) 469;*********************************************************************** 470WELS_EXTERN WelsDctFourT4_sse2 471 %assign push_num 0 472 INIT_X86_32_PIC r5 473 LOAD_5_PARA 474 PUSH_XMM 8 475 SIGN_EXTENSION r2, r2d 476 SIGN_EXTENSION r4, r4d 477 pxor xmm7, xmm7 478 ;Load 4x8 479 SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3] 480 SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4] 481 lea r1, [r1 + 2 * r2] 482 lea r3, [r3 + 2 * r4] 483 SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] 484 SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] 485 486 SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 487 SSE2_DCT_HORIZONTAL xmm2, xmm5 488 SSE2_DCT_HORIZONTAL xmm0, xmm5 489 SSE2_DCT_HORIZONTAL xmm3, xmm5 490 SSE2_DCT_HORIZONTAL xmm4, xmm5 491 492 SSE2_Store4x8p r0, xmm2, xmm0, xmm3, xmm4, xmm1 493 494 lea r1, [r1 + 2 * r2] 495 lea r3, [r3 + 2 * r4] 496 497 ;Load 4x8 498 SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ] 499 SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4] 500 lea r1, [r1 + 2 * r2] 501 lea r3, [r3 + 2 * r4] 502 SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3] 503 SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4] 504 505 
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0 506 SSE2_DCT_HORIZONTAL xmm2, xmm5 507 SSE2_DCT_HORIZONTAL xmm0, xmm5 508 SSE2_DCT_HORIZONTAL xmm3, xmm5 509 SSE2_DCT_HORIZONTAL xmm4, xmm5 510 511 SSE2_Store4x8p r0+64, xmm2, xmm0, xmm3, xmm4, xmm1 512 513 POP_XMM 514 LOAD_5_PARA_POP 515 DEINIT_X86_32_PIC 516 ret 517 518;*********************************************************************** 519; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs); 520;*********************************************************************** 521WELS_EXTERN WelsIDctFourT4Rec_sse2 522 %assign push_num 0 523 INIT_X86_32_PIC r5 524 LOAD_5_PARA 525 PUSH_XMM 8 526 SIGN_EXTENSION r1, r1d 527 SIGN_EXTENSION r3, r3d 528 ;Load 4x8 529 SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5 530 531 movdqa xmm7, [pic(wels_p1m1m1p1w_128)] 532 SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6 533 SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6 534 SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6 535 SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6 536 SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0 537 538 WELS_Zero xmm7 539 WELS_DW32 xmm6 540 541 SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3] 542 lea r0, [r0 + 2 * r1] 543 lea r2, [r2 + 2 * r3] 544 SSE2_StoreDiff16p xmm0, xmm4, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3] 545 546 lea r0, [r0 + 2 * r1] 547 lea r2, [r2 + 2 * r3] 548 SSE2_Load4x8p r4+64, xmm0, xmm1, xmm4, xmm2, xmm5 549 550 movdqa xmm7, [pic(wels_p1m1m1p1w_128)] 551 SSE2_IDCT_HORIZONTAL xmm0, xmm7, xmm5, xmm6 552 SSE2_IDCT_HORIZONTAL xmm1, xmm7, xmm5, xmm6 553 SSE2_IDCT_HORIZONTAL xmm4, xmm7, xmm5, xmm6 554 SSE2_IDCT_HORIZONTAL xmm2, xmm7, xmm5, xmm6 555 SSE2_IDCT xmm1, xmm4, xmm2, xmm3, xmm5, xmm6, xmm0 556 557 WELS_Zero xmm7 558 WELS_DW32 xmm6 559 560 SSE2_StoreDiff16p xmm1, xmm3, xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3] 561 lea r0, [r0 + 2 * r1] 562 lea r2, [r2 + 2 * r3] 563 SSE2_StoreDiff16p xmm0, xmm4, 
xmm5, xmm6, xmm7, [r0], [r2], [r0 + r1], [r2 + r3] 564 POP_XMM 565 LOAD_5_PARA_POP 566 DEINIT_X86_32_PIC 567 ret 568 569;*********************************************************************** 570; void WelsDctT4_sse2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) 571;*********************************************************************** 572WELS_EXTERN WelsDctT4_sse2 573 %assign push_num 0 574 INIT_X86_32_PIC r5 575 LOAD_5_PARA 576 PUSH_XMM 5 577 SIGN_EXTENSION r2, r2d 578 SIGN_EXTENSION r4, r4d 579 580 WELS_Zero xmm2 581 SSE2_LoadDiff2x4P xmm0, r1+r2, r1, r3+r4, r3, xmm2, xmm3, xmm4 582 add r1, r2 583 add r3, r4 584 SSE2_LoadDiff2x4P xmm1, r1+r2, r1+2*r2, r3+r4, r3+2*r4, xmm2, xmm3, xmm4 585 SSE2_DCT_HORIZONTAL xmm0, xmm3 586 SSE2_DCT_HORIZONTAL xmm1, xmm3 587 SSE2_DCT_4x4P xmm0, xmm1, xmm3 588 SSE2_Store2x4P r0, xmm0 589 SSE2_Store2x4P r0+16, xmm1 590 591 POP_XMM 592 LOAD_5_PARA_POP 593 DEINIT_X86_32_PIC 594 ret 595 596;*********************************************************************** 597; void IdctResAddPred_sse2(uint8_t* pPred, int32_t iStride, int16_t* pDct); 598;*********************************************************************** 599WELS_EXTERN IdctResAddPred_sse2 600 %assign push_num 0 601 LOAD_3_PARA_TO_5_PARA_IDCT 602 jmp prefixed(WelsIDctT4Rec_sse2.begin) 603 604;*********************************************************************** 605; void WelsIDctT4Rec_sse2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct); 606;*********************************************************************** 607WELS_EXTERN WelsIDctT4Rec_sse2 608 %assign push_num 0 609 LOAD_5_PARA 610.begin: 611 INIT_X86_32_PIC r5 612 PUSH_XMM 6 613 SIGN_EXTENSION r1, r1d 614 SIGN_EXTENSION r3, r3d 615 616 SSE2_Load2x4P xmm0, r4 617 SSE2_Load2x4P xmm1, r4+16 618 movdqa xmm4, [pic(wels_p1m1m1p1w_128)] 619 SSE2_IDCT_HORIZONTAL xmm0, xmm4, xmm2, xmm3 620 SSE2_IDCT_HORIZONTAL xmm1, xmm4, xmm2, xmm3 621 
SSE2_IDCT_4x4P xmm0, xmm1, xmm2, xmm3 622 WELS_Zero xmm4 623 WELS_DW32 xmm5 624 SSE2_StoreDiff2x4P r0+r1, r0, xmm0, r2+r3, r2, xmm5, xmm4, xmm2, xmm3 625 add r0, r1 626 add r2, r3 627 SSE2_StoreDiff2x4P r0+r1, r0+2*r1, xmm1, r2+r3, r2+2*r3, xmm5, xmm4, xmm2, xmm3 628 629 POP_XMM 630 DEINIT_X86_32_PIC 631 LOAD_5_PARA_POP 632 ret 633 634%macro SSE2_StoreDiff4x8p 8 635 SSE2_StoreDiff8p %1, %3, %4, [%5], [%6] 636 SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8] 637 SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8] 638 SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8] 639%endmacro 640 641 ;*********************************************************************** 642; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc) 643;*********************************************************************** 644WELS_EXTERN WelsIDctRecI16x16Dc_sse2 645 %assign push_num 0 646 LOAD_5_PARA 647 PUSH_XMM 8 648 SIGN_EXTENSION r1, r1d 649 SIGN_EXTENSION r3, r3d 650 pxor xmm7, xmm7 651 WELS_DW32 xmm6 652 653 SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4] 654 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 655 656 lea r0, [r0 + 2 * r1] 657 lea r2, [r2 + 2 * r3] 658 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 659 660 lea r0, [r0 + 2 * r1] 661 lea r2, [r2 + 2 * r3] 662 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 663 664 lea r0, [r0 + 2 * r1] 665 lea r2, [r2 + 2 * r3] 666 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 667 668 SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16] 669 lea r0, [r0 + 2 * r1] 670 lea r2, [r2 + 2 * r3] 671 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 672 673 lea r0, [r0 + 2 * r1] 674 lea r2, [r2 + 2 * r3] 675 SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3 676 677 lea r0, [r0 + 2 * r1] 678 lea r2, [r2 + 2 * r3] 679 SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 680 681 lea r0, [r0 + 2 * r1] 682 lea r2, [r2 + 2 * r3] 683 
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3 684 POP_XMM 685 LOAD_5_PARA_POP 686 ret 687 688 689;*********************************************************************** 690; AVX2 functions 691;*********************************************************************** 692 693%ifdef HAVE_AVX2 694; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8 695%macro AVX2_LoadDiff16P 8 696 vmovq x%1, [%2 ] 697 vpbroadcastq y%7, [%2 + 4 * %3] 698 vpblendd y%1, y%1, y%7, 11110000b 699 vpshufb y%1, y%1, y%6 700 vmovq x%7, [%4 ] 701 vpbroadcastq y%8, [%4 + 4 * %5] 702 vpblendd y%7, y%7, y%8, 11110000b 703 vpshufb y%7, y%7, y%6 704 vpsubw y%1, y%1, y%7 705%endmacro 706 707; pRec=%1 iStride=%2 data=%3,%4 pPred=%5 iPredStride=%6 dw32=%7 wels_shufb0312_movzxw=%8 clobber=%9,%10 708%macro AVX2_StoreDiff32P 10 709 vpaddw y%3, y%3, y%7 710 vpsraw y%3, y%3, 6 711 vmovq x%9, [%5 ] 712 vpbroadcastq y%10, [%5 + 4 * %6] 713 add %5, %6 714 vpblendd y%9, y%9, y%10, 11110000b 715 vpshufb y%9, y%9, y%8 716 vpaddsw y%3, y%3, y%9 717 vpaddw y%4, y%4, y%7 718 vpsraw y%4, y%4, 6 719 vmovq x%9, [%5 ] 720 vpbroadcastq y%10, [%5 + 4 * %6] 721 vpblendd y%9, y%9, y%10, 11110000b 722 vpshufb y%9, y%9, y%8 723 vpaddsw y%4, y%4, y%9 724 vpackuswb y%3, y%3, y%4 725 vbroadcasti128 y%4, [pic(wels_shufb0231_128)] 726 vpshufb y%3, y%3, y%4 727 vextracti128 x%4, y%3, 1 728 vmovlps [%1 ], x%3 729 vmovlps [%1 + 4 * %2], x%4 730 add %1, %2 731 vmovhps [%1 ], x%3 732 vmovhps [%1 + 4 * %2], x%4 733%endmacro 734 735; out=%1,%2,%3,%4 pDct=%5 clobber=%6 736%macro AVX2_Load4x16P 6 737 vmovdqa x%2, [%5+0x00] 738 vinserti128 y%2, y%2, [%5+0x40], 1 739 vmovdqa x%6, [%5+0x20] 740 vinserti128 y%6, y%6, [%5+0x60], 1 741 vpunpcklqdq y%1, y%2, y%6 742 vpunpckhqdq y%2, y%2, y%6 743 vmovdqa x%4, [%5+0x10] 744 vinserti128 y%4, y%4, [%5+0x50], 1 745 vmovdqa x%6, [%5+0x30] 746 vinserti128 y%6, y%6, [%5+0x70], 1 747 vpunpcklqdq y%3, y%4, y%6 748 vpunpckhqdq y%4, y%4, y%6 749%endmacro 750 
751; pDct=%1 data=%1,%2,%3,%4 clobber=%5 752%macro AVX2_Store4x16P 6 753 vpunpcklqdq y%6, y%2, y%3 754 vmovdqa [%1+0x00], x%6 755 vextracti128 [%1+0x40], y%6, 1 756 vpunpckhqdq y%6, y%2, y%3 757 vmovdqa [%1+0x20], x%6 758 vextracti128 [%1+0x60], y%6, 1 759 vpunpcklqdq y%6, y%4, y%5 760 vmovdqa [%1+0x10], x%6 761 vextracti128 [%1+0x50], y%6, 1 762 vpunpckhqdq y%6, y%4, y%5 763 vmovdqa [%1+0x30], x%6 764 vextracti128 [%1+0x70], y%6, 1 765%endmacro 766 767%macro AVX2_Load4x4P 2 768 vmovdqu y%1, [%2] 769%endmacro 770 771%macro AVX2_Store4x4P 2 772 vmovdqu [%1], y%2 773%endmacro 774 775; Load 4 lines of 4 pixels, shuffle and zero extend to 16-bit. 776; out=%1 pPixel=%2 iStride=%3 [wels_shufb0312_movzxw]=%4 clobber=%5,%6 777%macro AVX2_Loadzx4x4P 6 778 vmovd x%1, [%2 ] 779 add %2, %3 780 vpbroadcastd x%5, [%2 + 2 * %3] 781 vpblendd x%1, x%1, x%5, 1010b 782 vpbroadcastd y%5, [%2 ] 783 vpbroadcastd y%6, [%2 + %3] 784 vpblendd y%5, y%5, y%6, 10101010b 785 vpblendd y%1, y%1, y%5, 11110000b 786 vpshufb y%1, y%1, %4 787%endmacro 788 789; out=%1 pPixel1=%2 iStride1=%3 pPixel2=%4 iStride2=%5 wels_shufb0312_movzxw=%6 clobber=%7,%8,%9 790%macro AVX2_LoadDiff4x4P 9 791 AVX2_Loadzx4x4P %1, %2, %3, y%6, %7, %8 792 AVX2_Loadzx4x4P %7, %4, %5, y%6, %8, %9 793 vpsubw y%1, y%1, y%7 794%endmacro 795 796; pRec=%1 iStride=%2 data=%3 pPred=%4 iPredStride=%5 dw32=%6 wels_shufb0312_movzxw=%7 clobber=%8,%9,%10 797%macro AVX2_StoreDiff4x4P 10 798 vpaddw y%3, y%3, y%6 799 vpsraw y%3, y%3, 6 800 AVX2_Loadzx4x4P %8, %4, %5, y%7, %9, %10 801 vpaddsw y%3, y%3, y%8 802 vpackuswb y%3, y%3, y%3 803 vbroadcasti128 y%8, [pic(wels_shufb0231_128)] 804 vpshufb y%3, y%3, y%8 805 vextracti128 x%8, y%3, 1 806 vmovd [%1 ], x%3 807 add %1, %2 808 vmovd [%1 ], x%8 809 vpsrlq x%8, x%8, 32 810 vmovd [%1 + %2], x%8 811 vpsrlq x%3, x%3, 32 812 vmovd [%1 + 2 * %2], x%3 813%endmacro 814 815; 4-pt DCT 816; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5 817%macro AVX2_DCT 5 818 vpsubw %5, %1, %4 ; s3 = x0 - x3 819 vpaddw %1, 
%1, %4 ; s0 = x0 + x3 820 vpsubw %4, %2, %3 ; s2 = x1 - x2 821 vpaddw %2, %2, %3 ; s1 = x1 + x2 822 vpsubw %3, %1, %2 ; y2 = s0 - s1 823 vpaddw %1, %1, %2 ; y0 = s0 + s1 824 vpsllw %2, %5, 1 825 vpaddw %2, %2, %4 ; y1 = 2 * s3 + s2 826 vpsllw %4, %4, 1 827 vpsubw %4, %5, %4 ; y3 = s3 - 2 * s2 828%endmacro 829 830; 4-pt IDCT 831; out=%1,%2,%3,%4 in=%1,%2,%3,%4 clobber=%5 832%macro AVX2_IDCT 5 833 vpsraw %5, %2, 1 834 vpsubw %5, %5, %4 ; t3 = (x1 >> 1) - x3 835 vpsraw %4, %4, 1 836 vpaddw %4, %2, %4 ; t2 = x1 + (x3 >> 1) 837 vpaddw %2, %1, %3 ; t0 = x0 + x2 838 vpsubw %3, %1, %3 ; t1 = x0 - x2 839 vpaddw %1, %2, %4 ; y0 = t0 + t2 840 vpsubw %4, %2, %4 ; y3 = t0 - t2 841 vpaddw %2, %3, %5 ; y1 = t1 + t3 842 vpsubw %3, %3, %5 ; y2 = t1 - t3 843%endmacro 844 845; Do 4 horizontal 4-pt DCTs in parallel packed as 16 words in a ymm register. 846; Uses scrambled input to save a negation. 847; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 wels_shufb2301=%2 clobber=%3 848%macro AVX2_DCT_HORIZONTAL 3 849 vpsignw %3, %1, [pic(wels_p1m1p1m1w_256)] ; [x0,-x3,x1,-x2] 850 vpshufb %1, %1, %2 ; [x3,x0,x2,x1] 851 vpaddw %1, %1, %3 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1] 852 vpmullw %3, %1, [pic(wels_p1p2m1m2w_256)] ; [s[0],2*s[1],-s[2],-2*s[3], ...] 853 vpshufd %1, %1, 0b1h ; [s[2],s[3],s[0],s[1], ...] 854 vpaddw %1, %1, %3 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1], ...] 855%endmacro 856 857; Do 4 horizontal 4-pt IDCTs in parallel packed as 16 words in a ymm register. 858; Output is scrambled to save a negation. 859; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 wels_shufb2301=%2 clobber=%3 860%macro AVX2_IDCT_HORIZONTAL 3 861 vpsraw %3, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1] 862 vpblendw %3, %1, %3, 10101010b ; [x0,x1>>1,x2,x3>>1] 863 vpsignw %1, %1, [pic(wels_p1p1m1m1w_256)] ; [x0,x1,-x2,-x3] 864 vpshufd %3, %3, 0b1h ; [x2,x3>>1,x0,x1>>1] 865 vpaddw %1, %3, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3] 866 vpshufb %3, %1, %2 ; [s[1],s[0],s[3],s[2], ...] 
867 vpsignw %1, %1, [pic(wels_p1m1p1m1w_256)] ; [s[0],-s[1],s[2],-s[3], ...] 868 vpaddw %1, %1, %3 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2], ...] 869%endmacro 870 871; Do 4 vertical 4-pt DCTs in parallel packed as 16 words in a ymm register. 872; Uses scrambled input to save a negation. 873; [y0,y1,y2,y3]=%1 [x0,x3,x1,x2]=%1 clobber=%2 874%macro AVX2_DCT_4x4P 2 875 vpsignw %2, %1, [pic(wels_4xp1w_4xm1w_256)] ; [x0,-x3,x1,-x2] 876 vpshufd %1, %1, 4eh ; [x3,x0,x2,x1] 877 vpaddw %1, %1, %2 ; s = [x0+x3,-x3+x0,x1+x2,-x2+x1] 878 vpmullw %2, %1, [pic(wels_4xp1w_4xp2w_4xm1w_4xm2w)] ; [s[0],2*s[1],-s[2],-2*s[3]] 879 vpermq %1, %1, 4eh ; [s[2],s[3],s[0],s[1]] 880 vpaddw %1, %1, %2 ; [y0,y1,y2,y3] = [s[0]+s[2],2*s[1]+s[3],-s[2]+s[0],-2*s[3]+s[1]] 881%endmacro 882 883; Do 4 vertical 4-pt IDCTs in parallel packed as 16 words in a ymm register. 884; Output is scrambled to save a negation. 885; [y0,y3,y1,y2]=%1 [x0,x1,x2,x3]=%1 clobber=%2 886%macro AVX2_IDCT_4x4P 2 887 vpsraw %2, %1, 1 ; [x0>>1,x1>>1,x2>>1,x3>>1] 888 vpblendw %2, %1, %2, 11110000b ; [x0,x1>>1,x2,x3>>1] 889 vpsignw %1, %1, [pic(wels_8xp1w_8xm1w)] ; [x0,x1,-x2,-x3] 890 vpermq %2, %2, 4eh ; [x2,x3>>1,x0,x1>>1] 891 vpaddw %1, %2, %1 ; s = [x2+x0,(x3>>1)+x1,x0-x2,(x1>>1)-x3] 892 vpshufd %2, %1, 4eh ; [s[1],s[0],s[3],s[2]] 893 vpmullw %1, %1, [pic(wels_4xp1w_4xm1w_256)] ; [s[0],-s[1],s[2],-s[3], ...] 
894 vpaddw %1, %1, %2 ; [y0,y3,y1,y2] = [s[0]+s[1],-s[1]+s[0],s[2]+s[3],-s[3]+s[2]] 895%endmacro 896 897;*********************************************************************** 898; void WelsDctFourT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) 899;*********************************************************************** 900WELS_EXTERN WelsDctFourT4_avx2 901 %assign push_num 0 902 INIT_X86_32_PIC r5 903 LOAD_5_PARA 904 PUSH_XMM 7 905 SIGN_EXTENSION r2, r2d 906 SIGN_EXTENSION r4, r4d 907 908 vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)] 909 910 ;Load 4x16 911 AVX2_LoadDiff16P mm0, r1, r2, r3, r4, mm6, mm4, mm5 912 add r1, r2 913 add r3, r4 914 AVX2_LoadDiff16P mm1, r1, r2, r3, r4, mm6, mm4, mm5 915 add r1, r2 916 add r3, r4 917 AVX2_LoadDiff16P mm2, r1, r2, r3, r4, mm6, mm4, mm5 918 add r1, r2 919 add r3, r4 920 AVX2_LoadDiff16P mm3, r1, r2, r3, r4, mm6, mm4, mm5 921 922 AVX2_DCT ymm0, ymm1, ymm2, ymm3, ymm5 923 vbroadcasti128 ymm6, [pic(wels_shufb2301_128)] 924 AVX2_DCT_HORIZONTAL ymm0, ymm6, ymm5 925 AVX2_DCT_HORIZONTAL ymm1, ymm6, ymm5 926 AVX2_DCT_HORIZONTAL ymm2, ymm6, ymm5 927 AVX2_DCT_HORIZONTAL ymm3, ymm6, ymm5 928 929 AVX2_Store4x16P r0, mm0, mm1, mm2, mm3, mm5 930 vzeroupper 931 932 POP_XMM 933 LOAD_5_PARA_POP 934 DEINIT_X86_32_PIC 935 ret 936 937;*********************************************************************** 938; void IdctFourResAddPred_avx2(uint8_t* pPred, int32_t iStride, const int16_t* pDct, const int8_t* pNzc); 939;*********************************************************************** 940WELS_EXTERN IdctFourResAddPred_avx2 941 %assign push_num 0 942 LOAD_3_PARA_TO_5_PARA_IDCT 943 jmp prefixed(WelsIDctFourT4Rec_avx2.begin) 944 945;*********************************************************************** 946; void WelsIDctFourT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct); 
947;*********************************************************************** 948WELS_EXTERN WelsIDctFourT4Rec_avx2 949 %assign push_num 0 950 LOAD_5_PARA 951.begin: 952 INIT_X86_32_PIC r5 953 PUSH_XMM 8 954 SIGN_EXTENSION r1, r1d 955 SIGN_EXTENSION r3, r3d 956 957 AVX2_Load4x16P mm0, mm1, mm2, mm3, r4, mm5 958 vbroadcasti128 ymm6, [pic(wels_shufb2301_128)] 959 AVX2_IDCT_HORIZONTAL ymm0, ymm6, ymm5 960 AVX2_IDCT_HORIZONTAL ymm1, ymm6, ymm5 961 AVX2_IDCT_HORIZONTAL ymm2, ymm6, ymm5 962 AVX2_IDCT_HORIZONTAL ymm3, ymm6, ymm5 963 AVX2_IDCT ymm0, ymm1, ymm2, ymm3, ymm5 964 965 vbroadcasti128 ymm6, [pic(wels_shufb0312_movzxw_128)] 966 vbroadcasti128 ymm7, [pic(wels_dw32_128)] 967 AVX2_StoreDiff32P r0, r1, mm0, mm1, r2, r3, mm7, mm6, mm5, mm4 968 add r2, r3 969 add r0, r1 970 AVX2_StoreDiff32P r0, r1, mm2, mm3, r2, r3, mm7, mm6, mm5, mm4 971 vzeroupper 972 973 POP_XMM 974 DEINIT_X86_32_PIC 975 LOAD_5_PARA_POP 976 ret 977 978;*********************************************************************** 979; void WelsDctT4_avx2(int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2) 980;*********************************************************************** 981WELS_EXTERN WelsDctT4_avx2 982 %assign push_num 0 983 INIT_X86_32_PIC r5 984 LOAD_5_PARA 985 PUSH_XMM 5 986 SIGN_EXTENSION r2, r2d 987 SIGN_EXTENSION r4, r4d 988 989 vbroadcasti128 ymm1, [pic(wels_shufb0312_movzxw_128)] 990 AVX2_LoadDiff4x4P mm0, r1, r2, r3, r4, mm1, mm2, mm3, mm4 991 AVX2_DCT_4x4P ymm0, ymm2 992 vbroadcasti128 ymm1, [pic(wels_shufb2301_128)] 993 AVX2_DCT_HORIZONTAL ymm0, ymm1, ymm2 994 AVX2_Store4x4P r0, mm0 995 vzeroupper 996 997 POP_XMM 998 LOAD_5_PARA_POP 999 DEINIT_X86_32_PIC 1000 ret 1001 1002;*********************************************************************** 1003; void IdctResAddPred_avx2(uint8_t* pPred, int32_t iStride, int16_t* pDct); 1004;*********************************************************************** 1005WELS_EXTERN IdctResAddPred_avx2 1006 %assign 
push_num 0 1007 LOAD_3_PARA_TO_5_PARA_IDCT 1008 jmp prefixed(WelsIDctT4Rec_avx2.begin) 1009 1010;*********************************************************************** 1011; void WelsIDctT4Rec_avx2(uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct); 1012;*********************************************************************** 1013WELS_EXTERN WelsIDctT4Rec_avx2 1014 %assign push_num 0 1015 LOAD_5_PARA 1016.begin: 1017 INIT_X86_32_PIC r5 1018 PUSH_XMM 6 1019 SIGN_EXTENSION r1, r1d 1020 SIGN_EXTENSION r3, r3d 1021 1022 AVX2_Load4x4P mm0, r4 1023 vbroadcasti128 ymm4, [pic(wels_shufb2301_128)] 1024 AVX2_IDCT_HORIZONTAL ymm0, ymm4, ymm1 1025 AVX2_IDCT_4x4P ymm0, ymm1 1026 vbroadcasti128 ymm4, [pic(wels_shufb0312_movzxw_128)] 1027 vbroadcasti128 ymm5, [pic(wels_dw32_128)] 1028 AVX2_StoreDiff4x4P r0, r1, mm0, r2, r3, mm5, mm4, mm1, mm2, mm3 1029 vzeroupper 1030 1031 POP_XMM 1032 DEINIT_X86_32_PIC 1033 LOAD_5_PARA_POP 1034 ret 1035%endif 1036 1037