;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  intra_pred.asm
;*
;*  Abstract
;*      sse2 and mmx function for intra predict operations(decoder)
;*
;*  History
;*      18/09/2009 Created
;*      19/11/2010 Added
;*          WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
;*          WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
;*          and WelsDecoderIChromaPredDcNA_mmx
;*
;*
;*************************************************************************/

%include "asm_inc.asm"
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************

%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

; Per-column multipliers of the horizontal gradient b in the 16x16 plane loop:
; sse2_plane_inc_minus covers columns 0..7, sse2_plane_inc covers columns 8..15.
; sse2_plane_inc / sse2_plane_dec are also the weights used to accumulate H and V.
align 16
sse2_plane_inc_minus    dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc          dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec          dw 8, 7, 6, 5, 4, 3, 2, 1

; for chroma plane mode
sse2_plane_inc_c        dw 1, 2, 3, 4
sse2_plane_dec_c        dw 4, 3, 2, 1
align 16
sse2_plane_mul_b_c      dw -3, -2, -1, 0, 1, 2, 3, 4

; 16 bytes of 0x01: used as a multiplier to replicate a byte and as a
; per-byte LSB mask in the rounding correction of the 3-tap filters.
align 16
mmx_01bytes:            times 16 db 1

align 16
mmx_0x02:               dw 0x02, 0x00, 0x00, 0x00

align 16
sse2_dc_0x80:           times 16 db 0x80
align 16
sse2_wd_0x02:           times 8 dw 0x02

;*******************************************************************************
; macros
;*******************************************************************************

; SSE2_PRED_H_4X4_TWO_LINE  xmm0, xmm1, xmm2, eax, ecx
;   %1, %2 : result / scratch xmm      %3 : scratch xmm
;   %4     : row pointer               %5 : stride
; Replicates the byte at [%4-1] and the byte at [%4+%5-1] four times each;
; the lower 64 bits of %1 hold the two replicated rows.
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    movd        %1, [%4-1]
    movdqa      %3, %1
    punpcklbw   %1, %3
    movdqa      %3, %1
    punpcklbw   %1, %3

    ;add        %4, %5
    movd        %2, [%4+%5-1]
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3
    punpckldq   %1, %2
%endmacro


; LOAD_COLUMN  dst, tmp, tmp, tmp, ptr, stride
; Loads one dword from each of eight consecutive rows starting at %5 and
; interleaves them so that the upper 8 bytes of %1 hold byte 3 of rows 0..7.
; %5 is advanced by 8*%6; %2, %3 and %4 are clobbered.
%macro LOAD_COLUMN 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpcklwd   %1, %3
    lea         %5, [%5+2*%6]
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    lea         %5, [%5+2*%6]
    punpcklbw   %3, %2
    punpcklwd   %4, %3
    punpckhdq   %1, %4
%endmacro

; SUMW_HORIZON  acc, tmp, hi
; Horizontal sum of the eight words of %1; the low word of %1 receives the sum.
; %3 only supplies the upper halves of the intermediate dwords, so it does not
; influence the low 16 bits of the result. %2 is clobbered.
%macro SUMW_HORIZON 3
    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3          ; x1 = d37 d26 d15 d04
    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
    paddd       %1, %2          ; x1 = xxxx xxxx xxxx d01234567
%endmacro

; COPY_16_TIMES  ptr, dst
; Broadcasts the byte at [%1-1] to all 16 bytes of %2.
; Uses an aligned 16-byte load from [%1-16].
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]
    psrldq      %2, 15                      ; keep only the last byte of the load
    pmuludq     %2, [pic(mmx_01bytes)]      ; low dword = byte replicated 4 times
    pshufd      %2, %2, 0
%endmacro

; COPY_16_TIMESS  ptr, dst, offset
; Same as COPY_16_TIMES but for the byte at [%1+%3-1].
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]
    psrldq      %2, 15
    pmuludq     %2, [pic(mmx_01bytes)]
    pshufd      %2, %2, 0
%endmacro

; LOAD_COLUMN_C  dst, tmp, tmp, (unused), ptr, stride
; Four-row variant of LOAD_COLUMN: the upper 4 bytes of %1 hold byte 3 of
; rows 0..3. %5 is advanced by 4*%6; %2 and %3 are clobbered.
%macro LOAD_COLUMN_C 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpckhwd   %1, %3
    lea         %5, [%5+2*%6]
%endmacro

; LOAD_2_LEFT_AND_ADD
; Advances r0 by two rows and adds the two bytes at [r0-1] and [r0+r1-1]
; to r2. Clobbers r3.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-0x01]
    add         r2, r3
    movzx       r3, byte [r0+r1-0x01]
    add         r2, r3
%endmacro

;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text


;*******************************************************************************
;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   pPred must align to 16
;
;   Fills each of the four rows with the byte immediately left of that row.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    movzx       r2, byte [r0-1]             ; left of row 0
    movd        xmm0, r2d
    pmuludq     xmm0, [pic(mmx_01bytes)]    ; replicate into 4 bytes

    movzx       r2, byte [r0+r1-1]          ; left of row 1
    movd        xmm1, r2d
    pmuludq     xmm1, [pic(mmx_01bytes)]

    lea         r0, [r0+r1]
    movzx       r2, byte [r0+r1-1]          ; left of row 2
    movd        xmm2, r2d
    pmuludq     xmm2, [pic(mmx_01bytes)]

    movzx       r2, byte [r0+2*r1-1]        ; left of row 3
    movd        xmm3, r2d
    pmuludq     xmm3, [pic(mmx_01bytes)]

    sub         r0, r1                      ; back to row 0
    movd        [r0], xmm0
    movd        [r0+r1], xmm1
    lea         r0, [r0+2*r1]
    movd        [r0], xmm2
    movd        [r0+r1], xmm3

    DEINIT_X86_32_PIC
    ret

;*******************************************************************************
; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
;   b = (5 * H + 32) >> 6,  c = (5 * V + 32) >> 6,
;   a = (left[15*kiStride] + top[15]) << 4
;   Each output row is ((x - 7) * b + s) >> 5 clipped to a byte, where s starts
;   at a + 16 - 7*c and grows by c per row.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    mov         r4, r0          ; save r0 in r4
    sub         r0, 1
    sub         r0, r1          ; r0 -> top-left neighbour

    ;for H
    pxor        xmm7, xmm7
    movq        xmm0, [r0]
    movdqa      xmm5, [pic(sse2_plane_dec)]
    punpcklbw   xmm0, xmm7
    pmullw      xmm0, xmm5
    movq        xmm1, [r0 + 9]
    movdqa      xmm6, [pic(sse2_plane_inc)]
    punpcklbw   xmm1, xmm7
    pmullw      xmm1, xmm6
    psubw       xmm1, xmm0

    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r2d, xmm1       ; H += (i + 1) * (top[8 + i] - top[6 - i]);
    movsx       r2, r2w
    imul        r2, 5
    add         r2, 32
    sar         r2, 6           ; b = (5 * H + 32) >> 6;
    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b

    movzx       r3, BYTE [r0+16]
    sub         r0, 3           ; so that the left column is byte 3 of each dword
    LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1

    add         r0, 3
    movzx       r2, BYTE [r0+8*r1]
    add         r3, r2
    shl         r3, 4           ; a = (left[15*kiStride] + top[15]) << 4;

    sub         r0, 3
    add         r0, r1
    LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
    pxor        xmm4, xmm4
    punpckhbw   xmm0, xmm4
    pmullw      xmm0, xmm5
    punpckhbw   xmm7, xmm4
    pmullw      xmm7, xmm6
    psubw       xmm7, xmm0

    ; Indicate that xmm2 is fully initialized. Its actual value doesn't
    ; matter in SUMW_HORIZON below, but after being used in LOAD_COLUMN above,
    ; valgrind thinks that xmm2 contains uninitialized data (if the columns outside
    ; of the left are uninitialized, such as in DecUT_IntraPrediction), which taints
    ; r2d below, even if it actually isn't based on the uninitialized data.
    pxor        xmm2, xmm2

    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r2d, xmm7       ; V
    movsx       r2, r2w

    imul        r2, 5
    add         r2, 32
    sar         r2, 6           ; c = (5 * V + 32) >> 6;
    SSE2_Copy8Times xmm4, r2d   ; xmm4 = c,c,c,c,c,c,c,c

    mov         r0, r4
    add         r3, 16
    imul        r2, -7
    add         r3, r2          ; s = a + 16 + (-7)*c
    SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s

    xor         r2, r2          ; row counter
    movdqa      xmm5, [pic(sse2_plane_inc_minus)]

get_i16x16_luma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5      ; columns 0..7
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    movdqa      xmm3, xmm1
    pmullw      xmm3, xmm6      ; columns 8..15
    paddw       xmm3, xmm0
    psraw       xmm3, 5
    packuswb    xmm2, xmm3
    movdqa      [r0], xmm2
    paddw       xmm0, xmm4      ; s += c for the next row
    add         r0, r1
    inc         r2
    cmp         r2, 16
    jnz         get_i16x16_luma_pred_plane_sse2_1

    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    ret



;*******************************************************************************
; void WelsDecoderI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;
;   Fills each of the 16 rows with the byte immediately left of that row.
;*******************************************************************************

; SSE2_PRED_H_16X16_TWO_LINE_DEC  ptr, stride
; Advances %1 by two rows and fills those two rows. Clobbers xmm0.
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
    lea         %1, [%1+%2*2]

    COPY_16_TIMES   %1, xmm0
    movdqa      [%1], xmm0
    COPY_16_TIMESS  %1, xmm0, %2
    movdqa      [%1+%2], xmm0
%endmacro

WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
    %assign push_num 0
    INIT_X86_32_PIC_NOPRESERVE r2
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    ; rows 0 and 1
    COPY_16_TIMES   r0, xmm0
    movdqa      [r0], xmm0
    COPY_16_TIMESS  r0, xmm0, r1
    movdqa      [r0+r1], xmm0

    ; rows 2..15, two per invocation
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1

    DEINIT_X86_32_PIC
    ret

;*******************************************************************************
; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;
;   Copies the 16 bytes of the row above pPred into each of the 16 rows.
;   All accesses are aligned 16-byte moves.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub         r0, r1
    movdqa      xmm0, [r0]      ; top row

    movdqa      [r0+r1], xmm0   ; row 0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm0
    lea         r0, [r0+2*r1]
    movdqa      [r0], xmm0      ; row 15

    ret

;*******************************************************************************
; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;
;   b = (17 * H + 16) >> 5,  c = (17 * V + 16) >> 5,
;   a = (left[7*kiStride] + top[7]) << 4
;   Each output row is ((x - 3) * b + s) >> 5 clipped to a byte, where s starts
;   at a + 16 - 3*c and grows by c per row.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    mov         r4, r0          ; save pPred
    sub         r0, 1
    sub         r0, r1          ; r0 -> top-left neighbour

    ; for H
    pxor        mm7, mm7
    movq        mm0, [r0]
    movq        mm5, [pic(sse2_plane_dec_c)]
    punpcklbw   mm0, mm7
    pmullw      mm0, mm5
    movq        mm1, [r0 + 5]
    movq        mm6, [pic(sse2_plane_inc_c)]
    punpcklbw   mm1, mm7
    pmullw      mm1, mm6
    psubw       mm1, mm0

    movq2dq     xmm1, mm1
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r2d, xmm1       ; H
    movsx       r2, r2w
    imul        r2, 17
    add         r2, 16
    sar         r2, 5           ; b = (17 * H + 16) >> 5;
    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b

    movzx       r3, BYTE [r0+8]
    sub         r0, 3           ; so that the left column is byte 3 of each dword
    LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1

    add         r0, 3
    movzx       r2, BYTE [r0+4*r1]
    add         r3, r2
    shl         r3, 4           ; a = (left[7*kiStride] + top[7]) << 4;

    sub         r0, 3
    add         r0, r1
    LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
    pxor        mm4, mm4
    punpckhbw   mm0, mm4
    pmullw      mm0, mm5
    punpckhbw   mm7, mm4
    pmullw      mm7, mm6
    psubw       mm7, mm0

    movq2dq     xmm7, mm7
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r2d, xmm7       ; V
    movsx       r2, r2w

    imul        r2, 17
    add         r2, 16
    sar         r2, 5           ; c = (17 * V + 16) >> 5;
    SSE2_Copy8Times xmm4, r2d   ; xmm4 = c,c,c,c,c,c,c,c

    mov         r0, r4
    add         r3, 16
    imul        r2, -3
    add         r3, r2          ; s = a + 16 + (-3)*c
    SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s

    xor         r2, r2          ; row counter
    movdqa      xmm5, [pic(sse2_plane_mul_b_c)]

get_i_chroma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r0], xmm2
    paddw       xmm0, xmm4      ; s += c for the next row
    add         r0, r1
    inc         r2
    cmp         r2, 8
    jnz         get_i_chroma_pred_plane_sse2_1

    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret

;*******************************************************************************
;   0 |1 |2 |3 |4 |
;   6 |7 |8 |9 |10|
;   11|12|13|14|15|
;   16|17|18|19|20|
;   21|22|23|24|25|
;   7 is the start pixel of current 4x4 block
;   pPred[7] = ([6]+[0]*2+[1]+2)/4
;
;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   In the comments below mmN[k] is the k-th byte of mmN, counted from 1 at the
;   least significant byte, and [n] is the pixel at position n in the diagram.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0

    movq        mm1, [r2+r1-8]      ; mm1[8] = [11] (8-byte load ending at the left neighbour)
    movq        mm2, [r2-8]         ; mm2[8] = [6]
    sub         r2, r1              ; move r2 to the line above the current block (position of 1)
    punpckhbw   mm2, [r2-8]         ; mm2[8] = [0], mm2[7] = [6]
    movd        mm3, [r2]           ; mm3[1] = [1], mm3[2] = [2], mm3[3] = [3]
    punpckhwd   mm1, mm2            ; mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
    psllq       mm3, 18h            ; mm3[4]=[1]
    psrlq       mm1, 28h            ; mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
    por         mm3, mm1            ; mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
    movq        mm1, mm3            ; mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
    lea         r2, [r2+r1*2-8h]    ; r2+8 now points at position 12
    movq        mm4, [r2+r1]        ; mm4[8]=[16]
    psllq       mm3, 8              ; mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
    psrlq       mm4, 38h            ; mm4[1]=[16]
    por         mm3, mm4            ; mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
    movq        mm2, mm3            ; mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
    movq        mm4, [r2+r1*2]      ; mm4[8]=[21]
    psllq       mm3, 8              ; mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
    psrlq       mm4, 38h            ; mm4[1]=[21]
    por         mm3, mm4            ; mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
    movq        mm4, mm3            ; mm4 = copy of mm3
    pavgb       mm3, mm1            ; mm3=([11]+[21]+1)/2
    pxor        mm1, mm4            ; find odd value in the lowest bit of each byte
    pand        mm1, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm3, mm1            ; decrease 1 from odd bytes
    pavgb       mm2, mm3            ; mm2=(([11]+[21]+1)/2+1+[16])/2

    ; each row above is the row below shifted by one byte
    lea         r0, [r0+r1]
    movd        [r0+2*r1], mm2      ; row 3
    sub         r0, r1
    psrlq       mm2, 8
    movd        [r0+2*r1], mm2      ; row 2
    psrlq       mm2, 8
    movd        [r0+r1], mm2        ; row 1
    psrlq       mm2, 8
    movd        [r0], mm2           ; row 0
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;*******************************************************************************
;   void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
;   copy 8 pixel of 8 line from left
;*******************************************************************************

; MMX_PRED_H_8X8_ONE_LINE  reg, (unused), src, dst
; Broadcasts the byte at [%3-1] to 8 bytes and stores them at [%4].
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1, [%3-8]
    psrlq       %1, 38h                     ; keep only the last byte of the load

    pmullw      %1, [pic(mmx_01bytes)]      ; low word = byte replicated twice
    pshufw      %1, %1, 0
    movq        [%4], %1
%endmacro

; MMX_PRED_H_8X8_ONE_LINEE  reg, (unused), src, dst
; Same as MMX_PRED_H_8X8_ONE_LINE but reads the byte at [%3+r1-1]
; (r1 is referenced directly, not through a parameter).
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1, [%3+r1-8]
    psrlq       %1, 38h

    pmullw      %1, [pic(mmx_01bytes)]
    pshufw      %1, %1, 0
    movq        [%4], %1
%endmacro

WELS_EXTERN WelsDecoderIChromaPredH_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0              ; r2 = read pointer, r0 = write pointer

    ; row 0
    movq        mm0, [r2-8]
    psrlq       mm0, 38h

    pmullw      mm0, [pic(mmx_01bytes)]
    pshufw      mm0, mm0, 0
    movq        [r0], mm0

    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; row 1

    lea         r2, [r2+r1*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0+2*r1     ; row 2

    lea         r0, [r0+2*r1]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; row 3

    lea         r2, [r2+r1*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0+2*r1     ; row 4

    lea         r0, [r0+2*r1]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; row 5

    lea         r2, [r2+r1*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r2, r0+2*r1     ; row 6

    lea         r0, [r0+2*r1]
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1       ; row 7

    DEINIT_X86_32_PIC
    WELSEMMS
    ret


;*******************************************************************************
;   void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
;   copy 8 pixels from top 8 pixels
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredV_mmx
    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d

    sub         r0, r1
    movq        mm0, [r0]           ; top row

    movq        [r0+r1], mm0        ; row 0
    movq        [r0+2*r1], mm0
    lea         r0, [r0+2*r1]
    movq        [r0+r1], mm0
    movq        [r0+2*r1], mm0
    lea         r0, [r0+2*r1]
    movq        [r0+r1], mm0
    movq        [r0+2*r1], mm0
    lea         r0, [r0+2*r1]
    movq        [r0+r1], mm0
    movq        [r0+2*r1], mm0      ; row 7

    WELSEMMS
    ret


;*******************************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   t3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |a |b |
;   |g |h |e |f |
;   |i |j |g |h |

;   a = (1 + lt + l0)>>1
;   e = (1 + l0 + l1)>>1
;   g = (1 + l1 + l2)>>1
;   i = (1 + l2 + l3)>>1

;   d = (2 + t0 + (t1<<1) + t2)>>2
;   c = (2 + lt + (t0<<1) + t1)>>2
;   b = (2 + l0 + (lt<<1) + t0)>>2

;   f = (2 + l1 + (l0<<1) + lt)>>2
;   h = (2 + l2 + (l1<<1) + l0)>>2
;   j = (2 + l3 + (l2<<1) + l1)>>2
;   [b a f e h g j i] + [d c b a] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;
;   Register diagrams below list bytes from most to least significant.
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1
    movd        mm0, [r2-1]         ; mm0 = [xx xx xx xx t2 t1 t0 lt]
    psllq       mm0, 20h            ; mm0 = [t2 t1 t0 lt xx xx xx xx]

    movd        mm1, [r2+2*r1-4]
    punpcklbw   mm1, [r2+r1-4]      ; mm1[7] = l0, mm1[6] = l1
    lea         r2, [r2+2*r1]
    movd        mm2, [r2+2*r1-4]
    punpcklbw   mm2, [r2+r1-4]      ; mm2[7] = l2, mm2[6] = l3
    punpckhwd   mm2, mm1            ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
    psrlq       mm2, 20h
    pxor        mm0, mm2            ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

    movq        mm1, mm0
    psrlq       mm1, 10h            ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
    movq        mm2, mm0
    psrlq       mm2, 8h             ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
    movq        mm3, mm2
    movq        mm4, mm1
    pavgb       mm1, mm0

    pxor        mm4, mm0            ; find odd value in the lowest bit of each byte
    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm1, mm4            ; decrease 1 from odd bytes

    pavgb       mm2, mm1            ; mm2 = [xx xx d c b f h j]

    movq        mm4, mm0
    pavgb       mm3, mm4            ; mm3 = [xx xx xx xx a e g i]
    punpcklbw   mm3, mm2            ; mm3 = [b a f e h g j i]

    psrlq       mm2, 20h
    psllq       mm2, 30h            ; mm2 = [d c 0 0 0 0 0 0]
    movq        mm4, mm3
    psrlq       mm4, 10h            ; mm4 = [0 0 b a f e h g]
    pxor        mm2, mm4            ; mm2 = [d c b a xx xx xx xx]
    psrlq       mm2, 20h            ; mm2 = [xx xx xx xx d c b a]

    movd        [r0], mm2           ; row 0
    lea         r0, [r0+r1]
    movd        [r0+2*r1], mm3      ; row 3
    sub         r0, r1
    psrlq       mm3, 10h
    movd        [r0+2*r1], mm3      ; row 2
    psrlq       mm3, 10h
    movd        [r0+r1], mm3        ; row 1
    DEINIT_X86_32_PIC
    WELSEMMS
    ret



;*******************************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   only l0..l3 are read
;   destination:
;   |a |b |c |d |
;   |c |d |e |f |
;   |e |f |g |g |
;   |g |g |g |g |

;   a = (1 + l0 + l1)>>1
;   c = (1 + l1 + l2)>>1
;   e = (1 + l2 + l3)>>1
;   g = l3

;   b = (2 + l0 + (l1<<1) + l2)>>2
;   d = (2 + l1 + (l2<<1) + l3)>>2
;   f = (2 + l2 + (l3<<1) + l3)>>2

;   [g g f e d c b a] + [g g g g] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0

    movd        mm0, [r2-4]         ; mm0[3] = l0
    punpcklbw   mm0, [r2+r1-4]      ; mm0[7] = l1, mm0[6] = l0
    lea         r2, [r2+2*r1]
    movd        mm2, [r2-4]         ; mm2[3] = l2
    movd        mm4, [r2+r1-4]      ; mm4[3] = l3
    punpcklbw   mm2, mm4
    punpckhwd   mm0, mm2            ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

    psrlq       mm4, 18h
    psllq       mm4, 38h            ; mm4 = [l3 xx xx xx xx xx xx xx]
    psrlq       mm0, 8h
    pxor        mm0, mm4            ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

    movq        mm1, mm0
    psllq       mm1, 8h             ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
    movq        mm3, mm1            ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
    pavgb       mm1, mm0            ; mm1 = [g e c a xx xx xx xx]

    movq        mm2, mm0
    psllq       mm2, 10h            ; mm2 = [l2 l1 l0 xx xx xx xx xx]
    movq        mm5, mm2
    pavgb       mm2, mm0

    pxor        mm5, mm0            ; find odd value in the lowest bit of each byte
    pand        mm5, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm2, mm5            ; decrease 1 from odd bytes

    pavgb       mm2, mm3            ; mm2 = [f d b xx xx xx xx xx]

    psrlq       mm2, 8h
    pxor        mm2, mm4            ; mm2 = [g f d b xx xx xx xx]

    punpckhbw   mm1, mm2            ; mm1 = [g g f e d c b a]
    punpckhbw   mm4, mm4            ; mm4 = [g g xx xx xx xx xx xx]
    punpckhbw   mm4, mm4            ; mm4 = [g g g g xx xx xx xx]

    psrlq       mm4, 20h
    lea         r0, [r0+r1]
    movd        [r0+2*r1], mm4      ; row 3

    sub         r0, r1
    movd        [r0], mm1           ; row 0
    psrlq       mm1, 10h
    movd        [r0+r1], mm1        ; row 1
    psrlq       mm1, 10h
    movd        [r0+2*r1], mm1      ; row 2
    DEINIT_X86_32_PIC
    WELSEMMS
    ret



;*******************************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   l3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |i |a |b |c |
;   |j |e |f |g |

;   a = (1 + lt + t0)>>1
;   b = (1 + t0 + t1)>>1
;   c = (1 + t1 + t2)>>1
;   d = (1 + t2 + t3)>>1

;   e = (2 + l0 + (lt<<1) + t0)>>2
;   f = (2 + lt + (t0<<1) + t1)>>2
;   g = (2 + t0 + (t1<<1) + t2)>>2

;   h = (2 + t1 + (t2<<1) + t3)>>2
;   i = (2 + lt + (l0<<1) + l1)>>2
;   j = (2 + l0 + (l1<<1) + l2)>>2
;
;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1
    movq        mm0, [r2-1]         ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
    psllq       mm0, 18h            ; mm0 = [t3 t2 t1 t0 lt xx xx xx]

    movd        mm1, [r2+2*r1-4]
    punpcklbw   mm1, [r2+r1-4]      ; mm1[7] = l0, mm1[6] = l1
    lea         r2, [r2+2*r1]
    movq        mm2, [r2+r1-8]      ; mm2[7] = l2
    punpckhwd   mm2, mm1            ; mm2 = [l0 l1 l2 xx xx xx xx xx]
    psrlq       mm2, 28h
    pxor        mm0, mm2            ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

    movq        mm1, mm0
    psllq       mm1, 8h             ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm1, mm0            ; mm1 = [d c b a xx xx xx xx]

    movq        mm2, mm0
    psllq       mm2, 10h            ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
    movq        mm3, mm2
    pavgb       mm2, mm0

    pxor        mm3, mm0            ; find odd value in the lowest bit of each byte
    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm2, mm3            ; decrease 1 from odd bytes

    movq        mm3, mm0
    psllq       mm3, 8h             ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm3, mm2            ; mm3 = [h g f e i j xx xx]
    movq        mm2, mm3

    psrlq       mm1, 20h            ; mm1 = [xx xx xx xx d c b a]
    movd        [r0], mm1           ; row 0

    psrlq       mm2, 20h            ; mm2 = [xx xx xx xx h g f e]
    movd        [r0+r1], mm2        ; row 1

    movq        mm4, mm3
    psllq       mm4, 20h
    psrlq       mm4, 38h            ; mm4 = [xx xx xx xx xx xx xx i]

    movq        mm5, mm3
    psllq       mm5, 28h
    psrlq       mm5, 38h            ; mm5 = [xx xx xx xx xx xx xx j]

    psllq       mm1, 8h
    pxor        mm4, mm1            ; mm4 = [xx xx xx xx c b a i]
    movd        [r0+2*r1], mm4      ; row 2

    psllq       mm2, 8h
    pxor        mm5, mm2            ; mm5 = [xx xx xx xx g f e j]
    lea         r0, [r0+2*r1]
    movd        [r0+r1], mm5        ; row 3
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;*******************************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only t0..t7 are read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |b |c |d |e |
;   |c |d |e |f |
;   |d |e |f |g |

;   a = (2 + t0 + t2 + (t1<<1))>>2
;   b = (2 + t1 + t3 + (t2<<1))>>2
;   c = (2 + t2 + t4 + (t3<<1))>>2
;   d = (2 + t3 + t5 + (t4<<1))>>2

;   e = (2 + t4 + t6 + (t5<<1))>>2
;   f = (2 + t5 + t7 + (t6<<1))>>2
;   g = (2 + t6 + t7 + (t7<<1))>>2

;   [g f e d c b a] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1
    movq        mm0, [r2]           ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    movq        mm3, mm0
    psrlq       mm3, 38h
    psllq       mm3, 38h            ; mm3 = [t7 xx xx xx xx xx xx xx]

    psllq       mm1, 8h             ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
    psrlq       mm2, 8h
    pxor        mm2, mm3            ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]

    movq        mm3, mm1
    pavgb       mm1, mm2
    pxor        mm3, mm2            ; find odd value in the lowest bit of each byte
    pand        mm3, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm1, mm3            ; decrease 1 from odd bytes

    pavgb       mm0, mm1            ; mm0 = [g f e d c b a xx]

    psrlq       mm0, 8h
    movd        [r0], mm0           ; row 0
    psrlq       mm0, 8h
    movd        [r0+r1], mm0        ; row 1
    psrlq       mm0, 8h
    movd        [r0+2*r1], mm0      ; row 2
    psrlq       mm0, 8h
    lea         r0, [r0+2*r1]
    movd        [r0+r1], mm0        ; row 3
    DEINIT_X86_32_PIC
    WELSEMMS
    ret


;*******************************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   only the top row is read; lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |b |c |d |i |
;   |f |g |h |j |

;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1

;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2

;   [i d c b a] + [j h g f e] --> mov to memory
;
;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r2, r0

    sub         r2, r1
    movq        mm0, [r2]           ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    psrlq       mm1, 8h             ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
    psrlq       mm2, 10h            ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]

    movq        mm3, mm1
    pavgb       mm3, mm0            ; mm3 = [xx xx xx i  d  c  b  a]

    movq        mm4, mm2
    pavgb       mm2, mm0
    pxor        mm4, mm0            ; find odd value in the lowest bit of each byte
    pand        mm4, [pic(mmx_01bytes)] ; set the odd bit
    psubusb     mm2, mm4            ; decrease 1 from odd bytes

    pavgb       mm2, mm1            ; mm2 = [xx xx xx j  h  g  f  e]

    movd        [r0], mm3           ; row 0
    psrlq       mm3, 8h
    movd        [r0+2*r1], mm3      ; row 2

    movd        [r0+r1], mm2        ; row 1
    psrlq       mm2, 8h
    lea         r0, [r0+2*r1]
    movd        [r0+r1], mm2        ; row 3
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;*******************************************************************************
;
;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   Computes one DC value per 4x4 quadrant of the 8x8 block:
;     top-left     = (top[0..3] + left[0..3] + 4) >> 3
;     top-right    = (top[4..7] + 2) >> 2
;     bottom-left  = (left[4..7] + 2) >> 2
;     bottom-right = (top[4..7] + left[4..7] + 4) >> 3
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0              ; save pPred

    sub         r0, r1
    movq        mm0, [r0]           ; top row

    movzx       r2, byte [r0+r1-0x01]   ; l1
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-0x01]      ; l2
    add         r2, r3
    movzx       r3, byte [r0+r1-0x01]   ; l3
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-0x01]      ; l4
    add         r2, r3
    movd        mm1, r2d                ; mm1 = l1+l2+l3+l4

    movzx       r2, byte [r0+r1-0x01]   ; l5
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-0x01]      ; l6
    add         r2, r3
    movzx       r3, byte [r0+r1-0x01]   ; l7
    add         r2, r3
    lea         r0, [r0+2*r1]
    movzx       r3, byte [r0-0x01]      ; l8
    add         r2, r3
    movd        mm2, r2d                ; mm2 = l5+l6+l7+l8

    movq        mm3, mm0
    psrlq       mm0, 0x20           ; mm0 = upper four top bytes
    psllq       mm3, 0x20
    psrlq       mm3, 0x20           ; mm3 = lower four top bytes
    pxor        mm4, mm4
    psadbw      mm0, mm4
    psadbw      mm3, mm4            ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2

    paddq       mm3, mm1
    movq        mm1, mm2
    paddq       mm1, mm0            ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1

    movq        mm4, [pic(mmx_0x02)]

    paddq       mm0, mm4
    psrlq       mm0, 0x02           ; (sum2 + 2) >> 2

    paddq       mm2, mm4
    psrlq       mm2, 0x02           ; (sum3 + 2) >> 2

    paddq       mm3, mm4
    paddq       mm3, mm4
    psrlq       mm3, 0x03           ; (sum1 + 4) >> 3

    paddq       mm1, mm4
    paddq       mm1, mm4
    psrlq       mm1, 0x03           ; (sum4 + 4) >> 3

    pmuludq     mm0, [pic(mmx_01bytes)]
    pmuludq     mm3, [pic(mmx_01bytes)]
    psllq       mm0, 0x20
    pxor        mm0, mm3            ; mm0 = m_up

    pmuludq     mm2, [pic(mmx_01bytes)]
    pmuludq     mm1, [pic(mmx_01bytes)]
    psllq       mm1, 0x20
    pxor        mm1, mm2            ; mm1 = m_down

    movq        [r4], mm0           ; rows 0..3
    movq        [r4+r1], mm0
    movq        [r4+2*r1], mm0
    lea         r4, [r4+2*r1]
    movq        [r4+r1], mm0

    movq        [r4+2*r1], mm1      ; rows 4..7
    lea         r4, [r4+2*r1]
    movq        [r4+r1], mm1
    movq        [r4+2*r1], mm1
    lea         r4, [r4+2*r1]
    movq        [r4+r1], mm1

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret



;*******************************************************************************
;
;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   dc = (sum of the 16 top bytes + sum of the 16 left bytes + 16) >> 5,
;   stored to all 16 rows with aligned 16-byte moves.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    mov         r4, r0              ; save pPred
    sub         r0, r1
    movdqa      xmm0, [r0]          ; read one row
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1          ; two partial sums, one per 8-byte half
    movdqa      xmm1, xmm0
    psrldq      xmm1, 0x08
    pslldq      xmm0, 0x08
    psrldq      xmm0, 0x08
    paddw       xmm0, xmm1          ; low word = sum of the top row

    movzx       r2, byte [r0+r1-0x01]
    movzx       r3, byte [r0+2*r1-0x01]
    add         r2, r3
    lea         r0, [r0+r1]
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    LOAD_2_LEFT_AND_ADD
    add         r2, 0x10            ; rounding term
    movd        xmm1, r2d
    paddw       xmm0, xmm1
    psrld       xmm0, 0x05
    pmuludq     xmm0, [pic(mmx_01bytes)]
    pshufd      xmm0, xmm0, 0       ; dc in all 16 bytes

    movdqa      [r4], xmm0
    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0
    movdqa      [r4+2*r1], xmm0
    lea         r4, [r4+2*r1]

    movdqa      [r4+r1], xmm0

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3

    ret

;*******************************************************************************
; for intra prediction as follows, 11/19/2010
;*******************************************************************************

;*******************************************************************************
;   void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
;
;   dc = (sum of the 16 top bytes + 8) >> 4, stored to all 16 rows with
;   aligned 16-byte moves.
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    mov         r2, r0
    sub         r2, r1
    movdqa      xmm0, [r2]          ; pPred-kiStride, top line
    pxor        xmm7, xmm7
    psadbw      xmm0, xmm7          ; two partial sums, one per 8-byte half
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddw       xmm0, xmm1
    movd        r2d, xmm0           ; overwrites r2d entirely, so no prior clearing is needed

    add         r2, 8
    sar         r2, 4
    SSE2_Copy16Times xmm1, r2d
    movdqa      xmm0, xmm1
    lea         r2, [2*r1+r1]       ; 3*kiStride

    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1

    lea         r0, [r0+4*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1

    lea         r0, [r0+4*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1

    lea         r0, [r0+4*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1

    POP_XMM
    ret

;*******************************************************************************
;   void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    lea         r2, [2*r1+r1]       ; 3*kiStride

    movdqa      xmm0, [pic(sse2_dc_0x80)]
    movdqa      xmm1, xmm0
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1
    lea         r0, [r0+4*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1
    lea         r0, [r0+4*r1]
    movdqa      [r0], xmm0
    movdqa      [r0+r1], xmm1
    movdqa      [r0+2*r1], xmm0
    movdqa      [r0+r2], xmm1
    ; the routine continues past this point; the next instruction's operand follows
    lea         r0, 
[r0+4*r1] 1309 movdqa [r0], xmm0 1310 movdqa [r0+r1], xmm1 1311 movdqa [r0+2*r1], xmm0 1312 movdqa [r0+r2], xmm1 1313 1314 DEINIT_X86_32_PIC 1315 ret 1316 1317;******************************************************************************* 1318; void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride) 1319;******************************************************************************* 1320WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx 1321 push r3 1322 push r4 1323 %assign push_num 2 1324 LOAD_2_PARA 1325 SIGN_EXTENSION r1, r1d 1326 mov r4, r0 1327 ; for left 1328 dec r0 1329 xor r2, r2 1330 xor r3, r3 1331 movzx r2, byte [r0] 1332 movzx r3, byte [r0+r1] 1333 add r2, r3 1334 lea r0, [r0+2*r1] 1335 movzx r3, byte [r0] 1336 add r2, r3 1337 movzx r3, byte [r0+r1] 1338 add r2, r3 1339 add r2, 02h 1340 sar r2, 02h 1341 ;SSE2_Copy16Times mm0, r2d 1342 mov r3, r2 1343 sal r3, 8 1344 or r2, r3 1345 movd mm1, r2d 1346 pshufw mm0, mm1, 00h 1347 ;mov bh, bl 1348 ;movd mm1, ebx 1349 ;pshufw mm0, mm1, 00h ; up64 1350 movq mm1, mm0 1351 xor r2, r2 1352 lea r0, [r0+2*r1] 1353 movzx r2, byte [r0] 1354 movzx r3, byte [r0+r1] 1355 add r2, r3 1356 lea r0, [r0+2*r1] 1357 movzx r3, byte [r0] 1358 add r2, r3 1359 movzx r3, byte [r0+r1] 1360 add r2, r3 1361 add r2, 02h 1362 sar r2, 02h 1363 mov r3, r2 1364 sal r3, 8 1365 or r2, r3 1366 movd mm3, r2d 1367 pshufw mm2, mm3, 00h 1368 ;mov bh, bl 1369 ;movd mm3, ebx 1370 ;pshufw mm2, mm3, 00h ; down64 1371 ;SSE2_Copy16Times mm2, r2d 1372 movq mm3, mm2 1373 lea r2, [2*r1+r1] 1374 movq [r4], mm0 1375 movq [r4+r1], mm1 1376 movq [r4+2*r1], mm0 1377 movq [r4+r2], mm1 1378 lea r4, [r4+4*r1] 1379 movq [r4], mm2 1380 movq [r4+r1], mm3 1381 movq [r4+2*r1], mm2 1382 movq [r4+r2], mm3 1383 pop r4 1384 pop r3 1385 emms 1386 ret 1387 1388;******************************************************************************* 1389; void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride) 
;*******************************************************************************
; 8x8 chroma DC prediction using the top neighbour row only.
;   columns 0..3 of every row <- (T0 + T1 + T2 + T3 + 2) >> 2
;   columns 4..7 of every row <- (T4 + T5 + T6 + T7 + 2) >> 2
; where Tn is the byte at pPred[n - kiStride].
;
; r0 = pPred, r1 = kiStride, r2 = top-row pointer, later 3*kiStride.
; xmm7 is kept at zero; xmm6 holds the rounding constant (8 x word 2).
; PUSH_XMM / POP_XMM (asm_inc.asm) presumably preserve xmm6/xmm7 where the ABI
; requires that.
WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d

    ; --- load the 8 top pixels and widen them to words w0..w7 --------------
    mov             r2, r0
    sub             r2, r1                  ; r2 = pPred - kiStride
    movq            xmm0, [r2]              ; top: 8x1 pixels
    pxor            xmm7, xmm7
    punpcklbw       xmm0, xmm7              ; bytes -> 8 words

    ; --- reduce each group of four words (lanes listed high -> low) --------
    pshufd          xmm1, xmm0, 0B1h        ; w5 w4 w7 w6 | w1 w0 w3 w2
    paddw           xmm0, xmm1              ; w5+7 w4+6 w5+7 w4+6 | w1+3 w0+2 w1+3 w0+2
    movdqa          xmm1, xmm0
    pshuflw         xmm2, xmm0, 0B1h        ; swap word pairs in the low half
    pshufhw         xmm3, xmm1, 0B1h        ; swap word pairs in the high half
    paddw           xmm0, xmm2              ; low  4 words = w0+w1+w2+w3 (sum0)
    paddw           xmm1, xmm3              ; high 4 words = w4+w5+w6+w7 (sum1)
    punpckhqdq      xmm1, xmm7              ; move sum1 words to the low qword
    punpcklqdq      xmm0, xmm1              ; sum1 x4 | sum0 x4

    ; --- xmm6 = 8 x word 0x0002 (rounding term) ----------------------------
%ifdef X86_32_PICASM
    pcmpeqw         xmm6, xmm6              ; 0xFFFF in every word
    psrlw           xmm6, 15                ; -> 1
    psllw           xmm6, 1                 ; -> 2, no memory operand needed
%else
    movdqa          xmm6, [sse2_wd_0x02]
%endif
    paddw           xmm0, xmm6
    psraw           xmm0, 02h               ; (sum + 2) >> 2
    packuswb        xmm0, xmm7              ; low 8 bytes: dc0 x4, dc1 x4

    ; --- write 8 rows of 8 bytes -------------------------------------------
    lea             r2, [2*r1+r1]           ; r2 = 3*kiStride
    movq            [r0], xmm0
    movq            [r0+r1], xmm0
    movq            [r0+2*r1], xmm0
    movq            [r0+r2], xmm0
    lea             r0, [r0+4*r1]
    movq            [r0], xmm0
    movq            [r0+r1], xmm0
    movq            [r0+2*r1], xmm0
    movq            [r0+r2], xmm0

    POP_XMM
    ret

;*******************************************************************************
; void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
;
; 8x8 chroma DC prediction with no neighbour input: every byte of the
; 8 rows x 8 bytes at pPred is set to 0x80 (first 8 bytes of sse2_dc_0x80).
;
; r0 = pPred, r1 = kiStride, r2 = 3*kiStride.
; r3 is handed to INIT_X86_32_PIC as the PIC base for pic(...) addressing.
; Uses mm0/mm1 and executes emms before ret.
;*******************************************************************************
WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    SIGN_EXTENSION  r1, r1d
    lea             r2, [2*r1+r1]           ; r2 = 3*kiStride

    movq            mm0, [pic(sse2_dc_0x80)]
    movq            mm1, mm0

    movq            [r0], mm0
    movq            [r0+r1], mm1
    movq            [r0+2*r1], mm0
    movq            [r0+r2], mm1
    lea             r0, [r0+4*r1]
    movq            [r0], mm0
    movq            [r0+r1], mm1
    movq            [r0+2*r1], mm0
    movq            [r0+r2], mm1

    DEINIT_X86_32_PIC
    emms
    ret
