;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  intra_pred.asm
;*
;*  Abstract
;*      sse2 function for intra predict operations
;*
;*  History
;*      18/09/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

; Word weights used by the 16x16 plane predictor.
align 16
sse2_plane_inc_minus    dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc          dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec          dw 8, 7, 6, 5, 4, 3, 2, 1

; Word weights used by the chroma (8x8) plane predictor.
sse2_plane_inc_c        dw 1, 2, 3, 4
sse2_plane_dec_c        dw 4, 3, 2, 1
align 16
sse2_plane_mul_b_c      dw -3, -2, -1, 0, 1, 2, 3, 4

; Sixteen bytes of 0x01: read as dwords it is 0x01010101 (byte splat by
; multiplication), read as bytes it is a per-byte LSB mask.
align 16
mmx_01bytes:            times 16 db 1

; 64-bit constant 2 (rounding term for the chroma DC sums).
align 16
mmx_0x02:               dw 0x02, 0x00, 0x00, 0x00


;***********************************************************************
; macros
;***********************************************************************

; SSE_DB_1_2REG dst, tmp
;   dst = sixteen bytes of 1 (0 - (-1) per byte); tmp is left as all-ones.
%macro SSE_DB_1_2REG 2
    pxor        %1, %1
    pcmpeqw     %2, %2
    psubb       %1, %2
%endmacro

; SSE2_PRED_H_4X4_TWO_LINE out, tmpA, tmpB, ptr, stride
;   Loads the dword at [ptr-1] and at [ptr+stride-1], replicates the low
;   byte of each four times, and packs both results into the low 64 bits
;   of %1 (first row in dword 0, second row in dword 1).
;   Clobbers %2 and %3; %4 and %5 are not modified.
%macro SSE2_PRED_H_4X4_TWO_LINE 5
    movd        %1, [%4-1]
    movdqa      %3, %1
    punpcklbw   %1, %3
    movdqa      %3, %1
    punpcklbw   %1, %3

    ;add        %4, %5
    movd        %2, [%4+%5-1]
    movdqa      %3, %2
    punpcklbw   %2, %3
    movdqa      %3, %2
    punpcklbw   %2, %3
    punpckldq   %1, %2
%endmacro

; SUMW_HORIZON1 acc, tmp
;   Folds the eight words of %1 with saturating unsigned adds; the total
;   ends up in word 0 of %1. Clobbers %2.
%macro SUMW_HORIZON1 2
    movdqa      %2, %1
    psrldq      %2, 8
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 4
    paddusw     %1, %2
    movdqa      %2, %1
    psrldq      %2, 2
    paddusw     %1, %2
%endmacro

; LOAD_COLUMN out, tmpA, tmpB, tmpC, ptr, stride
;   Reads one dword from each of eight consecutive rows starting at [ptr]
;   and transposes so that byte 3 of rows 0..7 lands in bytes 8..15 of %1
;   (row 0 lowest). Callers therefore pass ptr = column address - 3.
;   Clobbers %2, %3, %4; advances %5 by 8*stride.
%macro LOAD_COLUMN 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2                  ; rows 0,1 byte-interleaved
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2                  ; rows 2,3 byte-interleaved
    punpcklwd   %1, %3                  ; dword 3 of %1 = byte 3 of rows 0..3
    lea         %5, [%5+2*%6]
    movd        %4, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %4, %2                  ; rows 4,5
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    lea         %5, [%5+2*%6]
    punpcklbw   %3, %2                  ; rows 6,7
    punpcklwd   %4, %3                  ; dword 3 of %4 = byte 3 of rows 4..7
    punpckhdq   %1, %4                  ; high qword of %1 = the 8 column bytes
%endmacro

; SUMW_HORIZON acc, tmp, hi
;   Horizontal sum of the eight words d0..d7 in %1; the total is left in
;   the low dword of %1. %3 supplies the upper 16 bits of each widened
;   lane: callers that need the full 32-bit result pass a zeroed register,
;   callers that only consume the low 16 bits may pass anything.
;   Clobbers %2.
%macro SUMW_HORIZON 3
    movhlps     %2, %1                  ; x2 = xx xx xx xx d7 d6 d5 d4
    paddw       %1, %2                  ; x1 = xx xx xx xx d37 d26 d15 d04
    punpcklwd   %1, %3                  ; x1 = d37 d26 d15 d04 (as dwords)
    movhlps     %2, %1                  ; x2 = xxxx xxxx d37 d26
    paddd       %1, %2                  ; x1 = xxxx xxxx d1357 d0246
    pshuflw     %2, %1, 0x4e            ; x2 = xxxx xxxx d0246 d1357
    paddd       %1, %2                  ; x1 = xxxx xxxx xxxx d01234567
%endmacro


; COPY_16_TIMES ptr, out
;   Takes the byte at [ptr-1] (top byte of the aligned 16-byte load at
;   [ptr-16]) and replicates it into all 16 bytes of %2.
%macro COPY_16_TIMES 2
    movdqa      %2, [%1-16]
    psrldq      %2, 15
    pmuludq     %2, [pic(mmx_01bytes)]
    pshufd      %2, %2, 0
%endmacro

; COPY_16_TIMESS ptr, out, offset
;   Same as COPY_16_TIMES but for the byte at [ptr+offset-1].
%macro COPY_16_TIMESS 3
    movdqa      %2, [%1+%3-16]
    psrldq      %2, 15
    pmuludq     %2, [pic(mmx_01bytes)]
    pshufd      %2, %2, 0
%endmacro

; LOAD_COLUMN_C out, tmpA, tmpB, (unused), ptr, stride
;   Four-row variant of LOAD_COLUMN: byte 3 of rows 0..3 ends up in the
;   high dword of %1 (row 0 lowest). Clobbers %2, %3; advances %5 by
;   4*stride. %4 is accepted but never referenced.
%macro LOAD_COLUMN_C 6
    movd        %1, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %1, %2
    lea         %5, [%5+2*%6]
    movd        %3, [%5]
    movd        %2, [%5+%6]
    punpcklbw   %3, %2
    punpckhwd   %1, %3
    lea         %5, [%5+2*%6]
%endmacro

; LOAD_2_LEFT_AND_ADD
;   Steps r1 down two rows and adds the bytes at [r1-1] and [r1+r2-1]
;   to the running sum in r3. Clobbers r4.
%macro LOAD_2_LEFT_AND_ADD 0
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]
    add         r3, r4
%endmacro

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   Horizontal 4x4 prediction: every output row is its left neighbour
;   (pRef[row*stride - 1]) repeated four times. The four rows are written
;   as one 16-byte block, so pred must be 16-byte aligned (movdqa).
;   Below, r0 is used as pred, r1 as pRef and r2 as stride.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredH_sse2
    push        r3
    %assign push_num 1
    INIT_X86_32_PIC r4
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    ; rows 0 and 1: pixel * 0x01010101 replicates the byte across a dword
    movzx       r3, byte [r1-1]
    movd        xmm0, r3d
    pmuludq     xmm0, [pic(mmx_01bytes)]

    movzx       r3, byte [r1+r2-1]
    movd        xmm1, r3d
    pmuludq     xmm1, [pic(mmx_01bytes)]

    unpcklps    xmm0, xmm1              ; low qword = row0 | row1

    ; rows 2 and 3
    lea         r1, [r1+r2*2]
    movzx       r3, byte [r1-1]
    movd        xmm2, r3d
    pmuludq     xmm2, [pic(mmx_01bytes)]

    movzx       r3, byte [r1+r2-1]
    movd        xmm3, r3d
    pmuludq     xmm3, [pic(mmx_01bytes)]

    unpcklps    xmm2, xmm3              ; low qword = row2 | row3
    unpcklpd    xmm0, xmm2              ; xmm0 = row0 row1 row2 row3

    movdqa      [r0], xmm0
    DEINIT_X86_32_PIC
    pop         r3
    ret

;***********************************************************************
;   void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   16x16 plane prediction. With top[]/left[] the neighbours above / to
;   the left of the block (index -1 = the top-left corner):
;       H = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i])
;       V = sum_{i=0..7} (i+1) * (left[8+i] - left[6-i])
;       b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;       a = (left[15] + top[15]) << 4
;       pred[y][x] = clip8((a + 16 + b*(x-7) + c*(y-7)) >> 5)
;   Output is 16 rows of 16 bytes written back to back at pred with
;   movdqa, so pred must be 16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                  ; r1 -> top-left corner pixel

    ; ---- H: weighted difference along the top row ----
    pxor        xmm7, xmm7
    movq        xmm0, [r1]              ; top[-1..6]
    movdqa      xmm5, [pic(sse2_plane_dec)]
    punpcklbw   xmm0, xmm7
    pmullw      xmm0, xmm5              ; * 8,7,...,1
    movq        xmm1, [r1 + 9]          ; top[8..15]
    movdqa      xmm6, [pic(sse2_plane_inc)]
    punpcklbw   xmm1, xmm7
    pmullw      xmm1, xmm6              ; * 1,2,...,8
    psubw       xmm1, xmm0

    ; xmm2 is not cleared here: only the low 16 bits of the sum are
    ; consumed below (movsx from r3w), and those do not depend on it.
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1               ; H += (i + 1) * (top[8 + i] - top[6 - i]);
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                   ; b = (5 * H + 32) >> 6;
    SSE2_Copy8Times xmm1, r3d           ; xmm1 = b,b,b,b,b,b,b,b

    ; ---- V: weighted difference along the left column ----
    movzx       r4, BYTE [r1+16]        ; top[15]
    sub         r1, 3                   ; LOAD_COLUMN picks byte 3 of each dword
    LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 ; left[-1..6]

    add         r1, 3
    movzx       r3, BYTE [r1+8*r2]      ; left[15]
    add         r4, r3
    shl         r4, 4                   ; a = (left[15*stride] + top[15]) << 4;

    sub         r1, 3
    add         r1, r2
    LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 ; left[8..15]
    pxor        xmm4, xmm4
    punpckhbw   xmm0, xmm4
    pmullw      xmm0, xmm5
    punpckhbw   xmm7, xmm4
    pmullw      xmm7, xmm6
    psubw       xmm7, xmm0

    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r3d, xmm7               ; V
    movsx       r3, r3w
    imul        r3, 5
    add         r3, 32
    sar         r3, 6                   ; c = (5 * V + 32) >> 6;
    SSE2_Copy8Times xmm4, r3d           ; xmm4 = c,c,c,c,c,c,c,c

    add         r4, 16
    imul        r3, -7
    add         r3, r4                  ; s = a + 16 + (-7)*c
    SSE2_Copy8Times xmm0, r3d           ; xmm0 = s,s,s,s,s,s,s,s

    xor         r3, r3                  ; row counter
    movdqa      xmm5, [pic(sse2_plane_inc_minus)]

    ; ---- one 16-pixel row per iteration; xmm0 grows by c each row ----
get_i16x16_luma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5              ; b * (-7..0)
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    movdqa      xmm3, xmm1
    pmullw      xmm3, xmm6              ; b * (1..8)
    paddw       xmm3, xmm0
    psraw       xmm3, 5
    packuswb    xmm2, xmm3              ; saturate to 0..255
    movdqa      [r0], xmm2
    paddw       xmm0, xmm4
    add         r0, 16
    inc         r3
    cmp         r3, 16
    jnz         get_i16x16_luma_pred_plane_sse2_1
    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    ret

;***********************************************************************
;   void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;
;   8x8 chroma plane prediction (same scheme as the 16x16 version):
;       H = sum_{i=0..3} (i+1) * (top[4+i]  - top[2-i])
;       V = sum_{i=0..3} (i+1) * (left[4+i] - left[2-i])
;       b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;       a = (left[7] + top[7]) << 4
;       pred[y][x] = clip8((a + 16 + b*(x-3) + c*(y-3)) >> 5)
;   Output is 8 rows of 8 bytes written back to back at pred.
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    sub         r1, 1
    sub         r1, r2                  ; r1 -> top-left corner pixel

    ; ---- H ----
    pxor        mm7, mm7
    movq        mm0, [r1]               ; low 4 bytes: top[-1..2]
    movq        mm5, [pic(sse2_plane_dec_c)]
    punpcklbw   mm0, mm7
    pmullw      mm0, mm5                ; * 4,3,2,1
    movq        mm1, [r1 + 5]           ; low 4 bytes: top[4..7]
    movq        mm6, [pic(sse2_plane_inc_c)]
    punpcklbw   mm1, mm7
    pmullw      mm1, mm6                ; * 1,2,3,4
    psubw       mm1, mm0

    movq2dq     xmm1, mm1
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm1, xmm0, xmm2
    movd        r3d, xmm1
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                   ; b = (17 * H + 16) >> 5;
    SSE2_Copy8Times xmm1, r3d           ; xmm1 = b,b,b,b,b,b,b,b

    ; ---- V ----
    movzx       r3, BYTE [r1+8]         ; top[7]
    sub         r1, 3
    LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 ; left[-1..2]

    add         r1, 3
    movzx       r4, BYTE [r1+4*r2]      ; left[7]
    add         r4, r3
    shl         r4, 4                   ; a = (left[7*stride] + top[7]) << 4;

    sub         r1, 3
    add         r1, r2
    LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 ; left[4..7]
    pxor        mm4, mm4
    punpckhbw   mm0, mm4
    pmullw      mm0, mm5
    punpckhbw   mm7, mm4
    pmullw      mm7, mm6
    psubw       mm7, mm0

    movq2dq     xmm7, mm7
    pxor        xmm2, xmm2
    SUMW_HORIZON xmm7, xmm0, xmm2
    movd        r3d, xmm7               ; V
    movsx       r3, r3w
    imul        r3, 17
    add         r3, 16
    sar         r3, 5                   ; c = (17 * V + 16) >> 5;
    SSE2_Copy8Times xmm4, r3d           ; xmm4 = c,c,c,c,c,c,c,c

    add         r4, 16
    imul        r3, -3
    add         r3, r4                  ; s = a + 16 + (-3)*c
    SSE2_Copy8Times xmm0, r3d           ; xmm0 = s,s,s,s,s,s,s,s

    xor         r3, r3                  ; row counter
    movdqa      xmm5, [pic(sse2_plane_mul_b_c)]

    ; ---- one 8-pixel row per iteration; xmm0 grows by c each row ----
get_i_chroma_pred_plane_sse2_1:
    movdqa      xmm2, xmm1
    pmullw      xmm2, xmm5              ; b * (-3..4)
    paddw       xmm2, xmm0
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r0], xmm2
    paddw       xmm0, xmm4
    add         r0, 8
    inc         r3
    cmp         r3, 8
    jnz         get_i_chroma_pred_plane_sse2_1
    POP_XMM
    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret

;***********************************************************************
;   0 |1 |2 |3 |4 |
;   6 |7 |8 |9 |10|
;   11|12|13|14|15|
;   16|17|18|19|20|
;   21|22|23|24|25|
;   7 is the start pixel of current 4x4 block
;   pred[7] = ([6]+[0]*2+[1]+2)/4
;
;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Diagonal-down-right 4x4 prediction. In the register comments below
;   lt=[0], t0..t3=[1..4], l0=[6], l1=[11], l2=[16], l3=[21]; registers
;   are shown most-significant byte first. The left-neighbour loads read
;   the 8 bytes that end at the neighbour pixel.
;   Output: four 4-byte rows at pred+0, +4, +8, +12.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    movq        mm1, [r1+r2-8]          ; byte 7 = l1
    movq        mm2, [r1-8]             ; byte 7 = l0
    sub         r1, r2                  ; r1 -> row above the block (t0)
    punpckhbw   mm2, [r1-8]             ; mm2 = [lt l0 xx xx xx xx xx xx]
    movd        mm3, [r1]               ; mm3 = [00 00 00 00 t3 t2 t1 t0]
    punpckhwd   mm1, mm2                ; mm1 = [lt l0 l1 xx xx xx xx xx]
    psllq       mm3, 18h                ; mm3 = [00 t3 t2 t1 t0 00 00 00]
    psrlq       mm1, 28h                ; mm1 = [00 00 00 00 00 lt l0 l1]
    por         mm3, mm1                ; mm3 = [00 t3 t2 t1 t0 lt l0 l1]
    movq        mm1, mm3                ; keep as first outer tap
    lea         r1, [r1+r2*2-8h]        ; r1 -> 8 bytes left of row 1
    movq        mm4, [r1+r2]            ; byte 7 = l2
    psllq       mm3, 8
    psrlq       mm4, 38h                ; l2 in byte 0
    por         mm3, mm4                ; mm3 = [t3 t2 t1 t0 lt l0 l1 l2]
    movq        mm2, mm3                ; keep as centre tap
    movq        mm4, [r1+r2*2]          ; byte 7 = l3
    psllq       mm3, 8
    psrlq       mm4, 38h                ; l3 in byte 0
    por         mm3, mm4                ; mm3 = [t2 t1 t0 lt l0 l1 l2 l3]
    movq        mm4, mm3                ; second outer tap
    pavgb       mm3, mm1                ; (outer1 + outer2 + 1) >> 1
    pxor        mm1, mm4                ; LSB set where outer1+outer2 is odd
    pand        mm1, [pic(mmx_01bytes)]
    psubusb     mm3, mm1                ; undo the round-up: floor average
    pavgb       mm2, mm3                ; (outer1 + 2*centre + outer2 + 2) >> 2

    movd        [r0+12], mm2            ; row 3
    psrlq       mm2, 8
    movd        [r0+8], mm2             ; row 2
    psrlq       mm2, 8
    movd        [r0+4], mm2             ; row 1
    psrlq       mm2, 8
    movd        [r0], mm2               ; row 0
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;***********************************************************************
;   0 |1 |2 |3 |4 |
;   5 |6 |7 |8 |9 |
;   10|11|12|13|14|
;   15|16|17|18|19|
;   20|21|22|23|24|
;   6 is the start pixel of current 4x4 block
;   pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   DC 4x4 prediction: the rounded mean of the four top and four left
;   neighbours is replicated into all 16 output bytes. pred is written
;   with movdqa and must be 16-byte aligned.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    movzx       r4, byte [r1-1h]        ; left of row 0
    sub         r1, r2                  ; r1 -> row above the block
    movd        xmm0, [r1]
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1              ; sum of the four top pixels
    xor         r3, r3
    movd        r3d, xmm0
    add         r3, r4
    movzx       r4, byte [r1+r2*2-1h]   ; left of row 1
    add         r3, r4

    lea         r1, [r1+r2*2-1]         ; r1 -> left neighbour of row 1
    movzx       r4, byte [r1+r2]        ; left of row 2
    add         r3, r4

    movzx       r4, byte [r1+r2*2]      ; left of row 3
    add         r3, r4
    add         r3, 4
    sar         r3, 3                   ; (sum + 4) >> 3
    imul        r3, 0x01010101          ; replicate the byte across a dword

    movd        xmm0, r3d
    pshufd      xmm0, xmm0, 0
    movdqa      [r0], xmm0
    pop         r4
    pop         r3
    ret

;***********************************************************************
;   void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
;   copy 8 pixel of 8 line from left
;***********************************************************************

; MMX_PRED_H_8X8_ONE_LINE tmp, (unused), src, dst
;   Replicates the byte at [src-1] eight times and stores it at [dst].
;   The load reads the 8 bytes ending at that pixel. %2 is never referenced.
%macro MMX_PRED_H_8X8_ONE_LINE 4
    movq        %1, [%3-8]
    psrlq       %1, 38h                 ; keep only the top byte

    ;pmuludq    %1, [mmx_01bytes]       ;extend to 4 bytes
    pmullw      %1, [pic(mmx_01bytes)]  ; * 0x0101: byte doubled in word 0
    pshufw      %1, %1, 0               ; broadcast word 0
    movq        [%4], %1
%endmacro

; MMX_PRED_H_8X8_ONE_LINEE tmp, (unused), src, dst
;   Same as above for the following row: uses the byte at [src+r2-1].
%macro MMX_PRED_H_8X8_ONE_LINEE 4
    movq        %1, [%3+r2-8]
    psrlq       %1, 38h

    ;pmuludq    %1, [mmx_01bytes]       ;extend to 4 bytes
    pmullw      %1, [pic(mmx_01bytes)]
    pshufw      %1, %1, 0
    movq        [%4], %1
%endmacro

; Rows are written back to back, 8 bytes apart, at pred+0 .. pred+56.
WELS_EXTERN WelsIChromaPredH_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d

    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r1, r0       ; row 0
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0+8     ; row 1

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r1, r0+16    ; row 2
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0+24    ; row 3

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r1, r0+32    ; row 4
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0+40    ; row 5

    lea         r1, [r1+r2*2]
    MMX_PRED_H_8X8_ONE_LINE  mm0, mm1, r1, r0+48    ; row 6
    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1, r0+56    ; row 7
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;***********************************************************************
;   void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;   copy pixels from top 4 pixels
;***********************************************************************
; The four bytes above the block are replicated into all four rows and
; stored as one 16-byte block (movdqa: pred must be 16-byte aligned).
WELS_EXTERN WelsI4x4LumaPredV_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movd        xmm0, [r1]
    pshufd      xmm0, xmm0, 0           ; same dword in every row
    movdqa      [r0], xmm0
    ret

;***********************************************************************
;   void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;   copy 8 pixels from top 8 pixels
;
;   Each 16-byte store covers two 8-byte output rows; pred must be
;   16-byte aligned (movdqa).
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        xmm0, [r1]
    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm1              ; top row duplicated in both halves
    movdqa      [r0], xmm0              ; rows 0,1
    movdqa      [r0+16], xmm0           ; rows 2,3
    movdqa      [r0+32], xmm0           ; rows 4,5
    movdqa      [r0+48], xmm0           ; rows 6,7
    ret

;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   t3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |a |b |
;   |g |h |e |f |
;   |i |j |g |h |

;   a = (1 + lt + l0)>>1
;   e = (1 + l0 + l1)>>1
;   g = (1 + l1 + l2)>>1
;   i = (1 + l2 + l3)>>1

;   d = (2 + t0 + (t1<<1) + t2)>>2
;   c = (2 + lt + (t0<<1) + t1)>>2
;   b = (2 + l0 + (lt<<1) + t0)>>2

;   f = (2 + l1 + (l0<<1) + lt)>>2
;   h = (2 + l2 + (l1<<1) + l0)>>2
;   j = (2 + l3 + (l2<<1) + l1)>>2
;   [b a f e h g j i] + [d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Horizontal-down 4x4 prediction. Register comments list bytes
;   most-significant first. Output: four 4-byte rows at pred+0..+12.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movd        mm0, [r1-1]             ; mm0 = [xx xx xx xx t2 t1 t0 lt]
    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]

    ; gather the left column into the low dword
    movd        mm1, [r1+2*r2-4]
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movd        mm2, [r1+2*r2-4]
    punpcklbw   mm2, [r1+r2-4]          ; mm2[7] = l2, mm2[6] = l3
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
    psrlq       mm2, 20h
    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]

    movq        mm1, mm0
    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
    movq        mm2, mm0
    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
    movq        mm3, mm2
    movq        mm4, mm1
    pavgb       mm1, mm0                ; outer taps, rounded up

    pxor        mm4, mm0                ; LSB set where the outer sum is odd
    pand        mm4, [pic(mmx_01bytes)]
    psubusb     mm1, mm4                ; floor average of the outer taps

    pavgb       mm2, mm1                ; mm2 = [xx xx d c b f h j]

    movq        mm4, mm0
    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a e g i]
    punpcklbw   mm3, mm2                ; mm3 = [b a f e h g j i]

    psrlq       mm2, 20h
    psllq       mm2, 30h                ; mm2 = [d c 0 0 0 0 0 0]
    movq        mm4, mm3
    psrlq       mm4, 10h                ; mm4 = [0 0 b a f e h g]
    pxor        mm2, mm4                ; mm2 = [d c b a xx xx xx xx]
    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx d c b a]

    movd        [r0], mm2               ; row 0 = a b c d
    movd        [r0+12], mm3            ; row 3 = i j g h
    psrlq       mm3, 10h
    movd        [r0+8], mm3             ; row 2 = g h e f
    psrlq       mm3, 10h
    movd        [r0+4], mm3             ; row 1 = e f a b
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   only l0..l3 are read; lt and t0..t3 are never used
;   destination:
;   |a |b |c |d |
;   |c |d |e |f |
;   |e |f |g |g |
;   |g |g |g |g |

;   a = (1 + l0 + l1)>>1
;   c = (1 + l1 + l2)>>1
;   e = (1 + l2 + l3)>>1
;   g = l3

;   b = (2 + l0 + (l1<<1) + l2)>>2
;   d = (2 + l1 + (l2<<1) + l3)>>2
;   f = (2 + l2 + (l3<<1) + l3)>>2

;   [g g f e d c b a] + [g g g g] --> mov to memory
;
;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Horizontal-up 4x4 prediction. Register comments list bytes
;   most-significant first. Output: four 4-byte rows at pred+0..+12.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    movd        mm0, [r1-4]             ; mm0[3] = l0
    punpcklbw   mm0, [r1+r2-4]          ; mm0[7] = l1, mm0[6] = l0
    lea         r1, [r1+2*r2]
    movd        mm2, [r1-4]             ; mm2[3] = l2
    movd        mm4, [r1+r2-4]          ; mm4[3] = l3
    punpcklbw   mm2, mm4
    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]

    psrlq       mm4, 18h
    psllq       mm4, 38h                ; mm4 = [l3 00 00 00 00 00 00 00]
    psrlq       mm0, 8h
    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]

    movq        mm1, mm0
    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
    pavgb       mm1, mm0                ; mm1 = [g e c a xx xx xx xx]

    movq        mm2, mm0
    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
    movq        mm5, mm2
    pavgb       mm2, mm0                ; outer taps, rounded up

    pxor        mm5, mm0                ; LSB set where the outer sum is odd
    pand        mm5, [pic(mmx_01bytes)]
    psubusb     mm2, mm5                ; floor average of the outer taps

    pavgb       mm2, mm3                ; mm2 = [f d b xx xx xx xx xx]

    psrlq       mm2, 8h
    pxor        mm2, mm4                ; mm2 = [g f d b xx xx xx xx]

    punpckhbw   mm1, mm2                ; mm1 = [g g f e d c b a]
    punpckhbw   mm4, mm4                ; mm4 = [g g xx xx xx xx xx xx]
    punpckhbw   mm4, mm4                ; mm4 = [g g g g xx xx xx xx]

    psrlq       mm4, 20h
    movd        [r0+12], mm4            ; row 3 = g g g g

    movd        [r0], mm1               ; row 0 = a b c d
    psrlq       mm1, 10h
    movd        [r0+4], mm1             ; row 1 = c d e f
    psrlq       mm1, 10h
    movd        [r0+8], mm1             ; row 2 = e f g g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret



;***********************************************************************
;   lt|t0|t1|t2|t3|
;   l0|
;   l1|
;   l2|
;   l3|
;   l3 is never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |i |a |b |c |
;   |j |e |f |g |

;   a = (1 + lt + t0)>>1
;   b = (1 + t0 + t1)>>1
;   c = (1 + t1 + t2)>>1
;   d = (1 + t2 + t3)>>1

;   e = (2 + l0 + (lt<<1) + t0)>>2
;   f = (2 + lt + (t0<<1) + t1)>>2
;   g = (2 + t0 + (t1<<1) + t2)>>2

;   h = (2 + t1 + (t2<<1) + t3)>>2
;   i = (2 + lt + (l0<<1) + l1)>>2
;   j = (2 + l0 + (l1<<1) + l2)>>2
;
;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
; Vertical-right 4x4 prediction. Register comments list bytes
; most-significant first. Output: four 4-byte rows at pred+0..+12.
WELS_EXTERN WelsI4x4LumaPredVR_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1-1]             ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt 00 00 00]

    ; gather l0..l2 into the three low bytes
    movd        mm1, [r1+2*r2-4]
    punpcklbw   mm1, [r1+r2-4]          ; mm1[7] = l0, mm1[6] = l1
    lea         r1, [r1+2*r2]
    movq        mm2, [r1+r2-8]          ; mm2[7] = l2
    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
    psrlq       mm2, 28h
    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]

    movq        mm1, mm0
    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm1, mm0                ; mm1 = [d c b a xx xx xx xx]

    movq        mm2, mm0
    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
    movq        mm3, mm2
    pavgb       mm2, mm0                ; outer taps, rounded up

    pxor        mm3, mm0                ; LSB set where the outer sum is odd
    pand        mm3, [pic(mmx_01bytes)]
    psubusb     mm2, mm3                ; floor average of the outer taps

    movq        mm3, mm0
    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
    pavgb       mm3, mm2                ; mm3 = [h g f e i j xx xx]
    movq        mm2, mm3

    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d c b a]
    movd        [r0], mm1               ; row 0 = a b c d

    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h g f e]
    movd        [r0+4], mm2             ; row 1 = e f g h

    movq        mm4, mm3
    psllq       mm4, 20h
    psrlq       mm4, 38h                ; mm4 = [00 00 00 00 00 00 00 i]

    movq        mm5, mm3
    psllq       mm5, 28h
    psrlq       mm5, 38h                ; mm5 = [00 00 00 00 00 00 00 j]

    psllq       mm1, 8h
    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c b a i]
    movd        [r0+8], mm4             ; row 2 = i a b c

    psllq       mm2, 8h
    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g f e j]
    movd        [r0+12], mm5            ; row 3 = j e f g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   lt and l0..l3 are never used
;   destination:
;   |a |b |c |d |
;   |b |c |d |e |
;   |c |d |e |f |
;   |d |e |f |g |

;   a =
;       (2 + t0 + t2 + (t1<<1))>>2
;   b = (2 + t1 + t3 + (t2<<1))>>2
;   c = (2 + t2 + t4 + (t3<<1))>>2
;   d = (2 + t3 + t5 + (t4<<1))>>2

;   e = (2 + t4 + t6 + (t5<<1))>>2
;   f = (2 + t5 + t7 + (t6<<1))>>2
;   g = (2 + t6 + t7 + (t7<<1))>>2

;   [g f e d c b a] --> mov to memory
;
;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Diagonal-down-left 4x4 prediction from the eight pixels above the
;   block. Register comments list bytes most-significant first.
;   Output: four 4-byte rows at pred+0..+12.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    movq        mm3, mm0
    psrlq       mm3, 38h
    psllq       mm3, 38h                ; mm3 = [t7 00 00 00 00 00 00 00]

    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 00]
    psrlq       mm2, 8h
    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]

    movq        mm3, mm1
    pavgb       mm1, mm2                ; outer taps, rounded up
    pxor        mm3, mm2                ; LSB set where the outer sum is odd
    pand        mm3, [pic(mmx_01bytes)]
    psubusb     mm1, mm3                ; floor average of the outer taps

    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]

    psrlq       mm0, 8h
    movd        [r0], mm0               ; row 0 = a b c d
    psrlq       mm0, 8h
    movd        [r0+4], mm0             ; row 1 = b c d e
    psrlq       mm0, 8h
    movd        [r0+8], mm0             ; row 2 = c d e f
    psrlq       mm0, 8h
    movd        [r0+12], mm0            ; row 3 = d e f g
    DEINIT_X86_32_PIC
    WELSEMMS
    ret


;***********************************************************************
;   lt|t0|t1|t2|t3|t4|t5|t6|t7
;   l0|
;   l1|
;   l2|
;   l3|
;   lt, l0..l3 and t7 are never used
;   destination:
;   |a |b |c |d |
;   |e |f |g |h |
;   |b |c |d |i |
;   |f |g |h |j |

;   a = (1 + t0 + t1)>>1
;   b = (1 + t1 + t2)>>1
;   c = (1 + t2 + t3)>>1
;   d = (1 + t3 + t4)>>1
;   i = (1 + t4 + t5)>>1

;   e = (2 + t0 + (t1<<1) + t2)>>2
;   f = (2 + t1 + (t2<<1) + t3)>>2
;   g = (2 + t2 + (t3<<1) + t4)>>2
;   h = (2 + t3 + (t4<<1) + t5)>>2
;   j = (2 + t4 + (t5<<1) + t6)>>2

;   [i d c b a] + [j h g f e]
;       --> mov to memory
;
;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;
;   Vertical-left 4x4 prediction from the pixels above the block.
;   Register comments list bytes most-significant first.
;   Output: four 4-byte rows at pred+0..+12.
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1]               ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
    movq        mm1, mm0
    movq        mm2, mm0

    psrlq       mm1, 8h                 ; mm1 = [00 t7 t6 t5 t4 t3 t2 t1]
    psrlq       mm2, 10h                ; mm2 = [00 00 t7 t6 t5 t4 t3 t2]

    movq        mm3, mm1
    pavgb       mm3, mm0                ; mm3 = [xx xx xx i d c b a]

    movq        mm4, mm2
    pavgb       mm2, mm0                ; outer taps, rounded up
    pxor        mm4, mm0                ; LSB set where the outer sum is odd
    pand        mm4, [pic(mmx_01bytes)]
    psubusb     mm2, mm4                ; floor average of the outer taps

    pavgb       mm2, mm1                ; mm2 = [xx xx xx j h g f e]

    movd        [r0], mm3               ; row 0 = a b c d
    psrlq       mm3, 8h
    movd        [r0+8], mm3             ; row 2 = b c d i

    movd        [r0+4], mm2             ; row 1 = e f g h
    psrlq       mm2, 8h
    movd        [r0+12], mm2            ; row 3 = f g h j
    DEINIT_X86_32_PIC
    WELSEMMS
    ret

;***********************************************************************
;
;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   8x8 chroma DC prediction, one DC value per 4x4 quadrant:
;       top-left     = (top[0..3] + left[0..3] + 4) >> 3
;       top-right    = (top[4..7] + 2) >> 2
;       bottom-left  = (left[4..7] + 2) >> 2
;       bottom-right = (top[4..7] + left[4..7] + 4) >> 3
;   Output is 8 rows of 8 bytes written back to back at pred.
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movq        mm0, [r1]               ; the eight top pixels

    ; sum of the left neighbours of rows 0..3
    movzx       r3, byte [r1+r2-0x01]   ; l1
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]      ; l2
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]   ; l3
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]      ; l4
    add         r3, r4
    movd        mm1, r3d                ; mm1 = l1+l2+l3+l4

    ; sum of the left neighbours of rows 4..7
    movzx       r3, byte [r1+r2-0x01]   ; l5
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]      ; l6
    add         r3, r4
    movzx       r4, byte [r1+r2-0x01]   ; l7
    add         r3, r4
    lea         r1, [r1+2*r2]
    movzx       r4, byte [r1-0x01]      ; l8
    add         r3, r4
    movd        mm2, r3d                ; mm2 = l5+l6+l7+l8

    ; split the top row into its two 4-pixel halves and sum each
    movq        mm3, mm0
    psrlq       mm0, 0x20               ; top[4..7]
    psllq       mm3, 0x20
    psrlq       mm3, 0x20               ; top[0..3]
    pxor        mm4, mm4
    psadbw      mm0, mm4
    psadbw      mm3, mm4                ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2

    paddq       mm3, mm1
    movq        mm1, mm2
    paddq       mm1, mm0                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1

    movq        mm4, [pic(mmx_0x02)]

    paddq       mm0, mm4
    psrlq       mm0, 0x02               ; (sum2 + 2) >> 2

    paddq       mm2, mm4
    psrlq       mm2, 0x02               ; (sum3 + 2) >> 2

    paddq       mm3, mm4
    paddq       mm3, mm4
    psrlq       mm3, 0x03               ; (sum1 + 4) >> 3

    paddq       mm1, mm4
    paddq       mm1, mm4
    psrlq       mm1, 0x03               ; (sum4 + 4) >> 3

    ; replicate each DC byte across a dword and pair left|right halves
    pmuludq     mm0, [pic(mmx_01bytes)]
    pmuludq     mm3, [pic(mmx_01bytes)]
    psllq       mm0, 0x20
    pxor        mm0, mm3                ; mm0 = m_up

    pmuludq     mm2, [pic(mmx_01bytes)]
    pmuludq     mm1, [pic(mmx_01bytes)]
    psllq       mm1, 0x20
    pxor        mm1, mm2                ; mm1 = m_down

    movq        [r0], mm0
    movq        [r0+0x08], mm0
    movq        [r0+0x10], mm0
    movq        [r0+0x18], mm0

    movq        [r0+0x20], mm1
    movq        [r0+0x28], mm1
    movq        [r0+0x30], mm1
    movq        [r0+0x38], mm1

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    WELSEMMS
    ret



;***********************************************************************
;
;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;
;   16x16 DC prediction: (sum of 16 top + 16 left neighbours + 16) >> 5,
;   replicated into 256 contiguous bytes at pred. Both the top-row load
;   at pRef - stride and the stores to pred use movdqa and therefore need
;   16-byte aligned addresses.
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
    push        r3
    push        r4
    %assign push_num 2
    INIT_X86_32_PIC r5
    LOAD_3_PARA
    SIGN_EXTENSION r2, r2d
    sub         r1, r2                  ; r1 -> row above the block
    movdqa      xmm0, [r1]              ; read one row
    pxor        xmm1, xmm1
    psadbw      xmm0, xmm1              ; two partial sums (low/high qword)
    movdqa      xmm1, xmm0
    psrldq      xmm1, 0x08
    pslldq      xmm0, 0x08
    psrldq      xmm0, 0x08
    paddw       xmm0, xmm1              ; xmm0 = sum of the 16 top pixels

    ; left column: rows 0 and 1 here, rows 2..15 via the macro (2 per call)
    movzx       r3, byte [r1+r2-0x01]
    movzx       r4, byte [r1+2*r2-0x01]
    add         r3, r4
    lea         r1, [r1+r2]
%rep 7
    LOAD_2_LEFT_AND_ADD
%endrep
    add         r3, 0x10                ; rounding term
    movd        xmm1, r3d
    paddw       xmm0, xmm1
    psrld       xmm0, 0x05
    pmuludq     xmm0, [pic(mmx_01bytes)] ; replicate the DC byte across a dword
    pshufd      xmm0, xmm0, 0

    ; sixteen 16-byte rows at pred+0x00 .. pred+0xf0
    movdqa      [r0], xmm0
%assign row_off 0x10
%rep 15
    movdqa      [r0+row_off], xmm0
%assign row_off row_off+0x10
%endrep
%undef row_off

    DEINIT_X86_32_PIC
    pop         r4
    pop         r3
    ret