1;*! 2;* \copy 3;* Copyright (c) 2009-2013, Cisco Systems 4;* All rights reserved. 5;* 6;* Redistribution and use in source and binary forms, with or without 7;* modification, are permitted provided that the following conditions 8;* are met: 9;* 10;* * Redistributions of source code must retain the above copyright 11;* notice, this list of conditions and the following disclaimer. 12;* 13;* * Redistributions in binary form must reproduce the above copyright 14;* notice, this list of conditions and the following disclaimer in 15;* the documentation and/or other materials provided with the 16;* distribution. 17;* 18;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29;* POSSIBILITY OF SUCH DAMAGE. 30;* 31;*************************************************************************/ 32%include "asm_inc.asm" 33 34;*********************************************************************** 35; Local Data (Read Only) 36;*********************************************************************** 37%ifdef X86_32_PICASM 38SECTION .text align=16 39%else 40SECTION .rodata align=16 41%endif 42 43ALIGN 16 44mv_x_inc_x4 dw 0x10, 0x10, 0x10, 0x10 45mv_y_inc_x4 dw 0x04, 0x04, 0x04, 0x04 46mx_x_offset_x4 dw 0x00, 0x04, 0x08, 0x0C 47 48SECTION .text 49%ifdef X86_32 50;********************************************************************************************************************** 51;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 52; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 53;********************************************************************************************************************* 54WELS_EXTERN SumOf8x8BlockOfFrame_sse2 55%define pushsize 16 56%define localsize 4 57%define ref esp + pushsize + localsize + 4 58%define sum_ref esp + pushsize + localsize + 20 59%define times_of_sum esp + pushsize + localsize + 24 60%define width esp + pushsize + localsize + 8 61%define height esp + pushsize + localsize + 12 62%define linesize esp + pushsize + localsize + 16 63%define tmp_width esp + 0 64 push ebx 65 push ebp 66 push esi 67 push edi 68 sub esp, localsize 69 70 pxor xmm0, xmm0 71 mov esi, [ref] 72 mov edi, [sum_ref] 73 mov edx, [times_of_sum] 74 mov ebx, [linesize] 75 mov eax, [width] 76 lea ecx, [ebx+ebx*2] ; 3*linesize 77 78 mov [tmp_width], eax 79 lea ebp, [esi+ebx*4] 80FIRST_ROW: 81 movq xmm1, [esi] 82 movq xmm2, [esi+ebx] 83 movq xmm3, [esi+ebx*2] 84 movq xmm4, [esi+ecx] 85 86 shufps xmm1, xmm2, 01000100b 87 shufps xmm3, xmm4, 01000100b 88 psadbw xmm1, xmm0 89 psadbw xmm3, xmm0 90 paddd xmm1, xmm3 91 92 movq xmm2, [ebp] 93 movq xmm3, [ebp+ebx] 94 movq xmm4, [ebp+ebx*2] 95 movq xmm5, [ebp+ecx] 96 97 shufps xmm2, xmm3, 01000100b 98 shufps xmm4, xmm5, 01000100b 99 psadbw xmm2, xmm0 100 psadbw xmm4, xmm0 101 paddd xmm2, xmm4 102 103 paddd xmm1, xmm2 104 pshufd xmm2, xmm1, 00001110b 105 paddd xmm1, xmm2 106 movd eax, xmm1 107 mov [edi], ax 108 inc dword [edx+eax*4] 109 110 inc esi 111 inc ebp 112 add edi, 2 113 114 dec dword [tmp_width] 115 jg FIRST_ROW 116 117 mov esi, [ref] 118 mov edi, [sum_ref] 119 mov ebp, [width] 120 dec dword [height] 121HEIGHT_LOOP: 122 mov [tmp_width], ebp 123WIDTH_LOOP: 124 movq xmm1, [esi+ebx*8] 125 movq xmm2, [esi] 126 psadbw xmm1, xmm0 127 psadbw xmm2, xmm0 128 psubd xmm1, xmm2 129 movd eax, xmm1 130 mov cx, [edi] 131 add eax, ecx 132 133 mov [edi+ebp*2], ax 134 inc dword [edx+eax*4] 135 136 inc esi 137 add edi, 2 138 139 dec dword [tmp_width] 140 jg WIDTH_LOOP 141 142 add esi, ebx 143 sub esi, ebp 144 145 dec dword [height] 146 jg HEIGHT_LOOP 147 148 add esp, localsize 149 pop edi 150 pop esi 151 pop ebp 152 pop ebx 153%undef pushsize 154%undef localsize 155%undef ref 156%undef sum_ref 157%undef times_of_sum 158%undef width 159%undef height 160%undef linesize 161%undef tmp_width 162 ret 163 164 165%macro COUNT_SUM 3 166%define xmm_reg %1 167%define tmp_reg %2 168 movd tmp_reg, xmm_reg 169 inc dword [edx+tmp_reg*4] 170%if %3 == 1 171 psrldq xmm_reg, 4 172%endif 173%endmacro 174 175 176;----------------------------------------------------------------------------- 177; requires: width % 8 == 0 && height > 1 178;----------------------------------------------------------------------------- 179;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 180; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 181;----------------------------------------------------------------------------- 182; read extra (16 - (width % 8) ) mod 16 bytes of every line 183; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref 184WELS_EXTERN SumOf8x8BlockOfFrame_sse4 185%define pushsize 16 186%define localsize 4 187%define ref esp + pushsize + localsize + 4 188%define sum_ref esp + pushsize + localsize + 20 189%define times_of_sum esp + pushsize + localsize + 24 190%define width esp + pushsize + localsize + 8 191%define height esp + pushsize + localsize + 12 192%define linesize esp + pushsize + localsize + 16 193%define tmp_width esp + 0 194 push ebx 195 push ebp 196 push esi 197 push edi 198 sub esp, localsize 199 200 pxor xmm0, xmm0 201 mov esi, [ref] 202 mov edi, [sum_ref] 203 mov edx, [times_of_sum] 204 mov ebx, [linesize] 205 mov eax, [width] 206 lea ecx, [ebx+ebx*2] ; 3*linesize 207 208 mov [tmp_width], eax 209 lea ebp, [esi+ebx*4] 210FIRST_ROW_SSE4: 211 movdqu xmm1, [esi] 212 movdqu xmm3, [esi+ebx] 213 movdqu xmm5, [esi+ebx*2] 214 movdqu xmm7, [esi+ecx] 215 216 movdqa xmm2, xmm1 217 mpsadbw xmm1, xmm0, 000b 218 mpsadbw xmm2, xmm0, 100b 219 paddw xmm1, xmm2 ; 8 sums of line1 220 221 movdqa xmm4, xmm3 222 mpsadbw xmm3, xmm0, 000b 223 mpsadbw xmm4, xmm0, 100b 224 paddw xmm3, xmm4 ; 8 sums of line2 225 226 movdqa xmm2, xmm5 227 mpsadbw xmm5, xmm0, 000b 228 mpsadbw xmm2, xmm0, 100b 229 paddw xmm5, xmm2 ; 8 sums of line3 230 231 movdqa xmm4, xmm7 232 mpsadbw xmm7, xmm0, 000b 233 mpsadbw xmm4, xmm0, 100b 234 paddw xmm7, xmm4 ; 8 sums of line4 235 236 paddw xmm1, xmm3 237 paddw xmm5, xmm7 238 paddw xmm1, xmm5 ; sum the upper 4 lines first 239 240 movdqu xmm2, [ebp] 241 movdqu xmm3, [ebp+ebx] 242 movdqu xmm4, [ebp+ebx*2] 243 movdqu xmm5, [ebp+ecx] 244 245 movdqa xmm6, xmm2 246 mpsadbw xmm2, xmm0, 000b 247 mpsadbw xmm6, xmm0, 100b 248 paddw xmm2, xmm6 249 250 movdqa xmm7, xmm3 251 mpsadbw xmm3, xmm0, 000b 252 mpsadbw xmm7, xmm0, 100b 253 paddw xmm3, xmm7 254 255 movdqa xmm6, xmm4 256 mpsadbw xmm4, xmm0, 000b 257 mpsadbw xmm6, xmm0, 100b 258 paddw xmm4, xmm6 259 260 movdqa xmm7, xmm5 261 mpsadbw xmm5, xmm0, 000b 262 mpsadbw xmm7, xmm0, 100b 263 paddw xmm5, xmm7 264 265 paddw xmm2, xmm3 266 paddw xmm4, xmm5 267 paddw xmm1, xmm2 268 paddw xmm1, xmm4 ; sum of lines 1- 8 269 270 movdqu [edi], xmm1 271 272 movdqa xmm2, xmm1 273 punpcklwd xmm1, xmm0 274 punpckhwd xmm2, xmm0 275 276 COUNT_SUM xmm1, eax, 1 277 COUNT_SUM xmm1, eax, 1 278 COUNT_SUM xmm1, eax, 1 279 COUNT_SUM xmm1, eax, 0 280 COUNT_SUM xmm2, eax, 1 281 COUNT_SUM xmm2, eax, 1 282 COUNT_SUM xmm2, eax, 1 283 COUNT_SUM xmm2, eax, 0 284 285 lea esi, [esi+8] 286 lea ebp, [ebp+8] 287 lea edi, [edi+16] ; element size is 2 288 289 sub dword [tmp_width], 8 290 jg near FIRST_ROW_SSE4 291 292 mov esi, [ref] 293 mov edi, [sum_ref] 294 mov ebp, [width] 295 dec dword [height] 296HEIGHT_LOOP_SSE4: 297 mov ecx, ebp 298WIDTH_LOOP_SSE4: 299 movdqu xmm1, [esi+ebx*8] 300 movdqu xmm2, [esi] 301 movdqu xmm7, [edi] 302 303 movdqa xmm3, xmm1 304 mpsadbw xmm1, xmm0, 000b 305 mpsadbw xmm3, xmm0, 100b 306 paddw xmm1, xmm3 307 308 movdqa xmm4, xmm2 309 mpsadbw xmm2, xmm0, 000b 310 mpsadbw xmm4, xmm0, 100b 311 paddw xmm2, xmm4 312 313 paddw xmm7, xmm1 314 psubw xmm7, xmm2 315 movdqu [edi+ebp*2], xmm7 316 317 movdqa xmm6, xmm7 318 punpcklwd xmm7, xmm0 319 punpckhwd xmm6, xmm0 320 321 COUNT_SUM xmm7, eax, 1 322 COUNT_SUM xmm7, eax, 1 323 COUNT_SUM xmm7, eax, 1 324 COUNT_SUM xmm7, eax, 0 325 COUNT_SUM xmm6, eax, 1 326 COUNT_SUM xmm6, eax, 1 327 COUNT_SUM xmm6, eax, 1 328 COUNT_SUM xmm6, eax, 0 329 330 lea esi, [esi+8] 331 lea edi, [edi+16] 332 333 sub ecx, 8 334 jg near WIDTH_LOOP_SSE4 335 336 lea esi, [esi+ebx] 337 sub esi, ebp 338 339 dec dword [height] 340 jg near HEIGHT_LOOP_SSE4 341 342 add esp, localsize 343 pop edi 344 pop esi 345 pop ebp 346 pop ebx 347%undef pushsize 348%undef localsize 349%undef ref 350%undef sum_ref 351%undef times_of_sum 352%undef width 353%undef height 354%undef linesize 355%undef tmp_width 356 ret 357 358 359;**************************************************************************************************************************************************** 360;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 361; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 362;**************************************************************************************************************************************************** 363WELS_EXTERN SumOf16x16BlockOfFrame_sse2 364%define pushsize 16 365%define localsize 4 366%define ref esp + pushsize + localsize + 4 367%define sum_ref esp + pushsize + localsize + 20 368%define times_of_sum esp + pushsize + localsize + 24 369%define width esp + pushsize + localsize + 8 370%define height esp + pushsize + localsize + 12 371%define linesize esp + pushsize + localsize + 16 372%define tmp_width esp 373 push ebx 374 push ebp 375 push esi 376 push edi 377 sub esp, localsize 378 379 pxor xmm0, xmm0 380 mov esi, [ref] 381 mov edi, [sum_ref] 382 mov edx, [times_of_sum] 383 mov ebx, [linesize] 384 mov eax, [width] 385 386 lea ecx, [ebx+ebx*2] 387 mov [tmp_width], eax 388FIRST_ROW_X16H: 389 movdqu xmm1, [esi] 390 movdqu xmm2, [esi+ebx] 391 movdqu xmm3, [esi+ebx*2] 392 movdqu xmm4, [esi+ecx] 393 394 psadbw xmm1, xmm0 395 psadbw xmm2, xmm0 396 psadbw xmm3, xmm0 397 psadbw xmm4, xmm0 398 paddw xmm1, xmm2 399 paddw xmm3, xmm4 400 paddw xmm1, xmm3 401 402 lea ebp, [esi+ebx*4] 403 movdqu xmm2, [ebp] 404 movdqu xmm3, [ebp+ebx] 405 movdqu xmm4, [ebp+ebx*2] 406 movdqu xmm5, [ebp+ecx] 407 408 psadbw xmm2, xmm0 409 psadbw xmm3, xmm0 410 psadbw xmm4, xmm0 411 psadbw xmm5, xmm0 412 paddw xmm2, xmm3 413 paddw xmm4, xmm5 414 paddw xmm2, xmm4 415 416 paddw xmm1, xmm2 417 418 lea ebp, [ebp+ebx*4] 419 movdqu xmm2, [ebp] 420 movdqu xmm3, [ebp+ebx] 421 movdqu xmm4, [ebp+ebx*2] 422 movdqu xmm5, [ebp+ecx] 423 424 psadbw xmm2, xmm0 425 psadbw xmm3, xmm0 426 psadbw xmm4, xmm0 427 psadbw xmm5, xmm0 428 paddw xmm2, xmm3 429 paddw xmm4, xmm5 430 paddw xmm2, xmm4 431 432 paddw xmm1, xmm2 433 434 lea ebp, [ebp+ebx*4] 435 movdqu xmm2, [ebp] 436 movdqu xmm3, [ebp+ebx] 437 movdqu xmm4, [ebp+ebx*2] 438 movdqu xmm5, [ebp+ecx] 439 440 psadbw xmm2, xmm0 441 psadbw xmm3, xmm0 442 psadbw xmm4, xmm0 443 psadbw xmm5, xmm0 444 paddw xmm2, xmm3 445 paddw xmm4, xmm5 446 paddw xmm2, xmm4 447 448 paddw xmm1, xmm2 449 movdqa xmm2, xmm1 450 punpckhwd xmm2, xmm0 451 paddw xmm1, xmm2 452 movd eax, xmm1 453 mov [edi], ax 454 inc dword [edx+eax*4] 455 456 inc esi 457 lea edi, [edi+2] 458 459 dec dword [tmp_width] 460 jg near FIRST_ROW_X16H 461 462 mov esi, [ref] 463 mov edi, [sum_ref] 464 mov ebp, [width] 465 dec dword [height] 466 467 mov ecx, ebx 468 sal ecx, 4 ; succeeded 16th line 469HEIGHT_LOOP_X16: 470 mov [tmp_width], ebp 471WIDTH_LOOP_X16: 472 movdqu xmm1, [esi+ecx] 473 movdqu xmm2, [esi] 474 psadbw xmm1, xmm0 475 psadbw xmm2, xmm0 476 psubw xmm1, xmm2 477 movdqa xmm2, xmm1 478 punpckhwd xmm2, xmm0 479 paddw xmm1, xmm2 480 movd eax, xmm1 481 add ax, word [edi] 482 mov [edi+ebp*2], ax 483 inc dword [edx+eax*4] 484 485 inc esi 486 add edi, 2 487 488 dec dword [tmp_width] 489 jg near WIDTH_LOOP_X16 490 491 add esi, ebx 492 sub esi, ebp 493 494 dec dword [height] 495 jg near HEIGHT_LOOP_X16 496 497 add esp, localsize 498 pop edi 499 pop esi 500 pop ebp 501 pop ebx 502%undef pushsize 503%undef localsize 504%undef ref 505%undef sum_ref 506%undef times_of_sum 507%undef width 508%undef height 509%undef linesize 510%undef tmp_width 511 ret 512 513; requires: width % 16 == 0 && height > 1 514;----------------------------------------------------------------------------------------------------------------------------- 515;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 516; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 517;----------------------------------------------------------------------------------------------------------------------------- 518; try 8 mv via offset 519%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1 520 movdqu %2, [%1] 521 movdqu %3, [%1+8h] 522 movdqa %4, %2 523 movdqa %5, %3 524 525 mpsadbw %2, xmm0, 0 ; 000 B 526 mpsadbw %4, xmm0, 5 ; 101 B 527 mpsadbw %3, xmm0, 2 ; 010 B 528 mpsadbw %5, xmm0, 7 ; 111 B 529 paddw %2, %4 530 paddw %3, %5 531 paddw %2, %3 ; accumulate cost 532%endmacro ; end of SAD_16x16_LINE_SSE41 533 534WELS_EXTERN SumOf16x16BlockOfFrame_sse4 535%define pushsize 16 536%define localsize 4 537%define ref esp + pushsize + localsize + 4 538%define sum_ref esp + pushsize + localsize + 20 539%define times_of_sum esp + pushsize + localsize + 24 540%define width esp + pushsize + localsize + 8 541%define height esp + pushsize + localsize + 12 542%define linesize esp + pushsize + localsize + 16 543%define tmp_width esp 544 push ebx 545 push ebp 546 push esi 547 push edi 548 sub esp, localsize 549 550 pxor xmm0, xmm0 551 mov esi, [ref] 552 mov edi, [sum_ref] 553 mov edx, [times_of_sum] 554 mov ebx, [linesize] 555 mov eax, [width] 556 557 lea ecx, [ebx+ebx*2] 558 mov [tmp_width], eax 559FIRST_ROW_X16_SSE4: 560 SUM_LINE_X16_SSE41 esi, xmm1, xmm2, xmm3, xmm4 561 SUM_LINE_X16_SSE41 esi+ebx, xmm2, xmm3, xmm4, xmm5 562 SUM_LINE_X16_SSE41 esi+ebx*2, xmm3, xmm4, xmm5, xmm6 563 SUM_LINE_X16_SSE41 esi+ecx, xmm4, xmm5, xmm6, xmm7 564 paddw xmm1, xmm2 565 paddw xmm3, xmm4 566 paddw xmm1, xmm3 567 568 lea ebp, [esi+ebx*4] 569 SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5 570 paddw xmm1, xmm2 571 SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5 572 paddw xmm1, xmm2 573 SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5 574 paddw xmm1, xmm2 575 SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5 576 paddw xmm1, xmm2 577 578 lea ebp, [ebp+ebx*4] 579 SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5 580 paddw xmm1, xmm2 581 SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5 582 paddw xmm1, xmm2 583 SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5 584 paddw xmm1, xmm2 585 SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5 586 paddw xmm1, xmm2 587 588 lea ebp, [ebp+ebx*4] 589 SUM_LINE_X16_SSE41 ebp, xmm2, xmm3, xmm4, xmm5 590 paddw xmm1, xmm2 591 SUM_LINE_X16_SSE41 ebp+ebx, xmm2, xmm3, xmm4, xmm5 592 paddw xmm1, xmm2 593 SUM_LINE_X16_SSE41 ebp+ebx*2, xmm2, xmm3, xmm4, xmm5 594 paddw xmm1, xmm2 595 SUM_LINE_X16_SSE41 ebp+ecx, xmm2, xmm3, xmm4, xmm5 596 paddw xmm1, xmm2 597 598 movdqa [edi], xmm1 599 movdqa xmm2, xmm1 600 punpcklwd xmm1, xmm0 601 punpckhwd xmm2, xmm0 602 603 COUNT_SUM xmm1, eax, 1 604 COUNT_SUM xmm1, eax, 1 605 COUNT_SUM xmm1, eax, 1 606 COUNT_SUM xmm1, eax, 0 607 COUNT_SUM xmm2, eax, 1 608 COUNT_SUM xmm2, eax, 1 609 COUNT_SUM xmm2, eax, 1 610 COUNT_SUM xmm2, eax, 0 611 612 lea esi, [esi+8] 613 lea edi, [edi+16] ; element size is 2 614 615 sub dword [tmp_width], 8 616 jg near FIRST_ROW_X16_SSE4 617 618 mov esi, [ref] 619 mov edi, [sum_ref] 620 mov ebp, [width] 621 dec dword [height] 622 623 mov ecx, ebx 624 sal ecx, 4 ; succeeded 16th line 625 626HEIGHT_LOOP_X16_SSE4: 627 mov [tmp_width], ebp 628WIDTH_LOOP_X16_SSE4: 629 movdqa xmm7, [edi] 630 SUM_LINE_X16_SSE41 esi+ecx, xmm1, xmm2, xmm3, xmm4 631 SUM_LINE_X16_SSE41 esi, xmm2, xmm3, xmm4, xmm5 632 633 paddw xmm7, xmm1 634 psubw xmm7, xmm2 635 movdqa [edi+ebp*2], xmm7 636 637 movdqa xmm6, xmm7 638 punpcklwd xmm7, xmm0 639 punpckhwd xmm6, xmm0 640 641 COUNT_SUM xmm7, eax, 1 642 COUNT_SUM xmm7, eax, 1 643 COUNT_SUM xmm7, eax, 1 644 COUNT_SUM xmm7, eax, 0 645 COUNT_SUM xmm6, eax, 1 646 COUNT_SUM xmm6, eax, 1 647 COUNT_SUM xmm6, eax, 1 648 COUNT_SUM xmm6, eax, 0 649 650 lea esi, [esi+8] 651 lea edi, [edi+16] 652 653 sub dword [tmp_width], 8 654 jg near WIDTH_LOOP_X16_SSE4 655 656 add esi, ebx 657 sub esi, ebp 658 659 dec dword [height] 660 jg near HEIGHT_LOOP_X16_SSE4 661 662 add esp, localsize 663 pop edi 664 pop esi 665 pop ebp 666 pop ebx 667%undef pushsize 668%undef localsize 669%undef ref 670%undef sum_ref 671%undef times_of_sum 672%undef width 673%undef height 674%undef linesize 675%undef tmp_width 676 ret 677 678 679;----------------------------------------------------------------------------------------------------------------------------- 680; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList) 681;----------------------------------------------------------------------------------------------------------------------------- 682WELS_EXTERN FillQpelLocationByFeatureValue_sse2 683 push esi 684 push edi 685 push ebx 686 push ebp 687 688 %define _ps 16 ; push size 689 %define _ls 4 ; local size 690 %define sum_ref esp+_ps+_ls+4 691 %define pos_list esp+_ps+_ls+16 692 %define width esp+_ps+_ls+8 693 %define height esp+_ps+_ls+12 694 %define i_height esp 695 sub esp, _ls 696 697 mov esi, [sum_ref] 698 mov edi, [pos_list] 699 mov ebp, [width] 700 mov ebx, [height] 701 mov [i_height], ebx 702 703 %assign push_num 5 704 INIT_X86_32_PIC_NOPRESERVE ecx 705 movq xmm7, [pic(mv_x_inc_x4)] ; x_qpel inc 706 movq xmm6, [pic(mv_y_inc_x4)] ; y_qpel inc 707 movq xmm5, [pic(mx_x_offset_x4)] ; x_qpel vector 708 DEINIT_X86_32_PIC 709 pxor xmm4, xmm4 710 pxor xmm3, xmm3 ; y_qpel vector 711HASH_HEIGHT_LOOP_SSE2: 712 movdqa xmm2, xmm5 ; x_qpel vector 713 mov ecx, ebp 714HASH_WIDTH_LOOP_SSE2: 715 movq xmm0, [esi] ; load x8 sum 716 punpcklwd xmm0, xmm4 717 movdqa xmm1, xmm2 718 punpcklwd xmm1, xmm3 719%rep 3 720 movd edx, xmm0 721 lea ebx, [edi+edx*4] 722 mov eax, [ebx] 723 movd [eax], xmm1 724 mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation 725 lea eax, [eax+4] 726 mov [ebx], eax 727 psrldq xmm1, 4 728 psrldq xmm0, 4 729%endrep 730 movd edx, xmm0 731 lea ebx, [edi+edx*4] 732 mov eax, [ebx] 733 movd [eax], xmm1 734 mov edx, [eax+4] ; explictly load eax+4 due cache miss from vtune observation 735 lea eax, [eax+4] 736 mov [ebx], eax 737 738 paddw xmm2, xmm7 739 lea esi, [esi+8] 740 sub ecx, 4 741 jnz near HASH_WIDTH_LOOP_SSE2 742 paddw xmm3, xmm6 743 dec dword [i_height] 744 jnz near HASH_HEIGHT_LOOP_SSE2 745 746 add esp, _ls 747 %undef _ps 748 %undef _ls 749 %undef sum_ref 750 %undef pos_list 751 %undef width 752 %undef height 753 %undef i_height 754 pop ebp 755 pop ebx 756 pop edi 757 pop esi 758 ret 759 760;--------------------------------------------------------------------------------------------------------------------------------------------------- 761; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, 762; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList ) 763;--------------------------------------------------------------------------------------------------------------------------------------------------- 764WELS_EXTERN InitializeHashforFeature_sse2 765 push ebx 766 push esi 767 push edi 768 push ebp 769 %define _ps 16 ; push size 770 mov edi, [esp+_ps+16] ; pPositionOfSum 771 mov ebp, [esp+_ps+20] ; sum_idx_list 772 mov esi, [esp+_ps+4] ; pTimesOfSum 773 mov ebx, [esp+_ps+8] ; pBuf 774 mov edx, [esp+_ps+12] ; list_sz 775 sar edx, 2 776 mov ecx, 0 777 pxor xmm7, xmm7 778hash_assign_loop_x4_sse2: 779 movdqa xmm0, [esi+ecx] 780 pslld xmm0, 2 781 782 movdqa xmm1, xmm0 783 pcmpeqd xmm1, xmm7 784 movmskps eax, xmm1 785 cmp eax, 0x0f 786 je near hash_assign_with_copy_sse2 787 788%assign x 0 789%rep 4 790 lea eax, [edi+ecx+x] 791 mov [eax], ebx 792 lea eax, [ebp+ecx+x] 793 mov [eax], ebx 794 movd eax, xmm0 795 add ebx, eax 796 psrldq xmm0, 4 797%assign x x+4 798%endrep 799 jmp near assign_next_sse2 800 801hash_assign_with_copy_sse2: 802 movd xmm1, ebx 803 pshufd xmm2, xmm1, 0 804 movdqa [edi+ecx], xmm2 805 movdqa [ebp+ecx], xmm2 806 807assign_next_sse2: 808 add ecx, 16 809 dec edx 810 jnz near hash_assign_loop_x4_sse2 811 812 mov edx, [esp+_ps+12] ; list_sz 813 and edx, 3 814 jz near hash_assign_no_rem_sse2 815hash_assign_loop_x4_rem_sse2: 816 lea eax, [edi+ecx] 817 mov [eax], ebx 818 lea eax, [ebp+ecx] 819 mov [eax], ebx 820 mov eax, [esi+ecx] 821 sal eax, 2 822 add ebx, eax 823 add ecx, 4 824 dec edx 825 jnz near hash_assign_loop_x4_rem_sse2 826 827hash_assign_no_rem_sse2: 828 %undef _ps 829 pop ebp 830 pop edi 831 pop esi 832 pop ebx 833 ret 834%else 835 836;********************************************************************************************************************** 837;void SumOf8x8BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 838; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 839;********************************************************************************************************************* 840WELS_EXTERN SumOf8x8BlockOfFrame_sse2 841 %assign push_num 0 842 LOAD_6_PARA 843 PUSH_XMM 6 844 SIGN_EXTENSION r1, r1d 845 SIGN_EXTENSION r2, r2d 846 SIGN_EXTENSION r3, r3d 847 push r12 848 push r13 849 push r0 850 push r2 851 push r4 852 853 pxor xmm0, xmm0 854 lea r6, [r3+r3*2] 855 856 mov r12, r1 ;r12:tmp_width 857 lea r13, [r0+r3*4] ;rbp:r13 858FIRST_ROW: 859 movq xmm1, [r0] 860 movq xmm2, [r0+r3] 861 movq xmm3, [r0+r3*2] 862 movq xmm4, [r0+r6] 863 864 shufps xmm1, xmm2, 01000100b 865 shufps xmm3, xmm4, 01000100b 866 psadbw xmm1, xmm0 867 psadbw xmm3, xmm0 868 paddd xmm1, xmm3 869 870 movq xmm2, [r13] 871 movq xmm3, [r13+r3] 872 movq xmm4, [r13+r3*2] 873 movq xmm5, [r13+r6] 874 875 shufps xmm2, xmm3, 01000100b 876 shufps xmm4, xmm5, 01000100b 877 psadbw xmm2, xmm0 878 psadbw xmm4, xmm0 879 paddd xmm2, xmm4 880 881 paddd xmm1, xmm2 882 pshufd xmm2, xmm1, 00001110b 883 paddd xmm1, xmm2 884 movd r2d, xmm1 885 mov [r4], r2w 886 inc dword [r5+r2*4] 887 888 inc r0 889 inc r13 890 add r4, 2 891 892 dec r12 893 jg FIRST_ROW 894 895 pop r4 896 pop r2 897 pop r0 898 mov r13, r2 899 dec r13 900HEIGHT_LOOP: 901 mov r12, r1 902WIDTH_LOOP: 903 movq xmm1, [r0+r3*8] 904 movq xmm2, [r0] 905 psadbw xmm1, xmm0 906 psadbw xmm2, xmm0 907 psubd xmm1, xmm2 908 movd r2d, xmm1 909 mov r6w, [r4] 910 add r2d, r6d 911 mov [r4+r1*2], r2w 912 inc dword [r5+r2*4] 913 914 inc r0 915 add r4, 2 916 917 dec r12 918 jg WIDTH_LOOP 919 920 add r0, r3 921 sub r0, r1 922 923 924 dec r13 925 jg HEIGHT_LOOP 926 927 pop r13 928 pop r12 929 POP_XMM 930 LOAD_6_PARA_POP 931 ret 932 933 934%macro COUNT_SUM 4 935%define xmm_reg %1 936%define tmp_dreg %2 937%define tmp_qreg %3 938 movd tmp_dreg, xmm_reg 939 inc dword [r5+tmp_qreg*4] 940%if %4 == 1 941 psrldq xmm_reg, 4 942%endif 943%endmacro 944 945 946;----------------------------------------------------------------------------- 947; requires: width % 8 == 0 && height > 1 948;----------------------------------------------------------------------------- 949;void SumOf8x8BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 950; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 951;----------------------------------------------------------------------------- 952; read extra (16 - (width % 8) ) mod 16 bytes of every line 953; write extra (16 - (width % 8)*2 ) mod 16 bytes in the end of sum_ref 954WELS_EXTERN SumOf8x8BlockOfFrame_sse4 955 %assign push_num 0 956 LOAD_6_PARA 957 PUSH_XMM 8 958 SIGN_EXTENSION r1, r1d 959 SIGN_EXTENSION r2, r2d 960 SIGN_EXTENSION r3, r3d 961 push r12 962 push r13 963 push r0 964 push r2 965 push r4 966 967 pxor xmm0, xmm0 968 lea r6, [r3+r3*2] 969 970 mov r12, r1 ;r12:tmp_width 971 lea r13, [r0+r3*4] ;rbp:r13 972FIRST_ROW_SSE4: 973 movdqu xmm1, [r0] 974 movdqu xmm3, [r0+r3] 975 movdqu xmm5, [r0+r3*2] 976 movdqu xmm7, [r0+r6] 977 978 movdqa xmm2, xmm1 979 mpsadbw xmm1, xmm0, 000b 980 mpsadbw xmm2, xmm0, 100b 981 paddw xmm1, xmm2 ; 8 sums of line1 982 983 movdqa xmm4, xmm3 984 mpsadbw xmm3, xmm0, 000b 985 mpsadbw xmm4, xmm0, 100b 986 paddw xmm3, xmm4 ; 8 sums of line2 987 988 movdqa xmm2, xmm5 989 mpsadbw xmm5, xmm0, 000b 990 mpsadbw xmm2, xmm0, 100b 991 paddw xmm5, xmm2 ; 8 sums of line3 992 993 movdqa xmm4, xmm7 994 mpsadbw xmm7, xmm0, 000b 995 mpsadbw xmm4, xmm0, 100b 996 paddw xmm7, xmm4 ; 8 sums of line4 997 998 paddw xmm1, xmm3 999 paddw xmm5, xmm7 1000 paddw xmm1, xmm5 ; sum the upper 4 lines first 1001 1002 movdqu xmm2, [r13] 1003 movdqu xmm3, [r13+r3] 1004 movdqu xmm4, [r13+r3*2] 1005 movdqu xmm5, [r13+r6] 1006 1007 movdqa xmm6, xmm2 1008 mpsadbw xmm2, xmm0, 000b 1009 mpsadbw xmm6, xmm0, 100b 1010 paddw xmm2, xmm6 1011 1012 movdqa xmm7, xmm3 1013 mpsadbw xmm3, xmm0, 000b 1014 mpsadbw xmm7, xmm0, 100b 1015 paddw xmm3, xmm7 1016 1017 movdqa xmm6, xmm4 1018 mpsadbw xmm4, xmm0, 000b 1019 mpsadbw xmm6, xmm0, 100b 1020 paddw xmm4, xmm6 1021 1022 movdqa xmm7, xmm5 1023 mpsadbw xmm5, xmm0, 000b 1024 mpsadbw xmm7, xmm0, 100b 1025 paddw xmm5, xmm7 1026 1027 paddw xmm2, xmm3 1028 paddw xmm4, xmm5 1029 paddw xmm1, xmm2 1030 paddw xmm1, xmm4 ; sum of lines 1- 8 1031 1032 movdqu [r4], xmm1 1033 1034 movdqa xmm2, xmm1 1035 punpcklwd xmm1, xmm0 1036 punpckhwd xmm2, xmm0 1037 1038 COUNT_SUM xmm1, r2d, r2, 1 1039 COUNT_SUM xmm1, r2d, r2, 1 1040 COUNT_SUM xmm1, r2d, r2, 1 1041 COUNT_SUM xmm1, r2d, r2, 0 1042 COUNT_SUM xmm2, r2d, r2 ,1 1043 COUNT_SUM xmm2, r2d, r2 ,1 1044 COUNT_SUM xmm2, r2d, r2 ,1 1045 COUNT_SUM xmm2, r2d, r2 ,0 1046 1047 lea r0, [r0+8] 1048 lea r13, [r13+8] 1049 lea r4, [r4+16] ; element size is 2 1050 1051 sub r12, 8 1052 jg near FIRST_ROW_SSE4 1053 1054 pop r4 1055 pop r2 1056 pop r0 1057 mov r13, r2 1058 dec r13 1059HEIGHT_LOOP_SSE4: 1060 mov r12, r1 1061WIDTH_LOOP_SSE4: 1062 movdqu xmm1, [r0+r3*8] 1063 movdqu xmm2, [r0] 1064 movdqu xmm7, [r4] 1065 1066 movdqa xmm3, xmm1 1067 mpsadbw xmm1, xmm0, 000b 1068 mpsadbw xmm3, xmm0, 100b 1069 paddw xmm1, xmm3 1070 1071 movdqa xmm4, xmm2 1072 mpsadbw xmm2, xmm0, 000b 1073 mpsadbw xmm4, xmm0, 100b 1074 paddw xmm2, xmm4 1075 1076 paddw xmm7, xmm1 1077 psubw xmm7, xmm2 1078 movdqu [r4+r1*2], xmm7 1079 1080 movdqa xmm6, xmm7 1081 punpcklwd xmm7, xmm0 1082 punpckhwd xmm6, xmm0 1083 1084 COUNT_SUM xmm7, r2d, r2, 1 1085 COUNT_SUM xmm7, r2d, r2, 1 1086 COUNT_SUM xmm7, r2d, r2, 1 1087 COUNT_SUM xmm7, r2d, r2, 0 1088 COUNT_SUM xmm6, r2d, r2, 1 1089 COUNT_SUM xmm6, r2d, r2, 1 1090 COUNT_SUM xmm6, r2d, r2, 1 1091 COUNT_SUM xmm6, r2d, r2, 0 1092 1093 lea r0, [r0+8] 1094 lea r4, [r4+16] 1095 1096 sub r12, 8 1097 jg near WIDTH_LOOP_SSE4 1098 1099 lea r0, [r0+r3] 1100 sub r0, r1 1101 1102 dec r13 1103 jg near HEIGHT_LOOP_SSE4 1104 1105 pop r13 1106 pop r12 1107 POP_XMM 1108 LOAD_6_PARA_POP 1109 ret 1110 1111 1112;**************************************************************************************************************************************************** 1113;void SumOf16x16BlockOfFrame_sse2(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 1114; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 1115;**************************************************************************************************************************************************** 1116WELS_EXTERN SumOf16x16BlockOfFrame_sse2 1117 %assign push_num 0 1118 LOAD_6_PARA 1119 PUSH_XMM 6 1120 SIGN_EXTENSION r1, r1d 1121 SIGN_EXTENSION r2, r2d 1122 SIGN_EXTENSION r3, r3d 1123 push r12 1124 push r13 1125 push r0 1126 push r2 1127 push r4 1128 1129 pxor xmm0, xmm0 1130 lea r6, [r3+r3*2] 1131 1132 mov r12, r1 ;r12:tmp_width 1133FIRST_ROW_X16H: 1134 movdqu xmm1, [r0] 1135 movdqu xmm2, [r0+r3] 1136 movdqu xmm3, [r0+r3*2] 1137 movdqu xmm4, [r0+r6] 1138 1139 psadbw xmm1, xmm0 1140 psadbw xmm2, xmm0 1141 psadbw xmm3, xmm0 1142 psadbw xmm4, xmm0 1143 paddw xmm1, xmm2 1144 paddw xmm3, xmm4 1145 paddw xmm1, xmm3 1146 1147 lea r13, [r0+r3*4] ;ebp:r13 1148 movdqu xmm2, [r13] 1149 movdqu xmm3, [r13+r3] 1150 movdqu xmm4, [r13+r3*2] 1151 movdqu xmm5, [r13+r6] 1152 1153 psadbw xmm2, xmm0 1154 psadbw xmm3, xmm0 1155 psadbw xmm4, xmm0 1156 psadbw xmm5, xmm0 1157 paddw xmm2, xmm3 1158 paddw xmm4, xmm5 1159 paddw xmm2, xmm4 1160 1161 paddw xmm1, xmm2 1162 1163 lea r13, [r13+r3*4] 1164 movdqu xmm2, [r13] 1165 movdqu xmm3, [r13+r3] 1166 movdqu xmm4, [r13+r3*2] 1167 movdqu xmm5, [r13+r6] 1168 1169 psadbw xmm2, xmm0 1170 psadbw xmm3, xmm0 1171 psadbw xmm4, xmm0 1172 psadbw xmm5, xmm0 1173 paddw xmm2, xmm3 1174 paddw xmm4, xmm5 1175 paddw xmm2, xmm4 1176 1177 paddw xmm1, xmm2 1178 1179 lea r13, [r13+r3*4] 1180 movdqu xmm2, [r13] 1181 movdqu xmm3, [r13+r3] 1182 movdqu xmm4, [r13+r3*2] 1183 movdqu xmm5, [r13+r6] 1184 1185 psadbw xmm2, xmm0 1186 psadbw xmm3, xmm0 1187 psadbw xmm4, xmm0 1188 psadbw xmm5, xmm0 1189 paddw xmm2, xmm3 1190 paddw xmm4, xmm5 1191 paddw xmm2, xmm4 1192 1193 paddw xmm1, xmm2 1194 movdqa xmm2, xmm1 1195 punpckhwd xmm2, xmm0 1196 paddw xmm1, xmm2 1197 movd r2d, xmm1 1198 mov [r4], r2w 1199 inc dword [r5+r2*4] 1200 1201 inc r0 1202 lea r4, [r4+2] 1203 1204 dec r12 1205 jg near FIRST_ROW_X16H 1206 1207 pop r4 1208 pop r2 1209 pop r0 1210 mov r13, r2 1211 dec r13 1212 mov r6, r3 1213 sal r6, 4 ; succeeded 16th line 1214HEIGHT_LOOP_X16: 1215 mov r12, r1 1216WIDTH_LOOP_X16: 1217 movdqu xmm1, [r0+r6] 1218 movdqu xmm2, [r0] 1219 psadbw xmm1, xmm0 1220 psadbw xmm2, xmm0 1221 psubw xmm1, xmm2 1222 movdqa xmm2, xmm1 1223 punpckhwd xmm2, xmm0 1224 paddw xmm1, xmm2 1225 movd r2d, xmm1 1226 add r2w, word [r4] 1227 mov [r4+r1*2], r2w 1228 inc dword [r5+r2*4] 1229 1230 inc r0 1231 add r4, 2 1232 1233 dec r12 1234 jg near WIDTH_LOOP_X16 1235 1236 add r0, r3 1237 sub r0, r1 1238 1239 dec r13 1240 jg near HEIGHT_LOOP_X16 1241 1242 pop r13 1243 pop r12 1244 POP_XMM 1245 LOAD_6_PARA_POP 1246 ret 1247 1248; requires: width % 16 == 0 && height > 1 1249;----------------------------------------------------------------------------------------------------------------------------- 1250;void SumOf16x16BlockOfFrame_sse4(uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride, 1251; uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]); 1252;----------------------------------------------------------------------------------------------------------------------------- 1253; try 8 mv via offset 1254%macro SUM_LINE_X16_SSE41 5 ; ref, dst0, dst1, tmp0, tmp1 1255 movdqu %2, [%1] 1256 movdqu %3, [%1+8h] 1257 movdqa %4, %2 1258 movdqa %5, %3 1259 1260 mpsadbw %2, xmm0, 0 ; 000 B 1261 mpsadbw %4, xmm0, 5 ; 101 B 1262 mpsadbw %3, xmm0, 2 ; 010 B 1263 mpsadbw %5, xmm0, 7 ; 111 B 1264 paddw %2, %4 1265 paddw %3, %5 1266 paddw %2, %3 ; accumulate cost 1267%endmacro ; end of SAD_16x16_LINE_SSE41 1268 1269WELS_EXTERN SumOf16x16BlockOfFrame_sse4 1270 %assign push_num 0 1271 LOAD_6_PARA 1272 PUSH_XMM 8 1273 SIGN_EXTENSION r1, r1d 1274 SIGN_EXTENSION r2, r2d 1275 SIGN_EXTENSION r3, r3d 1276 push r12 1277 push r13 1278 push r0 1279 push r2 1280 push r4 1281 1282 pxor xmm0, xmm0 1283 lea r6, [r3+r3*2] 1284 1285 mov r12, r1 ;r12:tmp_width 1286FIRST_ROW_X16_SSE4: 1287 SUM_LINE_X16_SSE41 r0, xmm1, xmm2, xmm3, xmm4 1288 SUM_LINE_X16_SSE41 r0+r3, xmm2, xmm3, xmm4, xmm5 1289 SUM_LINE_X16_SSE41 r0+r3*2,xmm3, xmm4, xmm5, xmm6 1290 SUM_LINE_X16_SSE41 r0+r6, xmm4, xmm5, xmm6, xmm7 1291 paddw xmm1, xmm2 1292 paddw xmm3, xmm4 1293 paddw xmm1, xmm3 1294 1295 lea r13, [r0+r3*4] 1296 SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5 1297 paddw xmm1, xmm2 1298 SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5 1299 paddw xmm1, xmm2 1300 SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5 1301 paddw xmm1, xmm2 1302 SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5 1303 paddw xmm1, xmm2 1304 1305 lea r13, [r13+r3*4] 1306 SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5 1307 paddw xmm1, xmm2 1308 SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5 1309 paddw xmm1, xmm2 1310 SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5 1311 paddw xmm1, xmm2 1312 SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5 1313 paddw xmm1, xmm2 1314 1315 lea r13, [r13+r3*4] 1316 SUM_LINE_X16_SSE41 r13, xmm2, xmm3, xmm4, xmm5 1317 paddw xmm1, xmm2 1318 SUM_LINE_X16_SSE41 r13+r3, xmm2, xmm3, xmm4, xmm5 1319 paddw xmm1, xmm2 1320 SUM_LINE_X16_SSE41 r13+r3*2, xmm2, xmm3, xmm4, xmm5 1321 paddw xmm1, xmm2 1322 SUM_LINE_X16_SSE41 r13+r6, xmm2, xmm3, xmm4, xmm5 1323 paddw xmm1, xmm2 1324 1325 movdqa [r4], xmm1 1326 movdqa xmm2, xmm1 1327 punpcklwd xmm1, xmm0 1328 punpckhwd xmm2, xmm0 1329 1330 COUNT_SUM xmm1, r2d, r2, 1 1331 COUNT_SUM xmm1, r2d, r2, 1 1332 COUNT_SUM xmm1, r2d, r2, 1 1333 COUNT_SUM xmm1, r2d, r2, 0 1334 COUNT_SUM xmm2, r2d, r2, 1 1335 COUNT_SUM xmm2, r2d, r2, 1 1336 COUNT_SUM xmm2, r2d, r2, 1 1337 COUNT_SUM xmm2, r2d, r2, 0 1338 1339 lea r0, [r0+8] 1340 lea r4, [r4+16] ; element size is 2 1341 1342 sub r12, 8 1343 jg near FIRST_ROW_X16_SSE4 1344 1345 pop r4 1346 pop r2 1347 pop r0 1348 mov r13, r2 1349 dec r13 1350 mov r6, r3 1351 sal r6, 4 ; succeeded 16th line 1352 1353HEIGHT_LOOP_X16_SSE4: 1354 mov r12, r1 1355WIDTH_LOOP_X16_SSE4: 1356 movdqa xmm7, [r4] 1357 SUM_LINE_X16_SSE41 r0+r6, xmm1, xmm2, xmm3, xmm4 1358 SUM_LINE_X16_SSE41 r0, xmm2, xmm3, xmm4, xmm5 1359 1360 paddw xmm7, xmm1 1361 psubw xmm7, xmm2 1362 movdqa [r4+r1*2], xmm7 1363 1364 movdqa xmm6, xmm7 1365 punpcklwd xmm7, xmm0 1366 punpckhwd xmm6, xmm0 1367 1368 COUNT_SUM xmm7, r2d, r2, 1 1369 COUNT_SUM xmm7, r2d, r2, 1 1370 COUNT_SUM xmm7, r2d, r2, 1 1371 COUNT_SUM xmm7, r2d, r2, 0 1372 COUNT_SUM xmm6, r2d, r2, 1 1373 COUNT_SUM xmm6, r2d, r2, 1 1374 COUNT_SUM xmm6, r2d, r2, 1 1375 COUNT_SUM xmm6, r2d, r2, 0 1376 1377 lea r0, [r0+8] 1378 lea r4, [r4+16] 1379 1380 sub r12, 8 1381 jg near WIDTH_LOOP_X16_SSE4 1382 1383 add r0, r3 1384 sub r0, r1 1385 1386 dec r13 1387 jg near HEIGHT_LOOP_X16_SSE4 1388 1389 pop r13 1390 pop r12 1391 POP_XMM 1392 LOAD_6_PARA_POP 1393 ret 1394 1395;----------------------------------------------------------------------------------------------------------------------------- 1396; void FillQpelLocationByFeatureValue_sse2(uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList) 1397;----------------------------------------------------------------------------------------------------------------------------- 1398WELS_EXTERN FillQpelLocationByFeatureValue_sse2 1399 %assign push_num 0 1400 LOAD_4_PARA 1401 PUSH_XMM 8 1402 SIGN_EXTENSION r1, r1d 1403 SIGN_EXTENSION r2, r2d 1404 push r12 1405 push r13 1406 mov r12, r2 1407 1408 movq xmm7, [mv_x_inc_x4] ; x_qpel inc 1409 movq xmm6, [mv_y_inc_x4] ; y_qpel inc 1410 movq xmm5, [mx_x_offset_x4] ; x_qpel vector 1411 pxor xmm4, xmm4 1412 pxor xmm3, xmm3 ; y_qpel vector 1413HASH_HEIGHT_LOOP_SSE2: 1414 movdqa xmm2, xmm5 ; x_qpel vector 1415 mov r4, r1 1416HASH_WIDTH_LOOP_SSE2: 1417 movq xmm0, [r0] ; load x8 sum 1418 punpcklwd xmm0, xmm4 1419 movdqa xmm1, xmm2 1420 punpcklwd xmm1, xmm3 1421%rep 3 1422 movd r2d, xmm0 ;edx:r3 1423 lea r5, [r3+r2*8] ;ebx:r5 1424 mov r6, [r5] ;eax:r6 1425 movd [r6], xmm1 1426 mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation 1427 lea r6, [r6+4] 1428 mov [r5], r6 1429 psrldq xmm1, 4 1430 psrldq xmm0, 4 1431%endrep 1432 movd r2d, xmm0 1433 lea r5, [r3+r2*8] ;ebx:r5 1434 mov r6, [r5] ;eax:r6 1435 movd [r6], xmm1 1436 mov r13, [r6+4] ; explictly load eax+4 due cache miss from vtune observation 1437 lea r6, [r6+4] 1438 mov [r5], r6 1439 1440 paddw xmm2, xmm7 1441 lea r0, [r0+8] 1442 sub r4, 4 1443 jnz near HASH_WIDTH_LOOP_SSE2 1444 paddw xmm3, xmm6 1445 dec r12 1446 jnz near HASH_HEIGHT_LOOP_SSE2 1447 1448 pop r13 1449 pop r12 1450 POP_XMM 1451 ret 1452 1453;--------------------------------------------------------------------------------------------------------------------------------------------------- 1454; void InitializeHashforFeature_sse2( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize, 1455; uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList); 1456;uint16_t** pPositionOfSum, uint16_t** sum_idx_list, uint32_t* pTimesOfSum, uint16_t* pBuf, const int32_t list_sz ) 1457;--------------------------------------------------------------------------------------------------------------------------------------------------- 1458WELS_EXTERN InitializeHashforFeature_sse2 1459 %assign push_num 0 1460 LOAD_5_PARA 1461 SIGN_EXTENSION r2, r2d 1462 push r12 1463 push r13 1464 mov r12, r2 1465 sar r2, 2 1466 mov r5, 0 ;r5:ecx 1467 xor r6, r6 1468 pxor xmm3, xmm3 1469hash_assign_loop_x4_sse2: 1470 movdqa xmm0, [r0+r5] 1471 pslld xmm0, 2 1472 1473 movdqa xmm1, xmm0 1474 pcmpeqd xmm1, xmm3 1475 movmskps r6, xmm1 1476 cmp r6, 0x0f 1477 jz near hash_assign_with_copy_sse2 1478 1479%assign x 0 1480%rep 4 1481 lea r13, [r3+r5*2+x] 1482 mov [r13], r1 1483 lea r13, [r4+r5*2+x] 1484 mov [r13], r1 1485 movd r6d, xmm0 1486 add r1, r6 1487 psrldq xmm0, 4 1488%assign x x+8 1489%endrep 1490 jmp near assign_next_sse2 1491 1492hash_assign_with_copy_sse2: 1493 movq xmm1, r1 1494 pshufd xmm2, xmm1, 01000100b 1495 movdqa [r3+r5*2], xmm2 1496 movdqa [r4+r5*2], xmm2 1497 movdqa [r3+r5*2+16], xmm2 1498 movdqa [r4+r5*2+16], xmm2 1499 1500assign_next_sse2: 1501 add r5, 16 1502 dec r2 1503 jnz near hash_assign_loop_x4_sse2 1504 1505 and r12, 3 1506 jz near hash_assign_no_rem_sse2 1507hash_assign_loop_x4_rem_sse2: 1508 lea r13, [r3+r5*2] 1509 mov [r13], r1 1510 lea r13, [r4+r5*2] 1511 mov [r13], r1 1512 mov r6d, [r0+r5] 1513 sal r6, 2 1514 add r1, r6 1515 add r5, 4 1516 dec r12 1517 jnz near hash_assign_loop_x4_rem_sse2 1518 1519hash_assign_no_rem_sse2: 1520 pop r13 1521 pop r12 1522 ret 1523 1524%endif 1525 1526;********************************************************************************************************************************** 1527; int32_t SumOf8x8SingleBlock_sse2(uint8_t* ref0, int32_t linesize) 1528;********************************************************************************************************************************** 1529WELS_EXTERN SumOf8x8SingleBlock_sse2 1530 %assign push_num 0 1531 LOAD_2_PARA 1532 SIGN_EXTENSION r1, r1d 1533 1534 pxor xmm0, xmm0 1535 movq xmm1, [r0] 1536 movhps xmm1, [r0+r1] 1537 lea r0, [r0+2*r1] 1538 movq xmm2, [r0] 1539 movhps xmm2, [r0+r1] 1540 lea r0, [r0+2*r1] 1541 movq xmm3, [r0] 1542 movhps xmm3, [r0+r1] 1543 lea r0, [r0+2*r1] 1544 movq xmm4, [r0] 1545 movhps xmm4, [r0+r1] 1546 1547 psadbw xmm1, xmm0 1548 psadbw xmm2, xmm0 1549 psadbw xmm3, xmm0 1550 psadbw xmm4, xmm0 1551 paddw xmm1, xmm2 1552 paddw xmm3, xmm4 1553 paddw xmm1, xmm3 1554 1555 movdqa xmm2, xmm1 1556 punpckhwd xmm2, xmm0 1557 paddw xmm1, xmm2 1558 1559 movd retrd, xmm1 1560 ret 1561 1562;********************************************************************************************************************************** 1563; int32_t SumOf16x16SingleBlock_sse2(uint8_t* ref0, int32_t linesize) 1564;********************************************************************************************************************************** 1565WELS_EXTERN SumOf16x16SingleBlock_sse2 1566 %assign push_num 0 1567 LOAD_2_PARA 1568 PUSH_XMM 6 1569 SIGN_EXTENSION r1, r1d 1570 1571 pxor xmm0, xmm0 1572 movdqa xmm1, [r0] 1573 movdqa xmm2, [r0+r1] 1574 lea r0, [r0+2*r1] 1575 movdqa xmm3, [r0] 1576 movdqa xmm4, [r0+r1] 1577 psadbw xmm1, xmm0 1578 psadbw xmm2, xmm0 1579 psadbw xmm3, xmm0 1580 psadbw xmm4, xmm0 1581 paddw xmm1, xmm2 1582 paddw xmm3, xmm4 1583 paddw xmm1, xmm3 1584 1585 lea r0, [r0+2*r1] 1586 movdqa xmm2, [r0] 1587 movdqa xmm3, [r0+r1] 1588 lea r0, [r0+2*r1] 1589 movdqa xmm4, [r0] 1590 movdqa xmm5, [r0+r1] 1591 psadbw xmm2, xmm0 1592 psadbw xmm3, xmm0 1593 psadbw xmm4, xmm0 1594 psadbw xmm5, xmm0 1595 paddw xmm2, xmm3 1596 paddw xmm4, xmm5 1597 paddw xmm2, xmm4 1598 1599 paddw xmm1, xmm2 1600 1601 lea r0, [r0+2*r1] 1602 movdqa xmm2, [r0] 1603 movdqa xmm3, [r0+r1] 1604 lea r0, [r0+2*r1] 1605 movdqa xmm4, [r0] 1606 movdqa xmm5, [r0+r1] 1607 psadbw xmm2, xmm0 1608 psadbw xmm3, xmm0 1609 psadbw xmm4, xmm0 1610 psadbw xmm5, xmm0 1611 paddw xmm2, xmm3 1612 paddw xmm4, xmm5 1613 paddw xmm2, xmm4 1614 1615 paddw xmm1, xmm2 1616 1617 lea r0, [r0+2*r1] 1618 movdqa xmm2, [r0] 1619 movdqa xmm3, [r0+r1] 1620 lea r0, [r0+2*r1] 1621 movdqa xmm4, [r0] 1622 movdqa xmm5, [r0+r1] 1623 psadbw xmm2, xmm0 1624 psadbw xmm3, xmm0 1625 psadbw xmm4, xmm0 1626 psadbw xmm5, xmm0 1627 paddw xmm2, xmm3 1628 paddw xmm4, xmm5 1629 paddw xmm2, xmm4 1630 1631 paddw xmm1, xmm2 1632 1633 movdqa xmm2, xmm1 1634 punpckhwd xmm2, xmm0 1635 paddw xmm1, xmm2 1636 1637 movd retrd, xmm1 1638 POP_XMM 1639 ret 1640 1641;********************************************************************************************************************************** 1642; 1643; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost ) 1644; 1645; \note: 1646; src need align with 16 bytes, ref is optional 1647; \return value: 1648; return minimal SAD cost, according index carried by index_min_cost 1649;********************************************************************************************************************************** 1650; try 8 mv via offset 1651; xmm7 store sad costs 1652%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref 1653 movdqa xmm0, [%1] 1654 movdqu xmm1, [%2] 1655 movdqu xmm2, [%2+8h] 1656 movdqa xmm3, xmm1 1657 movdqa xmm4, xmm2 1658 1659 mpsadbw xmm1, xmm0, 0 ; 000 B 1660 paddw xmm7, xmm1 ; accumulate cost 1661 1662 mpsadbw xmm3, xmm0, 5 ; 101 B 1663 paddw xmm7, xmm3 ; accumulate cost 1664 1665 mpsadbw xmm2, xmm0, 2 ; 010 B 1666 paddw xmm7, xmm2 ; accumulate cost 1667 1668 mpsadbw xmm4, xmm0, 7 ; 111 B 1669 paddw xmm7, xmm4 ; accumulate cost 1670 1671 add %1, %3 1672 add %2, %4 1673%endmacro ; end of SAD_16x16_LINE_SSE41 1674%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref 1675 movdqa xmm0, [%1] 1676 movdqu xmm1, [%2] 1677 movdqu xmm2, [%2+8h] 1678 movdqa xmm3, xmm1 1679 movdqa xmm4, xmm2 1680 1681 mpsadbw xmm1, xmm0, 0 ; 000 B 1682 paddw xmm7, xmm1 ; accumulate cost 1683 1684 mpsadbw xmm3, xmm0, 5 ; 101 B 1685 paddw xmm7, xmm3 ; accumulate cost 1686 1687 mpsadbw xmm2, xmm0, 2 ; 010 B 1688 paddw xmm7, xmm2 ; accumulate cost 1689 1690 mpsadbw xmm4, xmm0, 7 ; 111 B 1691 paddw xmm7, xmm4 ; accumulate cost 1692%endmacro ; end of SAD_16x16_LINE_SSE41E 1693 1694WELS_EXTERN SampleSad16x16Hor8_sse41 1695 ;push ebx 1696 ;push esi 1697 ;mov eax, [esp+12] ; src 1698 ;mov ecx, [esp+16] ; stride_src 1699 ;mov ebx, [esp+20] ; ref 1700 ;mov edx, [esp+24] ; stride_ref 1701 ;mov esi, [esp+28] ; base_cost 1702 %assign push_num 0 1703 LOAD_6_PARA 1704 PUSH_XMM 8 1705 SIGN_EXTENSION r1, r1d 1706 SIGN_EXTENSION r3, r3d 1707 pxor xmm7, xmm7 1708 1709 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1710 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1711 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1712 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1713 1714 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1715 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1716 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1717 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1718 1719 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1720 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1721 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1722 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1723 1724 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1725 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1726 SAD_16x16_LINE_SSE41 r0, r2, r1, r3 1727 SAD_16x16_LINE_SSE41E r0, r2, r1, r3 1728 1729 pxor xmm0, xmm0 1730 movdqa xmm6, xmm7 1731 punpcklwd xmm6, xmm0 1732 punpckhwd xmm7, xmm0 1733 1734 movdqa xmm5, [r4] 1735 movdqa xmm4, xmm5 1736 punpcklwd xmm4, xmm0 1737 punpckhwd xmm5, xmm0 1738 1739 paddd xmm4, xmm6 1740 paddd xmm5, xmm7 1741 movdqa xmm3, xmm4 1742 pminud xmm3, xmm5 1743 pshufd xmm2, xmm3, 01001110B 1744 pminud xmm2, xmm3 1745 pshufd xmm3, xmm2, 10110001B 1746 pminud xmm2, xmm3 1747 movd retrd, xmm2 1748 pcmpeqd xmm4, xmm2 1749 movmskps r2d, xmm4 1750 bsf r1d, r2d 1751 jnz near WRITE_INDEX 1752 1753 pcmpeqd xmm5, xmm2 1754 movmskps r2d, xmm5 1755 bsf r1d, r2d 1756 add r1d, 4 1757 1758WRITE_INDEX: 1759 mov [r5], r1d 1760 POP_XMM 1761 LOAD_6_PARA_POP 1762 ret 1763 1764;********************************************************************************************************************************** 1765; 1766; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost ) 1767; 1768; \note: 1769; src and ref is optional to align with 16 due inter 8x8 1770; \return value: 1771; return minimal SAD cost, according index carried by index_min_cost 1772; 1773;********************************************************************************************************************************** 1774; try 8 mv via offset 1775; xmm7 store sad costs 1776%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref 1777 movdqu xmm0, [%1] 1778 movdqu xmm1, [%2] 1779 movdqa xmm2, xmm1 1780 1781 mpsadbw xmm1, xmm0, 0 ; 000 B 1782 paddw xmm7, xmm1 ; accumulate cost 1783 1784 mpsadbw xmm2, xmm0, 5 ; 101 B 1785 paddw xmm7, xmm2 ; accumulate cost 1786 1787 add %1, %3 1788 add %2, %4 1789%endmacro ; end of SAD_8x8_LINE_SSE41 1790%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref 1791 movdqu xmm0, [%1] 1792 movdqu xmm1, [%2] 1793 movdqa xmm2, xmm1 1794 1795 mpsadbw xmm1, xmm0, 0 ; 000 B 1796 paddw xmm7, xmm1 ; accumulate cost 1797 1798 mpsadbw xmm2, xmm0, 5 ; 101 B 1799 paddw xmm7, xmm2 ; accumulate cost 1800%endmacro ; end of SAD_8x8_LINE_SSE41E 1801 1802WELS_EXTERN SampleSad8x8Hor8_sse41 1803 %assign push_num 0 1804 LOAD_6_PARA 1805 PUSH_XMM 8 1806 SIGN_EXTENSION r1, r1d 1807 SIGN_EXTENSION r3, r3d 1808 movdqa xmm7, [r4] ; load base cost list 1809 1810 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1811 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1812 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1813 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1814 1815 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1816 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1817 SAD_8x8_LINE_SSE41 r0, r2, r1, r3 1818 SAD_8x8_LINE_SSE41E r0, r2, r1, r3 1819 1820 phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index 1821 movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX 1822 mov r1d, retrd 1823 and retrd, 0xFFFF 1824 sar r1d, 16 1825 mov [r5], r1d 1826 1827 POP_XMM 1828 LOAD_6_PARA_POP 1829 ret 1830