;*!
;* \copy
;*     Copyright (c) 2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  vaa.asm
;*
;*  Abstract
;*      SSE2 implementations of the pVaa routines
;*
;*  History
;*      04/14/2010 Created
;*      06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;*      06/10/2010 Tuned rc_sad_frame_sse2, roughly a 40% improvement
;*      08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"


;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
    movdqa %1, %2
    punpcklbw %1, %3
    punpckhbw %2, %3
    pmaddwd %1, %1
    pmaddwd %2, %2
    paddd %1, %2
    pshufd %2, %1, 04Eh ; 01001110 B
    paddd %1, %2
    pshufd %2, %1, 0B1h ; 10110001 B
    paddd %1, %2
%endmacro ; END OF SUM_SQR_SSE2

%macro WELS_SAD_16x2_SSE2 3 ;esi:%1 edi:%2 ebx:%3
    movdqa xmm1, [%1]
    movdqa xmm2, [%2]
    movdqa xmm3, [%1+%3]
    movdqa xmm4, [%2+%3]
    psadbw xmm1, xmm2
    psadbw xmm3, xmm4
    paddd xmm6, xmm1
    paddd xmm6, xmm3
    lea %1, [%1+%3*2]
    lea %2, [%2+%3*2]
%endmacro

; benchmarking shows this outperforms the equivalent phaddw (SSSE3) sequence
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
    ; @sum_8x2 begin
    pshufd %2, %1, 04Eh ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 04Eh ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 0B1h ; 10110001 B
    paddw %1, %2
    ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2

%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
    movdqa xmm1, [%1]
    movdqa xmm2, [%2]
    movdqa xmm3, xmm1
    psadbw xmm3, xmm2
    paddd xmm6, xmm3

    movdqa xmm3, xmm1
    psadbw xmm3, xmm0
    paddd xmm5, xmm3

    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    pmaddwd xmm1, xmm1
    pmaddwd xmm2, xmm2
    paddd xmm4, xmm1
    paddd xmm4, xmm2

    add %1, %3
    add %2, %3
%endmacro

%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
    movdqa xmm1, [%1]
    movdqa xmm2, [%2]
    movdqa xmm3, xmm1
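    ; The rest of this macro accumulates, for one 16-pixel row (xmm0 must hold zero):
    ;   xmm7 += SAD(cur, ref), xmm6 += sum(cur), xmm5 += sum(cur^2), xmm4 += sum((cur-ref)^2),
    ; and then advances both row pointers by the stride argument.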
psadbw xmm3, xmm2 116 paddd xmm7, xmm3 ; sad 117 118 movdqa xmm3, xmm1 119 pmaxub xmm3, xmm2 120 pminub xmm2, xmm1 121 psubb xmm3, xmm2 ; diff 122 123 movdqa xmm2, xmm1 124 psadbw xmm2, xmm0 125 paddd xmm6, xmm2 ; sum 126 127 movdqa xmm2, xmm1 128 punpcklbw xmm1, xmm0 129 punpckhbw xmm2, xmm0 130 pmaddwd xmm1, xmm1 131 pmaddwd xmm2, xmm2 132 paddd xmm5, xmm1 133 paddd xmm5, xmm2 ; sqsum 134 135 movdqa xmm1, xmm3 136 punpcklbw xmm1, xmm0 137 punpckhbw xmm3, xmm0 138 pmaddwd xmm1, xmm1 139 pmaddwd xmm3, xmm3 140 paddd xmm4, xmm1 141 paddd xmm4, xmm3 ; sqdiff 142 143 add %1, %3 144 add %2, %3 145%endmacro 146 147%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 148%define sad_reg %1 149%define sum_cur_reg %2 150%define sum_ref_reg %3 151%define mad_reg %4 152 movdqa xmm1, [%5] 153 movdqa xmm2, [%6] 154 movdqa xmm3, xmm1 155 psadbw xmm3, xmm0 156 paddd sum_cur_reg, xmm3 ; sum_cur 157 movdqa xmm3, xmm2 158 psadbw xmm3, xmm0 159 paddd sum_ref_reg, xmm3 ; sum_ref 160 161 movdqa xmm3, xmm1 162 pmaxub xmm3, xmm2 163 pminub xmm2, xmm1 164 psubb xmm3, xmm2 ; abs diff 165 pmaxub mad_reg, xmm3 ; max abs diff 166 167 psadbw xmm3, xmm0 168 paddd sad_reg, xmm3 ; sad 169 170 add %5, %7 171 add %6, %7 172%endmacro 173 174 175%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used 176%define max_reg %1 177 movdqa xmm1, max_reg 178 psrldq xmm1, 4 179 pmaxub max_reg, xmm1 180 movdqa xmm1, max_reg 181 psrldq xmm1, 2 182 pmaxub max_reg, xmm1 183 movdqa xmm1, max_reg 184 psrldq xmm1, 1 185 pmaxub max_reg, xmm1 186%endmacro 187 188%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7 189%define sad_reg %1 190%define sum_reg %2 191%define mad_reg %3 192%define sqdiff_reg %4 193 movdqa xmm1, [%5] 194 movdqa xmm2, xmm1 195 movdqa xmm3, xmm1 196 punpcklbw xmm2, xmm0 197 punpckhbw xmm3, xmm0 198 pmaddwd xmm2, xmm2 199 pmaddwd xmm3, xmm3 200 paddd xmm2, xmm3 201 movdqa xmm3, xmm2 202 psllq xmm2, 32 203 psrlq xmm3, 32 204 psllq xmm3, 32 205 paddd xmm2, xmm3 206 paddd sad_reg, xmm2 ; sqsum 207 208 movdqa xmm2, [%6] 209 movdqa xmm3, xmm1 210 psadbw xmm3, xmm0 211 paddd sum_reg, xmm3 ; sum_cur 212 movdqa xmm3, xmm2 213 psadbw xmm3, xmm0 214 pslldq xmm3, 4 215 paddd sum_reg, xmm3 ; sum_ref 216 217 movdqa xmm3, xmm1 218 pmaxub xmm3, xmm2 219 pminub xmm2, xmm1 220 psubb xmm3, xmm2 ; abs diff 221 pmaxub mad_reg, xmm3 ; max abs diff 222 223 movdqa xmm1, xmm3 224 psadbw xmm3, xmm0 225 paddd sad_reg, xmm3 ; sad 226 227 movdqa xmm3, xmm1 228 punpcklbw xmm1, xmm0 229 punpckhbw xmm3, xmm0 230 pmaddwd xmm1, xmm1 231 pmaddwd xmm3, xmm3 232 paddd sqdiff_reg, xmm1 233 paddd sqdiff_reg, xmm3 ; sqdiff 234 235 add %5, %7 236 add %6, %7 237%endmacro 238 239 240;*********************************************************************** 241; Code 242;*********************************************************************** 243 244SECTION .text 245 246%ifdef X86_32 247 248;*********************************************************************** 249; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); 250;*********************************************************************** 251WELS_EXTERN SampleVariance16x16_sse2 252 push esi 253 push edi 254 push ebx 255 256 sub esp, 16 257 %define SUM [esp] 258 %define SUM_CUR [esp+4] 259 %define SQR [esp+8] 260 %define SQR_CUR [esp+12] 261 %define PUSH_SIZE 28 ; 12 + 16 262 263 mov edi, [esp+PUSH_SIZE+4] ; y_ref 264 mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride 265 mov esi, [esp+PUSH_SIZE+12] ; y_src 266 mov 
eax, [esp+PUSH_SIZE+16] ; y_src_stride 267 mov ecx, 010h ; height = 16 268 269 pxor xmm7, xmm7 270 movdqu SUM, xmm7 271 272.hloops: 273 movdqa xmm0, [edi] ; y_ref 274 movdqa xmm1, [esi] ; y_src 275 movdqa xmm2, xmm0 ; store first for future process 276 movdqa xmm3, xmm1 277 ; sum += diff; 278 movdqa xmm4, xmm0 279 psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] 280 ; to be continued for sum 281 pshufd xmm5, xmm4, 0C6h ; 11000110 B 282 paddw xmm4, xmm5 283 movd ebx, xmm4 284 add SUM, ebx 285 286 ; sqr += diff * diff; 287 pmaxub xmm0, xmm1 288 pminub xmm1, xmm2 289 psubb xmm0, xmm1 ; diff 290 SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero 291 movd ebx, xmm1 292 add SQR, ebx 293 294 ; sum_cur += y_src[x]; 295 movdqa xmm0, xmm3 ; cur_orig 296 movdqa xmm1, xmm0 297 punpcklbw xmm0, xmm7 298 punpckhbw xmm1, xmm7 299 paddw xmm0, xmm1 ; 8x2 300 SUM_WORD_8x2_SSE2 xmm0, xmm1 301 movd ebx, xmm0 302 and ebx, 0ffffh 303 add SUM_CUR, ebx 304 305 ; sqr_cur += y_src[x] * y_src[x]; 306 SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero 307 movd ebx, xmm0 308 add SQR_CUR, ebx 309 310 lea edi, [edi+edx] 311 lea esi, [esi+eax] 312 dec ecx 313 jnz near .hloops 314 315 mov ebx, 0 316 mov bx, word SUM 317 sar ebx, 8 318 imul ebx, ebx 319 mov ecx, SQR 320 sar ecx, 8 321 sub ecx, ebx 322 mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture 323 mov [edi], cx ; to store uiMotionIndex 324 mov ebx, 0 325 mov bx, word SUM_CUR 326 sar ebx, 8 327 imul ebx, ebx 328 mov ecx, SQR_CUR 329 sar ecx, 8 330 sub ecx, ebx 331 mov [edi+2], cx ; to store uiTextureIndex 332 333 %undef SUM 334 %undef SUM_CUR 335 %undef SQR 336 %undef SQR_CUR 337 %undef PUSH_SIZE 338 339 add esp, 16 340 pop ebx 341 pop edi 342 pop esi 343 344 ret 345 346 347 348;************************************************************************************************************* 349;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 350; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) 351;************************************************************************************************************* 352 353 354WELS_EXTERN VAACalcSad_sse2 355%define cur_data esp + pushsize + 4 356%define ref_data esp + pushsize + 8 357%define iPicWidth esp + pushsize + 12 358%define iPicHeight esp + pushsize + 16 359%define iPicStride esp + pushsize + 20 360%define psadframe esp + pushsize + 24 361%define psad8x8 esp + pushsize + 28 362%define pushsize 12 363 push esi 364 push edi 365 push ebx 366 mov esi, [cur_data] 367 mov edi, [ref_data] 368 mov ebx, [iPicStride] 369 mov edx, [psad8x8] 370 mov eax, ebx 371 372 shr dword [iPicWidth], 4 ; iPicWidth/16 373 shr dword [iPicHeight], 4 ; iPicHeight/16 374 shl eax, 4 ; iPicStride*16 375 pxor xmm0, xmm0 376 pxor xmm7, xmm7 ; iFrameSad 377height_loop: 378 mov ecx, dword [iPicWidth] 379 push esi 380 push edi 381width_loop: 382 pxor xmm6, xmm6 ; 383 WELS_SAD_16x2_SSE2 esi,edi,ebx 384 WELS_SAD_16x2_SSE2 esi,edi,ebx 385 WELS_SAD_16x2_SSE2 esi,edi,ebx 386 WELS_SAD_16x2_SSE2 esi,edi,ebx 387 paddd xmm7, xmm6 388 movd [edx], xmm6 389 psrldq xmm6, 8 390 movd [edx+4], xmm6 391 392 pxor xmm6, xmm6 393 WELS_SAD_16x2_SSE2 esi,edi,ebx 394 WELS_SAD_16x2_SSE2 esi,edi,ebx 395 WELS_SAD_16x2_SSE2 esi,edi,ebx 396 WELS_SAD_16x2_SSE2 esi,edi,ebx 397 paddd xmm7, xmm6 398 movd [edx+8], xmm6 399 psrldq xmm6, 8 400 movd [edx+12], xmm6 401 402 add edx, 16 403 sub esi, eax 404 sub edi, eax 405 add esi, 16 406 add edi, 16 407 408 dec ecx 409 jnz width_loop 410 411 pop edi 412 pop esi 413 add esi, eax 414 
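    ; esi/edi were just restored to the start of this block row; this add and the
    ; next one advance them by eax (= iPicStride*16) to the next row of 16x16 blocks.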
add edi, eax 415 416 dec dword [iPicHeight] 417 jnz height_loop 418 419 mov edx, [psadframe] 420 movdqa xmm5, xmm7 421 psrldq xmm7, 8 422 paddd xmm7, xmm5 423 movd [edx], xmm7 424 425%undef cur_data 426%undef ref_data 427%undef iPicWidth 428%undef iPicHeight 429%undef iPicStride 430%undef psadframe 431%undef psad8x8 432%undef pushsize 433 pop ebx 434 pop edi 435 pop esi 436 ret 437 438%else ;64-bit 439 440;*********************************************************************** 441; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture ); 442;*********************************************************************** 443WELS_EXTERN SampleVariance16x16_sse2 444 %define SUM r10;[esp] 445 %define SUM_CUR r11;[esp+4] 446 %define SQR r13;[esp+8] 447 %define SQR_CUR r15;[esp+12] 448 449 push r12 450 push r13 451 push r14 452 push r15 453 %assign push_num 4 454 LOAD_5_PARA 455 PUSH_XMM 8 456 SIGN_EXTENSION r1,r1d 457 SIGN_EXTENSION r3,r3d 458 459 mov r12,010h 460 pxor xmm7, xmm7 461 movq SUM, xmm7 462 movq SUM_CUR,xmm7 463 movq SQR,xmm7 464 movq SQR_CUR,xmm7 465 466.hloops: 467 mov r14,0 468 movdqa xmm0, [r0] ; y_ref 469 movdqa xmm1, [r2] ; y_src 470 movdqa xmm2, xmm0 ; store first for future process 471 movdqa xmm3, xmm1 472 ; sum += diff; 473 movdqa xmm4, xmm0 474 psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79] 475 ; to be continued for sum 476 pshufd xmm5, xmm4, 0C6h ; 11000110 B 477 paddw xmm4, xmm5 478 movd r14d, xmm4 479 add SUM, r14 480 481 ; sqr += diff * diff; 482 pmaxub xmm0, xmm1 483 pminub xmm1, xmm2 484 psubb xmm0, xmm1 ; diff 485 SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero 486 movd r14d, xmm1 487 add SQR, r14 488 489 ; sum_cur += y_src[x]; 490 movdqa xmm0, xmm3 ; cur_orig 491 movdqa xmm1, xmm0 492 punpcklbw xmm0, xmm7 493 punpckhbw xmm1, xmm7 494 paddw xmm0, xmm1 ; 8x2 495 SUM_WORD_8x2_SSE2 xmm0, xmm1 496 movd r14d, xmm0 497 and r14, 0ffffh 498 add SUM_CUR, r14 499 500 ; sqr_cur += y_src[x] * y_src[x]; 501 SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero 502 movd r14d, xmm0 503 add SQR_CUR, r14 504 505 lea r0, [r0+r1] 506 lea r2, [r2+r3] 507 dec r12 508 jnz near .hloops 509 510 mov r0, SUM 511 sar r0, 8 512 imul r0, r0 513 mov r1, SQR 514 sar r1, 8 515 sub r1, r0 516 mov [r4], r1w ; to store uiMotionIndex 517 mov r0, SUM_CUR 518 sar r0, 8 519 imul r0, r0 520 mov r1, SQR_CUR 521 sar r1, 8 522 sub r1, r0 523 mov [r4+2], r1w ; to store uiTextureIndex 524 525 POP_XMM 526 LOAD_5_PARA_POP 527 pop r15 528 pop r14 529 pop r13 530 pop r12 531 532 533 %assign push_num 0 534 535 ret 536 537 538;************************************************************************************************************* 539;void VAACalcSad_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 540; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) 541;************************************************************************************************************* 542 543 544WELS_EXTERN VAACalcSad_sse2 545%define cur_data r0 546%define ref_data r1 547%define iPicWidth r2 548%define iPicHeight r3 549%define iPicStride r4 550%define psadframe r5 551%define psad8x8 r6 552 553 push r12 554 push r13 555 %assign push_num 2 556 LOAD_7_PARA 557 PUSH_XMM 8 558 SIGN_EXTENSION r2,r2d 559 SIGN_EXTENSION r3,r3d 560 SIGN_EXTENSION r4,r4d 561 562 mov r12,r4 563 shr r2, 4 ; iPicWidth/16 564 shr r3, 4 ; iPicHeight/16 565 566 shl r12, 4 ; iPicStride*16 567 pxor xmm0, xmm0 568 pxor xmm7, xmm7 ; 
iFrameSad 569height_loop: 570 mov r13, r2 571 push r0 572 push r1 573width_loop: 574 pxor xmm6, xmm6 575 WELS_SAD_16x2_SSE2 r0,r1,r4 576 WELS_SAD_16x2_SSE2 r0,r1,r4 577 WELS_SAD_16x2_SSE2 r0,r1,r4 578 WELS_SAD_16x2_SSE2 r0,r1,r4 579 paddd xmm7, xmm6 580 movd [r6], xmm6 581 psrldq xmm6, 8 582 movd [r6+4], xmm6 583 584 pxor xmm6, xmm6 585 WELS_SAD_16x2_SSE2 r0,r1,r4 586 WELS_SAD_16x2_SSE2 r0,r1,r4 587 WELS_SAD_16x2_SSE2 r0,r1,r4 588 WELS_SAD_16x2_SSE2 r0,r1,r4 589 paddd xmm7, xmm6 590 movd [r6+8], xmm6 591 psrldq xmm6, 8 592 movd [r6+12], xmm6 593 594 add r6, 16 595 sub r0, r12 596 sub r1, r12 597 add r0, 16 598 add r1, 16 599 600 dec r13 601 jnz width_loop 602 603 pop r1 604 pop r0 605 add r0, r12 606 add r1, r12 607 608 dec r3 609 jnz height_loop 610 611 ;mov r13, [psadframe] 612 movdqa xmm5, xmm7 613 psrldq xmm7, 8 614 paddd xmm7, xmm5 615 movd [psadframe], xmm7 616 617%undef cur_data 618%undef ref_data 619%undef iPicWidth 620%undef iPicHeight 621%undef iPicStride 622%undef psadframe 623%undef psad8x8 624%undef pushsize 625 POP_XMM 626 LOAD_7_PARA_POP 627 pop r13 628 pop r12 629 %assign push_num 0 630 ret 631 632%endif 633 634 635%ifdef X86_32 636;************************************************************************************************************* 637;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 638; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) 639;************************************************************************************************************* 640 641 642WELS_EXTERN VAACalcSadVar_sse2 643%define localsize 8 644%define cur_data esp + pushsize + localsize + 4 645%define ref_data esp + pushsize + localsize + 8 646%define iPicWidth esp + pushsize + localsize + 12 647%define iPicHeight esp + pushsize + localsize + 16 648%define iPicStride esp + pushsize + localsize + 20 649%define psadframe esp + pushsize + localsize + 24 650%define psad8x8 esp + pushsize + localsize + 28 651%define psum16x16 esp + pushsize + localsize + 32 652%define psqsum16x16 esp + pushsize + localsize + 36 653%define tmp_esi esp + 0 654%define tmp_edi esp + 4 655%define pushsize 16 656 push ebp 657 push esi 658 push edi 659 push ebx 660 sub esp, localsize 661 mov esi, [cur_data] 662 mov edi, [ref_data] 663 mov ebx, [iPicStride] 664 mov edx, [psad8x8] 665 mov eax, ebx 666 667 shr dword [iPicWidth], 4 ; iPicWidth/16 668 shr dword [iPicHeight], 4 ; iPicHeight/16 669 shl eax, 4 ; iPicStride*16 670 pxor xmm0, xmm0 671 pxor xmm7, xmm7 ; iFrameSad 672var_height_loop: 673 mov ecx, dword [iPicWidth] 674 mov [tmp_esi], esi 675 mov [tmp_edi], edi 676var_width_loop: 677 pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 678 pxor xmm5, xmm5 ; pSum16x16 679 pxor xmm4, xmm4 ; sqsum_16x16 680 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 681 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 682 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 683 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 684 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 685 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 686 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 687 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 688 paddd xmm7, xmm6 689 movd [edx], xmm6 690 psrldq xmm6, 8 691 movd [edx+4], xmm6 692 693 pxor xmm6, xmm6 694 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 695 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 696 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 697 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 698 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 699 WELS_SAD_SUM_SQSUM_16x1_SSE2 
esi,edi,ebx 700 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 701 WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx 702 paddd xmm7, xmm6 703 movd [edx+8], xmm6 704 psrldq xmm6, 8 705 movd [edx+12], xmm6 706 707 mov ebp, [psum16x16] 708 movdqa xmm1, xmm5 709 psrldq xmm1, 8 710 paddd xmm5, xmm1 711 movd [ebp], xmm5 712 add dword [psum16x16], 4 713 714 movdqa xmm5, xmm4 715 psrldq xmm5, 8 716 paddd xmm4, xmm5 717 movdqa xmm3, xmm4 718 psrldq xmm3, 4 719 paddd xmm4, xmm3 720 721 mov ebp, [psqsum16x16] 722 movd [ebp], xmm4 723 add dword [psqsum16x16], 4 724 725 add edx, 16 726 sub esi, eax 727 sub edi, eax 728 add esi, 16 729 add edi, 16 730 731 dec ecx 732 jnz var_width_loop 733 734 mov esi, [tmp_esi] 735 mov edi, [tmp_edi] 736 add esi, eax 737 add edi, eax 738 739 dec dword [iPicHeight] 740 jnz var_height_loop 741 742 mov edx, [psadframe] 743 movdqa xmm5, xmm7 744 psrldq xmm7, 8 745 paddd xmm7, xmm5 746 movd [edx], xmm7 747 748 add esp, localsize 749 pop ebx 750 pop edi 751 pop esi 752 pop ebp 753%undef cur_data 754%undef ref_data 755%undef iPicWidth 756%undef iPicHeight 757%undef iPicStride 758%undef psadframe 759%undef psad8x8 760%undef psum16x16 761%undef psqsum16x16 762%undef tmp_esi 763%undef tmp_edi 764%undef pushsize 765%undef localsize 766 ret 767 768%else ;64-bit 769 770;************************************************************************************************************* 771;void VAACalcSadVar_sse2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 772; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) 773;************************************************************************************************************* 774 775 776WELS_EXTERN VAACalcSadVar_sse2 777%define cur_data arg1 ;r0 778%define ref_data arg2 ;r1 779%define iPicWidth arg3 ;r2 780%define iPicHeight arg4 ;r3 781%define iPicStride arg5 782%define psadframe arg6 783%define psad8x8 arg7 784%define psum16x16 arg8 785%define psqsum16x16 arg9 786 787 push r12 788 push r13 789 push r14 790 push r15 791 %assign push_num 4 792 PUSH_XMM 8 793 794%ifdef WIN64 795 mov r4, arg5 ;iPicStride 796 mov r5, arg6 ;psad8x8 797%endif 798 mov r14,arg7 799 SIGN_EXTENSION r2,r2d 800 SIGN_EXTENSION r3,r3d 801 SIGN_EXTENSION r4,r4d 802 803 mov r13,r4 804 shr r2,4 805 shr r3,4 806 807 shl r13,4 ; iPicStride*16 808 pxor xmm0, xmm0 809 pxor xmm7, xmm7 ; iFrameSad 810var_height_loop: 811 push r2 812 %assign push_num push_num+1 813 mov r11, r0 814 mov r12, r1 815var_width_loop: 816 pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8 817 pxor xmm5, xmm5 ; pSum16x16 818 pxor xmm4, xmm4 ; sqsum_16x16 819 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 820 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 821 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 822 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 823 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 824 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 825 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 826 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 827 paddd xmm7, xmm6 828 movd [r14], xmm6 829 psrldq xmm6, 8 830 movd [r14+4], xmm6 831 832 pxor xmm6, xmm6 833 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 834 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 835 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 836 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 837 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 838 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 839 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 840 WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4 841 paddd xmm7, xmm6 842 movd [r14+8], xmm6 843 psrldq xmm6, 8 844 movd [r14+12], xmm6 845 846 mov r15, psum16x16 847 
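    ; xmm5 holds two partial 16x16 sums, one per 64-bit lane (psadbw against zero);
    ; the next few instructions fold them and store the block sum through r15 (psum16x16).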
movdqa xmm1, xmm5 848 psrldq xmm1, 8 849 paddd xmm5, xmm1 850 movd [r15], xmm5 851 add dword psum16x16, 4 852 853 movdqa xmm5, xmm4 854 psrldq xmm5, 8 855 paddd xmm4, xmm5 856 movdqa xmm3, xmm4 857 psrldq xmm3, 4 858 paddd xmm4, xmm3 859 860 mov r15, psqsum16x16 861 movd [r15], xmm4 862 add dword psqsum16x16, 4 863 864 add r14,16 865 sub r0, r13 866 sub r1, r13 867 add r0, 16 868 add r1, 16 869 870 dec r2 871 jnz var_width_loop 872 873 pop r2 874 %assign push_num push_num-1 875 mov r0, r11 876 mov r1, r12 877 add r0, r13 878 add r1, r13 879 dec r3 880 jnz var_height_loop 881 882 mov r15, psadframe 883 movdqa xmm5, xmm7 884 psrldq xmm7, 8 885 paddd xmm7, xmm5 886 movd [r15], xmm7 887 888 POP_XMM 889 pop r15 890 pop r14 891 pop r13 892 pop r12 893%assign push_num 0 894%undef cur_data 895%undef ref_data 896%undef iPicWidth 897%undef iPicHeight 898%undef iPicStride 899%undef psadframe 900%undef psad8x8 901%undef psum16x16 902%undef psqsum16x16 903%undef tmp_esi 904%undef tmp_edi 905%undef pushsize 906%undef localsize 907 ret 908 909%endif 910 911%ifdef X86_32 912 913;************************************************************************************************************* 914;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 915; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) 916;************************************************************************************************************* 917 918 919WELS_EXTERN VAACalcSadSsd_sse2 920%define localsize 12 921%define cur_data esp + pushsize + localsize + 4 922%define ref_data esp + pushsize + localsize + 8 923%define iPicWidth esp + pushsize + localsize + 12 924%define iPicHeight esp + pushsize + localsize + 16 925%define iPicStride esp + pushsize + localsize + 20 926%define psadframe esp + pushsize + localsize + 24 927%define psad8x8 esp + pushsize + localsize + 28 928%define psum16x16 esp + pushsize + localsize + 32 929%define psqsum16x16 esp + pushsize + localsize + 36 930%define psqdiff16x16 esp + pushsize + localsize + 40 931%define tmp_esi esp + 0 932%define tmp_edi esp + 4 933%define tmp_sadframe esp + 8 934%define pushsize 16 935 push ebp 936 push esi 937 push edi 938 push ebx 939 sub esp, localsize 940 941 mov ecx, [iPicWidth] 942 mov ecx, [iPicHeight] 943 mov esi, [cur_data] 944 mov edi, [ref_data] 945 mov ebx, [iPicStride] 946 mov edx, [psad8x8] 947 mov eax, ebx 948 949 shr dword [iPicWidth], 4 ; iPicWidth/16 950 shr dword [iPicHeight], 4 ; iPicHeight/16 951 shl eax, 4 ; iPicStride*16 952 mov ecx, [iPicWidth] 953 mov ecx, [iPicHeight] 954 pxor xmm0, xmm0 955 movd [tmp_sadframe], xmm0 956sqdiff_height_loop: 957 mov ecx, dword [iPicWidth] 958 mov [tmp_esi], esi 959 mov [tmp_edi], edi 960sqdiff_width_loop: 961 pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 962 pxor xmm6, xmm6 ; pSum16x16 963 pxor xmm5, xmm5 ; sqsum_16x16 four dword 964 pxor xmm4, xmm4 ; sqdiff_16x16 four Dword 965 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 966 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 967 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 968 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 969 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 970 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 971 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 972 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 973 movdqa xmm1, xmm7 974 movd [edx], xmm7 975 psrldq xmm7, 8 976 paddd xmm1, xmm7 977 movd [edx+4], xmm7 978 movd ebp, xmm1 
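    ; ebp now holds sad0 + sad1 for the two 8x8 blocks just stored to psad8x8;
    ; it is folded into the running frame SAD kept at [tmp_sadframe].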
979 add [tmp_sadframe], ebp 980 981 pxor xmm7, xmm7 982 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 983 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 984 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 985 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 986 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 987 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 988 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 989 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx 990 movdqa xmm1, xmm7 991 movd [edx+8], xmm7 992 psrldq xmm7, 8 993 paddd xmm1, xmm7 994 movd [edx+12], xmm7 995 movd ebp, xmm1 996 add [tmp_sadframe], ebp 997 998 mov ebp, [psum16x16] 999 movdqa xmm1, xmm6 1000 psrldq xmm1, 8 1001 paddd xmm6, xmm1 1002 movd [ebp], xmm6 1003 add dword [psum16x16], 4 1004 1005 mov ebp, [psqsum16x16] 1006 pshufd xmm6, xmm5, 14 ;00001110 1007 paddd xmm6, xmm5 1008 pshufd xmm5, xmm6, 1 ;00000001 1009 paddd xmm5, xmm6 1010 movd [ebp], xmm5 1011 add dword [psqsum16x16], 4 1012 1013 mov ebp, [psqdiff16x16] 1014 pshufd xmm5, xmm4, 14 ; 00001110 1015 paddd xmm5, xmm4 1016 pshufd xmm4, xmm5, 1 ; 00000001 1017 paddd xmm4, xmm5 1018 movd [ebp], xmm4 1019 add dword [psqdiff16x16], 4 1020 1021 add edx, 16 1022 sub esi, eax 1023 sub edi, eax 1024 add esi, 16 1025 add edi, 16 1026 1027 dec ecx 1028 jnz sqdiff_width_loop 1029 1030 mov esi, [tmp_esi] 1031 mov edi, [tmp_edi] 1032 add esi, eax 1033 add edi, eax 1034 1035 dec dword [iPicHeight] 1036 jnz sqdiff_height_loop 1037 1038 mov ebx, [tmp_sadframe] 1039 mov eax, [psadframe] 1040 mov [eax], ebx 1041 1042 add esp, localsize 1043 pop ebx 1044 pop edi 1045 pop esi 1046 pop ebp 1047%undef cur_data 1048%undef ref_data 1049%undef iPicWidth 1050%undef iPicHeight 1051%undef iPicStride 1052%undef psadframe 1053%undef psad8x8 1054%undef psum16x16 1055%undef psqsum16x16 1056%undef psqdiff16x16 1057%undef tmp_esi 1058%undef tmp_edi 1059%undef tmp_sadframe 1060%undef pushsize 1061%undef localsize 1062 ret 1063 1064%else 1065 1066 1067;************************************************************************************************************* 1068;void VAACalcSadSsd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 1069; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) 1070;************************************************************************************************************* 1071 1072 1073WELS_EXTERN VAACalcSadSsd_sse2 1074%define localsize 12 1075%define cur_data arg1;r0 1076%define ref_data arg2;r1 1077%define iPicWidth arg3;r2 1078%define iPicHeight arg4;r3 1079%define iPicStride arg5; 1080%define psadframe arg6; 1081%define psad8x8 arg7; 1082%define psum16x16 arg8; 1083%define psqsum16x16 arg9; 1084%define psqdiff16x16 arg10 1085 1086 push r12 1087 push r13 1088 push r14 1089 push r15 1090 %assign push_num 4 1091 PUSH_XMM 10 1092 1093%ifdef WIN64 1094 mov r4,arg5 1095%endif 1096 mov r14,arg7 1097 SIGN_EXTENSION r2,r2d 1098 SIGN_EXTENSION r3,r3d 1099 SIGN_EXTENSION r4,r4d 1100 1101 mov r13,r4 1102 shr r2,4 ; iPicWidth/16 1103 shr r3,4 ; iPicHeight/16 1104 shl r13,4 ; iPicStride*16 1105 pxor xmm0, xmm0 1106 pxor xmm8, xmm8 ;framesad 1107 pxor xmm9, xmm9 1108sqdiff_height_loop: 1109 ;mov ecx, dword [iPicWidth] 1110 ;mov r14,r2 1111 push r2 1112 %assign push_num push_num +1 1113 mov r10, r0 1114 mov r11, r1 1115sqdiff_width_loop: 1116 pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8 1117 pxor xmm6, xmm6 ; pSum16x16 1118 pxor xmm5, xmm5 ; sqsum_16x16 four dword 
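    ; the eight WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 calls below update xmm7 (sad),
    ; xmm6 (sum), xmm5 (sqsum) and xmm4 (sqdiff, cleared next) one 16-pixel row at a time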
1119 pxor xmm4, xmm4 ; sqdiff_16x16 four Dword 1120 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1121 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1122 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1123 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1124 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1125 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1126 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1127 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1128 movdqa xmm1, xmm7 1129 movd [r14], xmm7 1130 psrldq xmm7, 8 1131 paddd xmm1, xmm7 1132 movd [r14+4], xmm7 1133 movd r15d, xmm1 1134 movd xmm9, r15d 1135 paddd xmm8,xmm9 1136 1137 1138 pxor xmm7, xmm7 1139 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1140 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1141 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1142 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1143 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1144 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1145 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1146 WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4 1147 movdqa xmm1, xmm7 1148 movd [r14+8], xmm7 1149 psrldq xmm7, 8 1150 paddd xmm1, xmm7 1151 movd [r14+12], xmm7 1152 movd r15d, xmm1 1153 movd xmm9, r15d 1154 paddd xmm8,xmm9 1155 1156 mov r15, psum16x16 1157 movdqa xmm1, xmm6 1158 psrldq xmm1, 8 1159 paddd xmm6, xmm1 1160 movd [r15], xmm6 1161 add dword psum16x16, 4 1162 1163 mov r15, psqsum16x16 1164 pshufd xmm6, xmm5, 14 ;00001110 1165 paddd xmm6, xmm5 1166 pshufd xmm5, xmm6, 1 ;00000001 1167 paddd xmm5, xmm6 1168 movd [r15], xmm5 1169 add dword psqsum16x16, 4 1170 1171 mov r15, psqdiff16x16 1172 pshufd xmm5, xmm4, 14 ; 00001110 1173 paddd xmm5, xmm4 1174 pshufd xmm4, xmm5, 1 ; 00000001 1175 paddd xmm4, xmm5 1176 movd [r15], xmm4 1177 add dword psqdiff16x16, 4 1178 1179 add r14,16 1180 sub r0, r13 1181 sub r1, r13 1182 add r0, 16 1183 add r1, 16 1184 1185 dec r2 1186 jnz sqdiff_width_loop 1187 1188 pop r2 1189 %assign push_num push_num -1 1190 1191 mov r0, r10 1192 mov r1, r11 1193 add r0, r13 1194 add r1, r13 1195 1196 dec r3 1197 jnz sqdiff_height_loop 1198 1199 mov r13, psadframe 1200 movd [r13], xmm8 1201 1202 POP_XMM 1203 pop r15 1204 pop r14 1205 pop r13 1206 pop r12 1207 %assign push_num 0 1208 1209%undef cur_data 1210%undef ref_data 1211%undef iPicWidth 1212%undef iPicHeight 1213%undef iPicStride 1214%undef psadframe 1215%undef psad8x8 1216%undef psum16x16 1217%undef psqsum16x16 1218%undef psqdiff16x16 1219%undef tmp_esi 1220%undef tmp_edi 1221%undef tmp_sadframe 1222%undef pushsize 1223%undef localsize 1224 ret 1225 1226 1227 1228%endif 1229 1230%ifdef X86_32 1231;************************************************************************************************************* 1232;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 1233; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) 1234;************************************************************************************************************* 1235 1236 1237WELS_EXTERN VAACalcSadBgd_sse2 1238%define localsize 12 1239%define cur_data esp + pushsize + localsize + 4 1240%define ref_data esp + pushsize + localsize + 8 1241%define iPicWidth esp + pushsize + localsize + 12 1242%define iPicHeight esp + pushsize + localsize + 16 1243%define iPicStride esp + pushsize + localsize + 20 1244%define psadframe esp + pushsize + localsize + 24 1245%define psad8x8 esp + pushsize + localsize + 28 1246%define p_sd8x8 esp + pushsize + localsize + 32 
1247%define p_mad8x8 esp + pushsize + localsize + 36 1248%define tmp_esi esp + 0 1249%define tmp_edi esp + 4 1250%define tmp_ecx esp + 8 1251%define pushsize 16 1252 push ebp 1253 push esi 1254 push edi 1255 push ebx 1256 sub esp, localsize 1257 mov esi, [cur_data] 1258 mov edi, [ref_data] 1259 mov ebx, [iPicStride] 1260 mov eax, ebx 1261 1262 shr dword [iPicWidth], 4 ; iPicWidth/16 1263 shr dword [iPicHeight], 4 ; iPicHeight/16 1264 shl eax, 4 ; iPicStride*16 1265 xor ebp, ebp 1266 pxor xmm0, xmm0 1267bgd_height_loop: 1268 mov ecx, dword [iPicWidth] 1269 mov [tmp_esi], esi 1270 mov [tmp_edi], edi 1271bgd_width_loop: 1272 pxor xmm7, xmm7 ; pSad8x8 1273 pxor xmm6, xmm6 ; sum_cur_8x8 1274 pxor xmm5, xmm5 ; sum_ref_8x8 1275 pxor xmm4, xmm4 ; pMad8x8 1276 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1277 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1278 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1279 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1280 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1281 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1282 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1283 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1284 1285 1286 mov edx, [p_mad8x8] 1287 WELS_MAX_REG_SSE2 xmm4 1288 1289 ;movdqa xmm1, xmm4 1290 ;punpcklbw xmm1, xmm0 1291 ;punpcklwd xmm1, xmm0 1292 ;movd [edx], xmm1 1293 ;punpckhbw xmm4, xmm0 1294 ;punpcklwd xmm4, xmm0 1295 ;movd [edx+4], xmm4 1296 ;add edx, 8 1297 ;mov [p_mad8x8], edx 1298 mov [tmp_ecx], ecx 1299 movhlps xmm1, xmm4 1300 movd ecx, xmm4 1301 mov [edx], cl 1302 movd ecx, xmm1 1303 mov [edx+1],cl 1304 add edx, 2 1305 mov [p_mad8x8], edx 1306 1307 1308 pslldq xmm7, 4 1309 pslldq xmm6, 4 1310 pslldq xmm5, 4 1311 1312 1313 pxor xmm4, xmm4 ; pMad8x8 1314 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1315 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1316 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1317 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1318 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1319 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1320 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1321 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx 1322 1323 mov edx, [p_mad8x8] 1324 WELS_MAX_REG_SSE2 xmm4 1325 1326 ;movdqa xmm1, xmm4 1327 ;punpcklbw xmm1, xmm0 1328 ;punpcklwd xmm1, xmm0 1329 ;movd [edx], xmm1 1330 ;punpckhbw xmm4, xmm0 1331 ;punpcklwd xmm4, xmm0 1332 ;movd [edx+4], xmm4 1333 ;add edx, 8 1334 ;mov [p_mad8x8], edx 1335 movhlps xmm1, xmm4 1336 movd ecx, xmm4 1337 mov [edx], cl 1338 movd ecx, xmm1 1339 mov [edx+1],cl 1340 add edx, 2 1341 mov [p_mad8x8], edx 1342 1343 ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 1344 1345 mov edx, [psad8x8] 1346 pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 1347 movdqa [edx], xmm1 1348 add edx, 16 1349 mov [psad8x8], edx ; sad8x8 1350 1351 paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 1352 pshufd xmm2, xmm1, 00000011b 1353 paddd xmm1, xmm2 1354 movd edx, xmm1 1355 add ebp, edx ; sad frame 1356 1357 mov edx, [p_sd8x8] 1358 psubd xmm6, xmm5 1359 pshufd xmm1, xmm6, 10001101b 1360 movdqa [edx], xmm1 1361 add edx, 16 1362 mov [p_sd8x8], edx 1363 1364 1365 add edx, 16 1366 sub esi, eax 1367 sub edi, eax 1368 add esi, 16 1369 add edi, 16 1370 1371 mov ecx, [tmp_ecx] 1372 dec ecx 1373 jnz bgd_width_loop 1374 1375 mov esi, 
[tmp_esi] 1376 mov edi, [tmp_edi] 1377 add esi, eax 1378 add edi, eax 1379 1380 dec dword [iPicHeight] 1381 jnz bgd_height_loop 1382 1383 mov edx, [psadframe] 1384 mov [edx], ebp 1385 1386 add esp, localsize 1387 pop ebx 1388 pop edi 1389 pop esi 1390 pop ebp 1391%undef cur_data 1392%undef ref_data 1393%undef iPicWidth 1394%undef iPicHeight 1395%undef iPicStride 1396%undef psadframe 1397%undef psad8x8 1398%undef p_sd8x8 1399%undef p_mad8x8 1400%undef tmp_esi 1401%undef tmp_edi 1402%undef pushsize 1403%undef localsize 1404 ret 1405 1406 1407 1408;************************************************************************************************************* 1409;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 1410; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 1411; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) 1412;************************************************************************************************************* 1413 1414 1415WELS_EXTERN VAACalcSadSsdBgd_sse2 1416%define localsize 16 1417%define cur_data esp + pushsize + localsize + 4 1418%define ref_data esp + pushsize + localsize + 8 1419%define iPicWidth esp + pushsize + localsize + 12 1420%define iPicHeight esp + pushsize + localsize + 16 1421%define iPicStride esp + pushsize + localsize + 20 1422%define psadframe esp + pushsize + localsize + 24 1423%define psad8x8 esp + pushsize + localsize + 28 1424%define psum16x16 esp + pushsize + localsize + 32 1425%define psqsum16x16 esp + pushsize + localsize + 36 1426%define psqdiff16x16 esp + pushsize + localsize + 40 1427%define p_sd8x8 esp + pushsize + localsize + 44 1428%define p_mad8x8 esp + pushsize + localsize + 48 1429%define tmp_esi esp + 0 1430%define tmp_edi esp + 4 1431%define tmp_sadframe esp + 8 1432%define tmp_ecx esp + 12 1433%define pushsize 16 1434 push ebp 1435 push esi 1436 push edi 1437 push ebx 1438 sub esp, localsize 1439 mov esi, [cur_data] 1440 mov edi, [ref_data] 1441 mov ebx, [iPicStride] 1442 mov eax, ebx 1443 1444 shr dword [iPicWidth], 4 ; iPicWidth/16 1445 shr dword [iPicHeight], 4 ; iPicHeight/16 1446 shl eax, 4 ; iPicStride*16 1447 pxor xmm0, xmm0 1448 movd [tmp_sadframe], xmm0 1449sqdiff_bgd_height_loop: 1450 mov ecx, dword [iPicWidth] 1451 mov [tmp_esi], esi 1452 mov [tmp_edi], edi 1453sqdiff_bgd_width_loop: 1454 pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 1455 pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 1456 pxor xmm5, xmm5 ; pMad8x8 1457 pxor xmm4, xmm4 ; sqdiff_16x16 four Dword 1458 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1459 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1460 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1461 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1462 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1463 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1464 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1465 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1466 1467 mov edx, [psad8x8] 1468 movdqa xmm2, xmm7 1469 pshufd xmm1, xmm2, 00001110b 1470 movd [edx], xmm2 1471 movd [edx+4], xmm1 1472 add edx, 8 1473 mov [psad8x8], edx ; sad8x8 1474 1475 paddd xmm1, xmm2 1476 movd edx, xmm1 1477 add [tmp_sadframe], edx ; iFrameSad 1478 1479 mov 
edx, [psum16x16] 1480 movdqa xmm1, xmm6 1481 pshufd xmm2, xmm1, 00001110b 1482 paddd xmm1, xmm2 1483 movd [edx], xmm1 ; sum 1484 1485 mov edx, [p_sd8x8] 1486 pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 1487 psubd xmm6, xmm1 ; 00 diff1 00 diff0 1488 pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 1489 movq [edx], xmm1 1490 add edx, 8 1491 mov [p_sd8x8], edx 1492 1493 mov edx, [p_mad8x8] 1494 WELS_MAX_REG_SSE2 xmm5 1495 ;movdqa xmm1, xmm5 1496 ;punpcklbw xmm1, xmm0 1497 ;punpcklwd xmm1, xmm0 1498 ;movd [edx], xmm1 1499 ;punpckhbw xmm5, xmm0 1500 ;punpcklwd xmm5, xmm0 1501 ;movd [edx+4], xmm5 1502 ;add edx, 8 1503 ;mov [p_mad8x8], edx 1504 mov [tmp_ecx], ecx 1505 movhlps xmm1, xmm5 1506 movd ecx, xmm5 1507 mov [edx], cl 1508 movd ecx, xmm1 1509 mov [edx+1],cl 1510 add edx, 2 1511 mov [p_mad8x8], edx 1512 1513 psrlq xmm7, 32 1514 psllq xmm7, 32 ; clear sad 1515 pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 1516 pxor xmm5, xmm5 ; pMad8x8 1517 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1518 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1519 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1520 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1521 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1522 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1523 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1524 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx 1525 1526 mov edx, [psad8x8] 1527 movdqa xmm2, xmm7 1528 pshufd xmm1, xmm2, 00001110b 1529 movd [edx], xmm2 1530 movd [edx+4], xmm1 1531 add edx, 8 1532 mov [psad8x8], edx ; sad8x8 1533 1534 paddd xmm1, xmm2 1535 movd edx, xmm1 1536 add [tmp_sadframe], edx ; iFrameSad 1537 1538 mov edx, [psum16x16] 1539 movdqa xmm1, xmm6 1540 pshufd xmm2, xmm1, 00001110b 1541 paddd xmm1, xmm2 1542 movd ebp, xmm1 ; sum 1543 add [edx], ebp 1544 add edx, 4 1545 mov [psum16x16], edx 1546 1547 mov edx, [psqsum16x16] 1548 psrlq xmm7, 32 1549 pshufd xmm2, xmm7, 00001110b 1550 paddd xmm2, xmm7 1551 movd [edx], xmm2 ; sqsum 1552 add edx, 4 1553 mov [psqsum16x16], edx 1554 1555 mov edx, [p_sd8x8] 1556 pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 1557 psubd xmm6, xmm1 ; 00 diff1 00 diff0 1558 pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 1559 movq [edx], xmm1 1560 add edx, 8 1561 mov [p_sd8x8], edx 1562 1563 mov edx, [p_mad8x8] 1564 WELS_MAX_REG_SSE2 xmm5 1565 ;movdqa xmm1, xmm5 1566 ;punpcklbw xmm1, xmm0 1567 ;punpcklwd xmm1, xmm0 1568 ;movd [edx], xmm1 1569 ;punpckhbw xmm5, xmm0 1570 ;punpcklwd xmm5, xmm0 1571 ;movd [edx+4], xmm5 1572 ;add edx, 8 1573 ;mov [p_mad8x8], edx 1574 movhlps xmm1, xmm5 1575 movd ecx, xmm5 1576 mov [edx], cl 1577 movd ecx, xmm1 1578 mov [edx+1],cl 1579 add edx, 2 1580 mov [p_mad8x8], edx 1581 1582 mov edx, [psqdiff16x16] 1583 pshufd xmm1, xmm4, 00001110b 1584 paddd xmm4, xmm1 1585 pshufd xmm1, xmm4, 00000001b 1586 paddd xmm4, xmm1 1587 movd [edx], xmm4 1588 add edx, 4 1589 mov [psqdiff16x16], edx 1590 1591 add edx, 16 1592 sub esi, eax 1593 sub edi, eax 1594 add esi, 16 1595 add edi, 16 1596 1597 mov ecx, [tmp_ecx] 1598 dec ecx 1599 jnz sqdiff_bgd_width_loop 1600 1601 mov esi, [tmp_esi] 1602 mov edi, [tmp_edi] 1603 add esi, eax 1604 add edi, eax 1605 1606 dec dword [iPicHeight] 1607 jnz sqdiff_bgd_height_loop 1608 1609 mov edx, [psadframe] 1610 mov ebp, [tmp_sadframe] 1611 mov [edx], ebp 1612 1613 add esp, 
localsize 1614 pop ebx 1615 pop edi 1616 pop esi 1617 pop ebp 1618%undef cur_data 1619%undef ref_data 1620%undef iPicWidth 1621%undef iPicHeight 1622%undef iPicStride 1623%undef psadframe 1624%undef psad8x8 1625%undef psum16x16 1626%undef psqsum16x16 1627%undef psqdiff16x16 1628%undef p_sd8x8 1629%undef p_mad8x8 1630%undef tmp_esi 1631%undef tmp_edi 1632%undef pushsize 1633%undef localsize 1634 ret 1635%else 1636 1637;************************************************************************************************************* 1638;void VAACalcSadBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 1639; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) 1640;************************************************************************************************************* 1641 1642 1643WELS_EXTERN VAACalcSadBgd_sse2 1644%define cur_data arg1; 1645%define ref_data arg2; 1646%define iPicWidth arg3; 1647%define iPicHeight arg4; 1648%define iPicStride arg5; 1649%define psadframe arg6; 1650%define psad8x8 arg7; 1651%define p_sd8x8 arg8; 1652%define p_mad8x8 arg9; 1653 1654 push r12 1655 push r13 1656 push r14 1657 push r15 1658%assign push_num 4 1659 PUSH_XMM 10 1660%ifdef WIN64 1661 mov r4,arg5 1662 ; mov r5,arg6 1663%endif 1664 mov r14,arg7 1665 SIGN_EXTENSION r2,r2d 1666 SIGN_EXTENSION r3,r3d 1667 SIGN_EXTENSION r4,r4d 1668 1669 1670 mov r13,r4 1671 mov r15,r0 1672 shr r2,4 1673 shr r3,4 1674 shl r13,4 1675 pxor xmm0, xmm0 1676 pxor xmm8, xmm8 1677 pxor xmm9, xmm9 1678bgd_height_loop: 1679 ;mov ecx, dword [iPicWidth] 1680 push r2 1681 %assign push_num push_num+1 1682 mov r10, r15 1683 mov r11, r1 1684bgd_width_loop: 1685 pxor xmm7, xmm7 ; pSad8x8 1686 pxor xmm6, xmm6 ; sum_cur_8x8 1687 pxor xmm5, xmm5 ; sum_ref_8x8 1688 pxor xmm4, xmm4 ; pMad8x8 1689 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1690 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1691 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1692 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1693 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1694 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1695 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1696 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1697 1698 1699 mov r14, p_mad8x8 1700 WELS_MAX_REG_SSE2 xmm4 1701 1702 ;mov [tmp_ecx], ecx 1703 movhlps xmm1, xmm4 1704 movd r0d, xmm4 1705 1706 1707 mov [r14], r0b 1708 movd r0d, xmm1 1709 mov [r14+1],r0b 1710 add r14, 2 1711 ;mov p_mad8x8, r14 1712 1713 1714 pslldq xmm7, 4 1715 pslldq xmm6, 4 1716 pslldq xmm5, 4 1717 1718 1719 pxor xmm4, xmm4 ; pMad8x8 1720 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1721 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1722 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1723 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1724 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1725 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1726 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1727 WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4 1728 1729 ;mov r14, [p_mad8x8] 1730 WELS_MAX_REG_SSE2 xmm4 1731 1732 movhlps xmm1, xmm4 1733 movd r0d, xmm4 1734 mov [r14], r0b 1735 movd r0d, xmm1 1736 mov [r14+1],r0b 1737 add r14, 2 1738 mov p_mad8x8, r14 1739 1740 ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2 1741 1742 
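    ; The pshufd below reorders the accumulator dwords so the four 8x8 results are
    ; stored to psad8x8 in ascending block order (D0 D1 D2 D3 in memory).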
mov r14, psad8x8 1743 pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0 1744 movdqa [r14], xmm1 1745 add r14, 16 1746 mov psad8x8, r14 ; sad8x8 1747 1748 paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0 1749 pshufd xmm2, xmm1, 00000011b 1750 paddd xmm1, xmm2 1751 movd r14d, xmm1 1752 movd xmm9, r14d 1753 paddd xmm8, xmm9 ; sad frame 1754 1755 mov r14, p_sd8x8 1756 psubd xmm6, xmm5 1757 pshufd xmm1, xmm6, 10001101b 1758 movdqa [r14], xmm1 1759 add r14, 16 1760 mov p_sd8x8, r14 1761 1762 1763 ;add edx, 16 1764 sub r15, r13 1765 sub r1, r13 1766 add r15, 16 1767 add r1, 16 1768 1769 1770 dec r2 1771 jnz bgd_width_loop 1772 pop r2 1773%assign push_num push_num-1 1774 mov r15, r10 1775 mov r1, r11 1776 add r15, r13 1777 add r1, r13 1778 1779 dec r3 1780 jnz bgd_height_loop 1781 1782 mov r13, psadframe 1783 movd [r13], xmm8 1784 1785 POP_XMM 1786 pop r15 1787 pop r14 1788 pop r13 1789 pop r12 1790%assign push_num 0 1791%undef cur_data 1792%undef ref_data 1793%undef iPicWidth 1794%undef iPicHeight 1795%undef iPicStride 1796%undef psadframe 1797%undef psad8x8 1798%undef p_sd8x8 1799%undef p_mad8x8 1800%undef tmp_esi 1801%undef tmp_edi 1802%undef pushsize 1803%undef localsize 1804 ret 1805 1806 1807 1808;************************************************************************************************************* 1809;void VAACalcSadSsdBgd_sse2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 1810; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 1811; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) 1812;************************************************************************************************************* 1813 1814 1815WELS_EXTERN VAACalcSadSsdBgd_sse2 1816%define cur_data arg1; 1817%define ref_data arg2; 1818%define iPicWidth arg3; 1819%define iPicHeight arg4; 1820%define iPicStride arg5; 1821%define psadframe arg6; 1822%define psad8x8 arg7; 1823%define psum16x16 arg8; 1824%define psqsum16x16 arg9; 1825%define psqdiff16x16 arg10; 1826%define p_sd8x8 arg11 1827%define p_mad8x8 arg12 1828 1829 push r12 1830 push r13 1831 push r14 1832 push r15 1833%assign push_num 4 1834 PUSH_XMM 10 1835%ifdef WIN64 1836 mov r4,arg5 1837 ;mov r5,arg6 1838%endif 1839 SIGN_EXTENSION r2,r2d 1840 SIGN_EXTENSION r3,r3d 1841 SIGN_EXTENSION r4,r4d 1842 1843 mov r13,r4 1844 shr r2, 4 ; iPicWidth/16 1845 shr r3, 4 ; iPicHeight/16 1846 shl r13, 4 ; iPicStride*16 1847 pxor xmm0, xmm0 1848 pxor xmm8, xmm8 1849 pxor xmm9, xmm9 1850 1851 1852sqdiff_bgd_height_loop: 1853 mov r10, r0 1854 mov r11, r1 1855 push r2 1856%assign push_num push_num+1 1857sqdiff_bgd_width_loop: 1858 1859 pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0 1860 pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 1861 pxor xmm5, xmm5 ; pMad8x8 1862 pxor xmm4, xmm4 ; sqdiff_16x16 four Dword 1863 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1864 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1865 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1866 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1867 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1868 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1869 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1870 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1871 1872 mov r14, psad8x8 1873 movdqa xmm2, xmm7 1874 
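    ; xmm7 interleaves the block SADs with the sqsum halves (sqsum1 sad1 sqsum0 sad0);
    ; dword 0 (sad0) and dword 2 (sad1) are extracted below and stored to psad8x8.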
pshufd xmm1, xmm2, 00001110b 1875 movd [r14], xmm2 1876 movd [r14+4], xmm1 1877 add r14, 8 1878 mov psad8x8, r14 ; sad8x8 1879 1880 paddd xmm1, xmm2 1881 movd r14d, xmm1 1882 movd xmm9,r14d 1883 paddd xmm8, xmm9 ; iFrameSad 1884 1885 mov r14, psum16x16 1886 movdqa xmm1, xmm6 1887 pshufd xmm2, xmm1, 00001110b 1888 paddd xmm1, xmm2 1889 movd [r14], xmm1 ; sum 1890 1891 mov r14, p_sd8x8 1892 pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 1893 psubd xmm6, xmm1 ; 00 diff1 00 diff0 1894 pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 1895 movq [r14], xmm1 1896 add r14, 8 1897 mov p_sd8x8, r14 1898 1899 mov r14, p_mad8x8 1900 WELS_MAX_REG_SSE2 xmm5 1901 1902 movhlps xmm1, xmm5 1903 push r0 1904 movd r0d, xmm5 1905 mov [r14], r0b 1906 movd r0d, xmm1 1907 mov [r14+1],r0b 1908 pop r0 1909 add r14, 2 1910 mov p_mad8x8, r14 1911 1912 psrlq xmm7, 32 1913 psllq xmm7, 32 ; clear sad 1914 pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0 1915 pxor xmm5, xmm5 ; pMad8x8 1916 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1917 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1918 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1919 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1920 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1921 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1922 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1923 WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4 1924 1925 mov r14, psad8x8 1926 movdqa xmm2, xmm7 1927 pshufd xmm1, xmm2, 00001110b 1928 movd [r14], xmm2 1929 movd [r14+4], xmm1 1930 add r14, 8 1931 mov psad8x8, r14 ; sad8x8 1932 1933 paddd xmm1, xmm2 1934 movd r14d, xmm1 1935 movd xmm9, r14d 1936 paddd xmm8, xmm9 ; iFrameSad 1937 1938 mov r14, psum16x16 1939 movdqa xmm1, xmm6 1940 pshufd xmm2, xmm1, 00001110b 1941 paddd xmm1, xmm2 1942 movd r15d, xmm1 ; sum 1943 add [r14], r15d 1944 add r14, 4 1945 mov psum16x16, r14 1946 1947 mov r14, psqsum16x16 1948 psrlq xmm7, 32 1949 pshufd xmm2, xmm7, 00001110b 1950 paddd xmm2, xmm7 1951 movd [r14], xmm2 ; sqsum 1952 add r14, 4 1953 mov psqsum16x16, r14 1954 1955 mov r14, p_sd8x8 1956 pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0 1957 psubd xmm6, xmm1 ; 00 diff1 00 diff0 1958 pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0 1959 movq [r14], xmm1 1960 add r14, 8 1961 mov p_sd8x8, r14 1962 1963 mov r14, p_mad8x8 1964 WELS_MAX_REG_SSE2 xmm5 1965 1966 1967 movhlps xmm1, xmm5 1968 push r0 1969 movd r0d, xmm5 1970 mov [r14], r0b 1971 movd r0d, xmm1 1972 mov [r14+1],r0b 1973 pop r0 1974 add r14, 2 1975 mov p_mad8x8, r14 1976 1977 mov r14, psqdiff16x16 1978 pshufd xmm1, xmm4, 00001110b 1979 paddd xmm4, xmm1 1980 pshufd xmm1, xmm4, 00000001b 1981 paddd xmm4, xmm1 1982 movd [r14], xmm4 1983 add r14, 4 1984 mov psqdiff16x16, r14 1985 1986 add r14, 16 1987 sub r0, r13 1988 sub r1, r13 1989 add r0, 16 1990 add r1, 16 1991 1992 dec r2 1993 jnz sqdiff_bgd_width_loop 1994 pop r2 1995 %assign push_num push_num-1 1996 mov r0, r10 1997 mov r1, r11 1998 add r0, r13 1999 add r1, r13 2000 2001 dec r3 2002 jnz sqdiff_bgd_height_loop 2003 2004 mov r14, psadframe 2005 movd [r14], xmm8 2006 2007 POP_XMM 2008 pop r15 2009 pop r14 2010 pop r13 2011 pop r12 2012%assign push_num 0 2013%undef cur_data 2014%undef ref_data 2015%undef iPicWidth 2016%undef iPicHeight 2017%undef iPicStride 2018%undef psadframe 2019%undef psad8x8 2020%undef psum16x16 2021%undef 
psqsum16x16 2022%undef psqdiff16x16 2023%undef p_sd8x8 2024%undef p_mad8x8 2025%undef tmp_esi 2026%undef tmp_edi 2027%undef pushsize 2028%undef localsize 2029 ret 2030%endif 2031 2032%ifdef X86_32 2033%define ptrword dword 2034%else 2035%define ptrword qword 2036%endif 2037 2038%define xmm_width 16 2039%define ymm_width 32 2040 2041%macro PUSHM 1-* 2042 %rep %0 2043 push %1 2044 %rotate 1 2045 %endrep 2046 %assign push_num push_num + %0 2047%endmacro 2048 2049%macro POPM 1-* 2050 %rep %0 2051 %rotate -1 2052 pop %1 2053 %endrep 2054 %assign push_num push_num - %0 2055%endmacro 2056 2057%ifdef X86_32 2058%define stack_alloc_min 4 2059%else 2060%define stack_alloc_min 8 2061%endif 2062 2063; Allocate aligned stack space. 2064; address_out=%1 size=%2 alignment=%3 2065%macro STACK_ALLOC 3 2066%if (%3) & ((%3) - 1) 2067 %error non-power-of-2 alignment requested. 2068%endif 2069%if (%3) > 0 2070 %assign stack_alloc_align ((%3) + stack_alloc_min - 1) / stack_alloc_min 2071%else 2072 %assign stack_alloc_align 1 2073%endif 2074 %assign stack_alloc_num ((%2) + stack_alloc_min - 1) / stack_alloc_min + stack_alloc_align - 1 2075 %assign push_num push_num + stack_alloc_num 2076 sub r7, stack_alloc_min * stack_alloc_num 2077%if stack_alloc_align == 1 2078 mov %1, r7 2079%else 2080 lea %1, [r7 + stack_alloc_min * (stack_alloc_align - 1)] 2081 and %1, -(stack_alloc_min * stack_alloc_align) 2082%endif 2083%endmacro 2084 2085; Deallocate stack space allocated with STACK_ALLOC. 2086%macro STACK_DEALLOC 0 2087 add r7, stack_alloc_min * stack_alloc_num 2088 %assign push_num push_num - stack_alloc_num 2089%endmacro 2090 2091%ifdef HAVE_AVX2 2092; Max unsigned byte per quadword 2093; out=%1 in=%2 tmp=%3 2094%macro AVX2_Maxubq 3 2095 vpsrlq %3, %2, 32 2096 vpmaxub %1, %2, %3 2097 vpsrlq %3, %1, 16 2098 vpmaxub %1, %1, %3 2099 vpsrlq %3, %1, 8 2100 vpmaxub %1, %1, %3 2101%endmacro 2102 2103; Max unsigned byte per quadword. 2 register input. 2104; Results interleaved as least significant byte of even/odd doublewords. 
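; (one input's per-quadword maxima land in the even doublewords, the other input's in the odd doublewords)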
2105; out=%1 in_a=%2 in_b=%3 tmp=%4 2106%macro AVX2_Maxubq2 4 2107 vpblendd %4, %2, %3, 10101010b 2108 vpshufd %4, %4, 10110001b 2109 vpblendd %1, %2, %3, 01010101b 2110 vpmaxub %1, %4, %1 2111 vpsrld %4, %1, 16 2112 vpmaxub %1, %1, %4 2113 vpsrld %4, %1, 8 2114 vpmaxub %1, %1, %4 2115%endmacro 2116 2117; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5 2118%macro AVX2_Sqsumbdw 5 2119 vpunpcklbw %4, %2, %3 2120%if %5 2121 vpmaddwd %4, %4, %4 2122 vpaddd %1, %1, %4 2123%else 2124 vpmaddwd %1, %4, %4 2125%endif 2126 vpunpckhbw %4, %2, %3 2127 vpmaddwd %4, %4, %4 2128 vpaddd %1, %1, %4 2129%endmacro 2130 2131; res=%1 src=%2 zero=%3 tmp=%4 add_to_res=%5 2132%macro AVX2_Sumbdw 5 2133%if %5 2134 vpsadbw %4, %2, %3 2135 vpaddd %1, %1, %4 2136%else 2137 vpsadbw %1, %2, %3 2138%endif 2139%endmacro 2140 2141; res=%1 a=%2 b=%3 a=%4 tmp=%5 2142%macro AVX2_AbsDiffub 5 2143 vpsubusb %5, %2, %3 2144 vpsubusb %1, %3, %4 2145 vpor %1, %5, %1 2146%endmacro 2147 2148; sad=%1 cur_data=%2 ref_data=%3 tmp=%4 accumulate_results=%5 2149%macro AVX2_Sadbdw 5 2150%if %5 2151 vpsadbw %4, %2, %3 2152 vpaddd %1, %1, %4 2153%else 2154 vpsadbw %1, %2, %3 2155%endif 2156%endmacro 2157 2158; sad=%1 sum_cur=%2 sqsum_cur=%3 cur_data=%4 ref_data=%5 zero=%6 tmp=%7 accumulate_results=%8 2159%macro AVX2_SadSumSqsumbdw 8 2160 AVX2_Sadbdw %1, %4, %5, %7, %8 2161 AVX2_Sumbdw %2, %4, %6, %7, %8 2162 AVX2_Sqsumbdw %3, %4, %6, %7, %8 2163%endmacro 2164 2165; sad=%1 pCur=%2 pRef=%3 tmp=%4 accumulate_results=%5 2166%macro AVX2_Sad 5 2167 vmovdqu %4, [%2] 2168 AVX2_Sadbdw %1, %4, [%3], %4, %5 2169%endmacro 2170 2171; sad=%1 sum_cur=%2 sqsum_cur=%3 pCur=%4 pRef=%5 zero=%6 tmp=%7,%8 accumulate_results=%9 2172%macro AVX2_SadSumSqsum 9 2173 vmovdqu %7, [%4] 2174 AVX2_SadSumSqsumbdw %1, %2, %3, %7, [%5], %6, %8, %9 2175%endmacro 2176 2177; sad=%1 sum_cur=%2 sqsum_cur=%3 sqdiff=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11 2178%macro AVX2_SadSumSqsumSqdiff 11 2179 vmovdqu %8, [%5] 2180 vmovdqu %9, [%6] 2181 AVX2_SadSumSqsumbdw %1, %2, %3, %8, %9, %7, %10, %11 2182 AVX2_AbsDiffub %9, %8, %9, %8, %10 2183 AVX2_Sqsumbdw %4, %9, %7, %10, %11 2184%endmacro 2185 2186; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 pCur=%5 pRef=%6 zero=%7 tmp=%8,%9,%10 accumulate_results=%11 2187%macro AVX2_SadSdMad 11 2188 vmovdqu %8, [%5] 2189 vmovdqu %9, [%6] 2190 AVX2_Sumbdw %2, %8, %7, %10, %11 2191 AVX2_Sumbdw %3, %9, %7, %10, %11 2192 AVX2_Sadbdw %1, %8, %9, %10, %11 2193%if %11 2194 AVX2_AbsDiffub %9, %8, %9, %8, %10 2195 vpmaxub %4, %4, %9 2196%else 2197 AVX2_AbsDiffub %4, %8, %9, %8, %10 2198%endif 2199%endmacro 2200 2201; sad=%1 sum_cur=%2 sum_ref=%3 mad=%4 sqdiff=%5 sqsum_cur=%6 pCur=%7 pRef=%8 zero=%9 tmp=%10,%11,%12 accumulate_results=%13 2202%macro AVX2_SadBgdSqdiff 13 2203%ifidn %12, 0 2204 vmovdqu %10, [%7] 2205 AVX2_Sumbdw %2, %10, %9, %11, %13 2206 AVX2_Sqsumbdw %6, %10, %9, %11, %13 2207 vmovdqu %11, [%8] 2208 AVX2_Sadbdw %1, %10, %11, %10, %13 2209 AVX2_Sumbdw %3, %11, %9, %10, %13 2210 vmovdqu %10, [%7] 2211%if %13 2212 AVX2_AbsDiffub %11, %10, %11, [%7], %10 2213 vpmaxub %4, %4, %11 2214 AVX2_Sqsumbdw %5, %11, %9, %10, %13 2215%else 2216 AVX2_AbsDiffub %4, %10, %11, [%7], %10 2217 AVX2_Sqsumbdw %5, %4, %9, %10, %13 2218%endif 2219%else 2220 vmovdqu %10, [%7] 2221 vmovdqu %11, [%8] 2222 AVX2_Sadbdw %1, %10, %11, %12, %13 2223 AVX2_Sumbdw %2, %10, %9, %12, %13 2224 AVX2_Sumbdw %3, %11, %9, %12, %13 2225 AVX2_Sqsumbdw %6, %10, %9, %12, %13 2226%if %13 2227 AVX2_AbsDiffub %11, %10, %11, %10, %12 2228 vpmaxub %4, %4, %11 2229 AVX2_Sqsumbdw %5, %11, 
%9, %10, %13 2230%else 2231 AVX2_AbsDiffub %4, %10, %11, %10, %12 2232 AVX2_Sqsumbdw %5, %4, %9, %10, %13 2233%endif 2234%endif 2235%endmacro 2236 2237; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5 2238%macro AVX2_Store8x8Accdw 5 2239 vpshufd %2%4, %2%3, 1000b 2240%ifidni %2, x 2241 vmovlps [%1 + 8 * %5], x%4 2242%elif %5 == 0 2243 vmovdqu [%1], %2%4 2244%else 2245 vmovlps [%1 + 8], x%4 2246 vextracti128 x%4, %2%4, 1 2247 vmovlps [%1 + 24], x%4 2248%endif 2249%endmacro 2250 2251; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 second_blocks=%5 2252%macro AVX2_Store8x8Accb 5 2253 vpunpckhqdq %2%4, %2%3, %2%3 2254 vpunpcklbw %2%4, %2%3, %2%4 2255%if %5 == 0 2256 vmovd [%1 + 0], x%4 2257%ifidni %2, y 2258 vextracti128 x%4, %2%4, 1 2259 vmovd [%1 + 4], x%4 2260%endif 2261%else 2262 vpextrw [%1 + 2], x%4, 0 2263%ifidni %2, y 2264 vextracti128 x%4, %2%4, 1 2265 vpextrw [%1 + 6], x%4, 0 2266%endif 2267%endif 2268%endmacro 2269 2270; p_dst=%1 data=%2 tmp=%3,%4 second_blocks=%5 2271%macro AVX2_Store2x8x8Accb 5 2272 vpunpckhqdq y%3, y%2, y%2 2273 vpunpcklbw y%3, y%2, y%3 2274 vextracti128 x%4, y%3, 1 2275 vpsllq x%4, x%4, 32 2276 vpblendd x%4, x%3, x%4, 1010b 2277%if %5 2278 vpslld x%4, x%4, 16 2279 vpblendw x%4, x%4, [%1], 01010101b 2280%endif 2281 vmovdqu [%1], x%4 2282%endmacro 2283 2284; p_dst=%1 mmreg_prefix=%2 data=%3 tmp=%4 add_to_dst=%5 2285%macro AVX2_Store16x16Accdw 5 2286%ifidni %2, x 2287%if %5 2288 vmovd x%4, [%1 + 0] 2289 vpaddd x%3, x%4, x%3 2290%endif 2291 vmovd [%1 + 0], x%3 2292%elif %5 == 0 2293 vmovd [%1 + 0], x%3 2294 vextracti128 x%3, %2%3, 1 2295 vmovd [%1 + 4], x%3 2296%else 2297 vextracti128 x%4, %2%3, 1 2298 vpunpckldq x%4, x%3, x%4 2299 vmovq x%3, [%1 + 0] 2300 vpaddd x%3, x%3, x%4 2301 vmovlps [%1 + 0], x%3 2302%endif 2303%endmacro 2304 2305; p_dst1=%1 p_dst2=%2 i_dst_offset=%3 gpr_tmp=%4 mmreg_prefix=%5 data=%6 mm_tmp=%7 add_to_dst=%8 2306%macro AVX2_Store2x16x16Accdw 8 2307%ifidni %5, x 2308 mov %4, %1 2309%if %8 == 0 2310 vmovd [%4 + %3], x%6 2311 mov %4, %2 2312 vpextrd [%4 + %3], x%6, 2 2313%else 2314 vmovd x%7, [%4 + %3] 2315 vpaddd x%7, x%7, x%6 2316 vmovd [%4 + %3], x%7 2317 mov %4, %2 2318 vpbroadcastd x%7, [%4 + %3] 2319 vpaddd x%7, x%7, x%6 2320 vpextrd [%4 + %3], x%7, 2 2321%endif 2322%else 2323 vextracti128 x%7, %5%6, 1 2324 vpblendd x%6, x%6, x%7, 1010b 2325 mov %4, %1 2326%if %8 == 0 2327 vmovlps [%4 + %3], x%6 2328 mov %4, %2 2329 vmovhps [%4 + %3], x%6 2330%else 2331 vmovq x%7, [%4 + %3] 2332 vpaddd x%7, x%7, x%6 2333 vmovlps [%4 + %3], x%7 2334 mov %4, %2 2335 vpbroadcastq x%7, [%4 + %3] 2336 vpaddd x%7, x%7, x%6 2337 vmovhps [%4 + %3], x%7 2338%endif 2339%endif 2340%endmacro 2341 2342 2343; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7 2344%macro AVX2_CalcSad_8Lines 7 2345%define mm_tmp0 %2 2346%define mm_sad %3 2347%define mm_sad2 %4 2348%define mm_sad3 %5 2349%define mm_sad4 %6 2350%define b_second_blocks %7 2351%ifdef i_stride5 2352 %define i_stride5_ i_stride5 2353%else 2354 lea r_tmp, [5 * i_stride] 2355 %define i_stride5_ r_tmp 2356%endif 2357 ; Use multiple accumulators to shorten dependency chains and enable more parallelism. 
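    ; mm_sad, mm_sad2, mm_sad3 and mm_sad4 each accumulate two of the eight rows,
    ; so successive vpsadbw/vpaddd pairs are independent of one another and can
    ; overlap in flight instead of serializing on a single register; the four
    ; partial sums are folded together with vpaddd once all eight rows are read.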
2358 AVX2_Sad %1 %+ mm_sad, p_cur, p_ref, %1 %+ mm_tmp0, 0 2359 AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_tmp0, 0 2360 AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_tmp0, 0 2361 AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_tmp0, 0 2362 AVX2_Sad %1 %+ mm_sad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_tmp0, 1 2363 AVX2_Sad %1 %+ mm_sad2, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_tmp0, 1 2364%ifdef i_stride7 2365 %define i_stride7_ i_stride7 2366%else 2367 lea r_tmp, [i_stride + 2 * i_stride3] 2368 %define i_stride7_ r_tmp 2369%endif 2370 AVX2_Sad %1 %+ mm_sad3, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_tmp0, 1 2371 AVX2_Sad %1 %+ mm_sad4, p_cur + 1 * i_stride7_, p_ref + 1 * i_stride7_, %1 %+ mm_tmp0, 1 2372%undef i_stride5_ 2373%undef i_stride7_ 2374 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 2375 add p_cur, %1 %+ mm_width 2376 add p_ref, %1 %+ mm_width 2377 ; Collapse accumulators. 2378 vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad2 2379 vpaddd %1 %+ mm_sad3, %1 %+ mm_sad3, %1 %+ mm_sad4 2380 vpaddd %1 %+ mm_sad, %1 %+ mm_sad, %1 %+ mm_sad3 2381 AVX2_Store8x8Accdw p_sad8x8 + xcnt_unit * i_xcnt, %1, mm_sad, mm_tmp0, b_second_blocks 2382 vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad 2383%undef mm_tmp0 2384%undef mm_sad 2385%undef mm_sad2 2386%undef mm_sad3 2387%undef mm_sad4 2388%undef b_second_blocks 2389%endmacro 2390 2391;************************************************************************************************************* 2392;void VAACalcSad_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 2393; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8) 2394;************************************************************************************************************* 2395 2396WELS_EXTERN VAACalcSad_avx2 2397%define p_sadframe ptrword arg6 2398%define p_sad8x8 ptrword arg7 2399%ifdef X86_32 2400%define saveregs r5, r6 2401%else 2402%define saveregs rbx, rbp, r12 2403%endif 2404 2405%assign push_num 0 2406 LOAD_5_PARA 2407 PUSH_XMM 7 2408 SIGN_EXTENSION r2, r2d 2409 SIGN_EXTENSION r3, r3d 2410 SIGN_EXTENSION r4, r4d 2411 PUSHM saveregs 2412 2413%define mm_zero mm0 2414%define mm_sadframe mm6 2415 vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero 2416 vmovdqa y %+ mm_sadframe, y %+ mm_zero 2417 2418 and r2, -16 ; iPicWidth &= -16 2419 jle .done ; bail if iPicWidth < 16 2420 sar r3, 4 ; iPicHeight / 16 2421 jle .done ; bail if iPicHeight < 16 2422 shr r2, 2 ; iPicWidth / 4 2423 2424%define p_cur r0 2425%define p_ref r1 2426%define i_xcnt r2 2427%define i_ycnt ptrword arg4 2428%define i_stride r4 2429%define xcnt_unit 4 2430%ifdef X86_32 2431 mov i_ycnt, r3 2432 mov r5, p_sad8x8 2433 %define i_stride3 r3 2434 %undef p_sad8x8 2435 %define p_sad8x8 r5 2436 %define r_tmp r6 2437 lea i_stride3, [3 * i_stride] 2438%else 2439 mov rbp, p_sad8x8 2440 %define i_stride3 rbx 2441 %define i_stride5 r12 2442 %define i_stride7 r6 2443 %undef p_sad8x8 2444 %define p_sad8x8 rbp 2445 lea i_stride3, [3 * i_stride] 2446 lea i_stride5, [5 * i_stride] 2447 lea i_stride7, [i_stride + 2 * i_stride3] 2448%endif 2449 2450 ; offset pointer so as to compensate for the i_xcnt offset below. 
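    ; The height loop below turns p_sad8x8 into an end-of-row pointer and indexes
    ; it with a negative counter biased by +16 / xcnt_unit, so back the pointer up
    ; by the matching number of bytes (4 per count) here so that the first store
    ; of each row lands at the start of that row's psad8x8 output.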
2451 sub p_sad8x8, 4 * 16 / xcnt_unit 2452 2453 push i_xcnt 2454%assign push_num push_num + 1 2455%define i_xcnt_load ptrword [r7] 2456 2457.height_loop: 2458 ; use end-of-line pointers so as to enable use of a negative counter as index. 2459 lea p_sad8x8, [p_sad8x8 + xcnt_unit * i_xcnt] 2460 ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. 2461 neg i_xcnt 2462 add i_xcnt, 16 / xcnt_unit 2463 jz .width_loop_upper8_remaining16 2464.width_loop_upper8: 2465 AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 0 2466 add i_xcnt, 32 / xcnt_unit 2467 jl .width_loop_upper8 2468 jg .width_loop_upper8_end 2469.width_loop_upper8_remaining16: 2470 AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 0 2471.width_loop_upper8_end: 2472 lea p_cur, [p_cur + 8 * i_stride] 2473 lea p_ref, [p_ref + 8 * i_stride] 2474 xor i_xcnt, i_xcnt 2475 sub i_xcnt, i_xcnt_load 2476 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2477 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2478 add i_xcnt, 16 / xcnt_unit 2479 jz .width_loop_lower8_remaining16 2480.width_loop_lower8: 2481 AVX2_CalcSad_8Lines y, mm1, mm2, mm3, mm4, mm5, 1 2482 add i_xcnt, 32 / xcnt_unit 2483 jl .width_loop_lower8 2484 jg .width_loop_lower8_end 2485.width_loop_lower8_remaining16: 2486 AVX2_CalcSad_8Lines x, mm1, mm2, mm3, mm4, mm5, 1 2487.width_loop_lower8_end: 2488 lea p_cur, [p_cur + 8 * i_stride] 2489 lea p_ref, [p_ref + 8 * i_stride] 2490 xor i_xcnt, i_xcnt 2491 sub i_xcnt, i_xcnt_load 2492 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2493 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2494 neg i_xcnt 2495 sub i_ycnt, 1 2496 jnz .height_loop 2497 2498 pop i_xcnt 2499%assign push_num push_num - 1 2500%undef i_xcnt_load 2501 2502.done: 2503 mov r6, p_sadframe 2504 vextracti128 xmm2, y %+ mm_sadframe, 1 2505 vpaddd xmm2, x %+ mm_sadframe, xmm2 2506 vpunpckhqdq xmm1, xmm2, xmm2 2507 vpaddd xmm2, xmm2, xmm1 2508 vmovd [r6], xmm2 2509 vzeroupper 2510 2511 POPM saveregs 2512 POP_XMM 2513 LOAD_5_PARA_POP 2514%undef p_cur 2515%undef p_ref 2516%undef i_xcnt 2517%undef i_ycnt 2518%undef i_stride 2519%undef r_tmp 2520%undef xcnt_unit 2521%undef i_stride3 2522%undef i_stride5 2523%undef i_stride7 2524%undef mm_sadframe 2525%undef mm_zero 2526%undef saveregs 2527%undef p_sadframe 2528%undef p_sad8x8 2529 ret 2530 2531 2532; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6 b_second_blocks=%7 2533%macro AVX2_CalcSadVar_8Lines 7 2534%define mm_tmp0 %2 2535%define mm_tmp1 %3 2536%define mm_sad %4 2537%define mm_sum %5 2538%define mm_sqsum %6 2539%define b_second_blocks %7 2540 ; Unroll for better performance on Haswell. 2541 ; Avoid unrolling for the 16 px case so as to reduce the code footprint. 
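    ; Per 8-line pass this accumulates, as packed dword partial sums:
    ;   mm_sad   += SAD(cur, ref)              (vpsadbw cur, ref)
    ;   mm_sum   += sum of cur bytes           (vpsadbw cur, zero)
    ;   mm_sqsum += sum of squared cur bytes   (vpmaddwd after unpacking with zero)
    ; The horizontal reductions and the stores happen after the %endif below.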
2542%ifidni %1, y 2543 lea r_tmp, [5 * i_stride] 2544 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 0 2545 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2546 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2547 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2548 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2549 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2550 lea r_tmp, [i_stride + 2 * i_stride3] 2551 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2552 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2553 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 2554 add p_cur, %1 %+ mm_width 2555 add p_ref, %1 %+ mm_width 2556%else 2557 vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad 2558 vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum 2559 vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum 2560 lea r_tmp, [8 * i_stride] 2561 add p_cur, r_tmp 2562 add p_ref, r_tmp 2563 neg r_tmp 2564%%loop: 2565 AVX2_SadSumSqsum %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, 1 2566 add r_tmp, i_stride 2567 jl %%loop 2568 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 
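    ; p_cur/p_ref were advanced 8 rows before the loop above, so subtracting
    ; 8 * i_stride - mm_width here nets out to the same "add p_cur/p_ref, mm_width"
    ; step the unrolled path performs.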
2569 lea r_tmp, [8 * i_stride - %1 %+ mm_width] 2570 sub p_cur, r_tmp 2571 sub p_ref, r_tmp 2572%endif 2573 AVX2_Store8x8Accdw p_sad8x8 + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks 2574 vpaddd y %+ mm_sadframe, y %+ mm_sadframe, y %+ mm_sad 2575 vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_sqsum 2576 vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sqsum 2577 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 2578 vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b 2579 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 2580 AVX2_Store2x16x16Accdw p_sum16x16, p_sqsum16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks 2581%undef mm_tmp0 2582%undef mm_tmp1 2583%undef mm_sad 2584%undef mm_sum 2585%undef mm_sqsum 2586%undef b_second_blocks 2587%endmacro 2588 2589;************************************************************************************************************* 2590;void VAACalcSadVar_avx2( const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 2591; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16) 2592;************************************************************************************************************* 2593 2594WELS_EXTERN VAACalcSadVar_avx2 2595%define p_sadframe ptrword arg6 2596%define p_sad8x8 ptrword arg7 2597%define p_sum16x16 ptrword arg8 2598%define p_sqsum16x16 ptrword arg9 2599%ifdef X86_32 2600%define saveregs r5, r6 2601%else 2602%define saveregs rbx, rbp, r12, r13 2603%endif 2604 2605%assign push_num 0 2606 LOAD_5_PARA 2607 PUSH_XMM 7 2608 SIGN_EXTENSION r2, r2d 2609 SIGN_EXTENSION r3, r3d 2610 SIGN_EXTENSION r4, r4d 2611 PUSHM saveregs 2612 2613%define mm_zero mm0 2614%define mm_sadframe mm6 2615 vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero 2616 vmovdqa y %+ mm_sadframe, y %+ mm_zero 2617 2618 and r2, -16 ; iPicWidth &= -16 2619 jle .done ; bail if iPicWidth < 16 2620 sar r3, 4 ; iPicHeight / 16 2621 jle .done ; bail if iPicHeight < 16 2622 shr r2, 2 ; iPicWidth / 4 2623 2624%define p_cur r0 2625%define p_ref r1 2626%define i_xcnt r2 2627%define i_ycnt ptrword arg4 2628%define i_stride r4 2629%define r_tmp r6 2630%define xcnt_unit 4 2631%ifdef X86_32 2632 mov i_ycnt, r3 2633 mov r3, p_sad8x8 2634 %undef p_sad8x8 2635 %define p_sad8x8 r3 2636 %define i_stride3 r5 2637%else 2638 mov rbp, p_sad8x8 2639 mov r12, p_sum16x16 2640 mov r13, p_sqsum16x16 2641 %undef p_sad8x8 2642 %undef p_sum16x16 2643 %undef p_sqsum16x16 2644 %define p_sad8x8 rbp 2645 %define p_sum16x16 r12 2646 %define p_sqsum16x16 r13 2647 %define i_stride3 rbx 2648%endif 2649 lea i_stride3, [3 * i_stride] 2650 2651 ; offset pointers so as to compensate for the i_xcnt offset below. 2652 sub p_sad8x8, 4 * 16 / xcnt_unit 2653 sub p_sum16x16, 1 * 16 / xcnt_unit 2654 sub p_sqsum16x16, 1 * 16 / xcnt_unit 2655 2656 ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. 2657 neg i_xcnt 2658 2659.height_loop: 2660 push i_xcnt 2661%assign push_num push_num + 1 2662%define i_xcnt_load ptrword [r7] 2663 ; use end-of-line pointers so as to enable use of a negative counter as index. 
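    ; i_xcnt is negative at this point, so subtracting it (scaled by xcnt_unit for
    ; p_sad8x8, unscaled for the per-16x16 arrays) advances each output pointer to
    ; the end of the current row of results; the same negative value then doubles
    ; as loop counter and store index in the width loops below.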
2664 lea r_tmp, [xcnt_unit * i_xcnt] 2665 sub p_sad8x8, r_tmp 2666 sub p_sum16x16, i_xcnt 2667 sub p_sqsum16x16, i_xcnt 2668 add i_xcnt, 16 / xcnt_unit 2669 jz .width_loop_upper8_remaining16 2670.width_loop_upper8: 2671 AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 0 2672 add i_xcnt, 32 / xcnt_unit 2673 jl .width_loop_upper8 2674 jg .width_loop_upper8_end 2675.width_loop_upper8_remaining16: 2676 AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 0 2677.width_loop_upper8_end: 2678 lea p_cur, [p_cur + 8 * i_stride] 2679 lea p_ref, [p_ref + 8 * i_stride] 2680 mov i_xcnt, i_xcnt_load 2681 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2682 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2683 add i_xcnt, 16 / xcnt_unit 2684 jz .width_loop_lower8_remaining16 2685.width_loop_lower8: 2686 AVX2_CalcSadVar_8Lines y, mm1, mm2, mm3, mm4, mm5, 1 2687 add i_xcnt, 32 / xcnt_unit 2688 jl .width_loop_lower8 2689 jg .width_loop_lower8_end 2690.width_loop_lower8_remaining16: 2691 AVX2_CalcSadVar_8Lines x, mm1, mm2, mm3, mm4, mm5, 1 2692.width_loop_lower8_end: 2693 lea p_cur, [p_cur + 8 * i_stride] 2694 lea p_ref, [p_ref + 8 * i_stride] 2695%undef i_xcnt_load 2696 pop i_xcnt 2697 %assign push_num push_num - 1 2698 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2699 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2700 sub i_ycnt, 1 2701 jnz .height_loop 2702 2703.done: 2704 mov r_tmp, p_sadframe 2705 vextracti128 xmm2, y %+ mm_sadframe, 1 2706 vpaddd xmm2, x %+ mm_sadframe, xmm2 2707 vpunpckhqdq xmm1, xmm2, xmm2 2708 vpaddd xmm2, xmm2, xmm1 2709 vmovd [r_tmp], xmm2 2710 vzeroupper 2711 2712 POPM saveregs 2713 POP_XMM 2714 LOAD_5_PARA_POP 2715%undef p_cur 2716%undef p_ref 2717%undef i_xcnt 2718%undef i_ycnt 2719%undef i_stride 2720%undef i_stride3 2721%undef r_tmp 2722%undef xcnt_unit 2723%undef mm_sadframe 2724%undef mm_zero 2725%undef saveregs 2726%undef p_sadframe 2727%undef p_sad8x8 2728%undef p_sum16x16 2729%undef p_sqsum16x16 2730 ret 2731 2732 2733; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9 2734%macro AVX2_CalcSadSsd_8Lines 9 2735%define mm_tmp0 %2 2736%define mm_tmp1 %3 2737%define mm_tmp2 %4 2738%define mm_sad %5 2739%define mm_sum %6 2740%define mm_sqsum %7 2741%define mm_sqdiff %8 2742%define b_second_blocks %9 2743 ; Unroll for better performance on Haswell. 2744 ; Avoid unrolling for the 16 px case so as to reduce the code footprint. 
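    ; Per 8-line pass this accumulates four quantities as packed dword partial sums:
    ;   mm_sad    += SAD(cur, ref)
    ;   mm_sum    += sum of cur bytes
    ;   mm_sqsum  += sum of squared cur bytes
    ;   mm_sqdiff += sum of squared |cur - ref| bytes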
2745%ifidni %1, y 2746%ifdef i_stride5 2747 lea r_tmp, [i_stride + 2 * i_stride3] 2748 %define i_stride5_ i_stride5 2749%else 2750 lea r_tmp, [5 * i_stride] 2751 %define i_stride5_ r_tmp 2752%endif 2753 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0 2754 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2755 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2756 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2757 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2758 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 1 * i_stride5_, p_ref + 1 * i_stride5_, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2759%ifndef i_stride5 2760 lea r_tmp, [i_stride + 2 * i_stride3] 2761%endif 2762%undef i_stride5_ 2763 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2764 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2765 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 2766 add p_cur, %1 %+ mm_width 2767 add p_ref, %1 %+ mm_width 2768%else 2769 vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad 2770 vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum 2771 vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum 2772 vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff 2773 lea r_tmp, [8 * i_stride] 2774 add p_cur, r_tmp 2775 add p_ref, r_tmp 2776 neg r_tmp 2777%%loop: 2778 AVX2_SadSumSqsumSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sqsum, %1 %+ mm_sqdiff, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2779 add r_tmp, i_stride 2780 jl %%loop 2781 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 
2782 lea r_tmp, [8 * i_stride - %1 %+ mm_width] 2783 sub p_cur, r_tmp 2784 sub p_ref, r_tmp 2785%endif 2786 mov r_tmp, p_sad8x8 2787 AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks 2788%ifdef X86_32 2789 vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc 2790 vmovdqa sadframe_acc, y %+ mm_tmp1 2791%else 2792 vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad 2793%endif 2794 mov r_tmp, i_xcnt 2795 add r_tmp, p_sum16x16 2796 vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum 2797 vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1 2798 AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks 2799 vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff 2800 vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff 2801 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 2802 vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b 2803 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 2804 AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks 2805%undef mm_tmp0 2806%undef mm_tmp1 2807%undef mm_tmp2 2808%undef mm_sad 2809%undef mm_sum 2810%undef mm_sqsum 2811%undef mm_sqdiff 2812%undef b_second_blocks 2813%endmacro 2814 2815;************************************************************************************************************* 2816;void VAACalcSadSsd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 2817; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16) 2818;************************************************************************************************************* 2819 2820WELS_EXTERN VAACalcSadSsd_avx2 2821%define p_sadframe ptrword arg6 2822%define p_sad8x8 ptrword arg7 2823%define p_sum16x16 ptrword arg8 2824%define p_sqsum16x16 ptrword arg9 2825%define p_sqdiff16x16 ptrword arg10 2826%ifdef X86_32 2827%define saveregs r5, r6 2828%else 2829%define saveregs rbx, rbp, r12, r13, r14, r15 2830%endif 2831 2832%assign push_num 0 2833 LOAD_5_PARA 2834 PUSH_XMM 9 2835 SIGN_EXTENSION r2, r2d 2836 SIGN_EXTENSION r3, r3d 2837 SIGN_EXTENSION r4, r4d 2838 PUSHM saveregs 2839 2840%define mm_zero mm0 2841 vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero 2842 2843%ifdef X86_32 2844 STACK_ALLOC r5, ymm_width, ymm_width 2845 %define sadframe_acc_addr r5 2846 %define sadframe_acc [sadframe_acc_addr] 2847%else 2848 %define sadframe_acc ymm8 2849 %define xsadframe_acc xmm8 2850%endif 2851 vmovdqa sadframe_acc, y %+ mm_zero 2852 2853 and r2, -16 ; iPicWidth &= -16 2854 jle .done ; bail if iPicWidth < 16 2855 sar r3, 4 ; iPicHeight / 16 2856 jle .done ; bail if iPicHeight < 16 2857 shr r2, 2 ; iPicWidth / 4 2858 2859%define p_cur r0 2860%define p_ref r1 2861%define i_xcnt r2 2862%define i_ycnt ptrword arg4 2863%define i_stride r4 2864%define r_tmp r6 2865%define xcnt_unit 4 2866%ifdef X86_32 2867 mov i_ycnt, r3 2868 %define i_stride3 r3 2869%else 2870 mov r12, p_sad8x8 2871 mov r13, p_sum16x16 2872 mov r14, p_sqsum16x16 2873 mov r15, p_sqdiff16x16 2874 %undef p_sad8x8 2875 %undef p_sum16x16 2876 %undef p_sqsum16x16 2877 %undef p_sqdiff16x16 2878 %define p_sad8x8 r12 2879 %define p_sum16x16 r13 2880 %define p_sqsum16x16 r14 2881 %define p_sqdiff16x16 r15 2882 %define i_stride3 rbx 2883 %define i_stride5 rbp 2884 lea i_stride5, [5 * i_stride] 2885%endif 2886 lea i_stride3, [3 * i_stride] 2887 2888 ; offset pointers so as to compensate for i_xcnt offset below. 
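    ; The width loops index p_sad8x8 in 4-byte steps per 4-px count (four int32
    ; results per 16 px) and the 16x16 arrays in 1-byte steps per count (one int32
    ; per 16 px), so each pointer is backed up by its own equivalent of the
    ; +16 / xcnt_unit counter bias applied below.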
2889 sub p_sad8x8, 4 * 16 / xcnt_unit 2890 sub p_sum16x16, 1 * 16 / xcnt_unit 2891 sub p_sqsum16x16, 1 * 16 / xcnt_unit 2892 sub p_sqdiff16x16, 1 * 16 / xcnt_unit 2893 2894 ; use a negative loop counter so as to enable counting toward zero and indexing with the same counter. 2895 neg i_xcnt 2896 2897.height_loop: 2898 push i_xcnt 2899%assign push_num push_num + 1 2900%define i_xcnt_load ptrword [r7] 2901 ; use end-of-line pointers so as to enable use of a negative counter as index. 2902 lea r_tmp, [xcnt_unit * i_xcnt] 2903 sub p_sad8x8, r_tmp 2904 sub p_sum16x16, i_xcnt 2905 sub p_sqsum16x16, i_xcnt 2906 sub p_sqdiff16x16, i_xcnt 2907 add i_xcnt, 16 / xcnt_unit 2908 jz .width_loop_upper8_remaining16 2909.width_loop_upper8: 2910 AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 2911 add i_xcnt, 32 / xcnt_unit 2912 jl .width_loop_upper8 2913 jg .width_loop_upper8_end 2914.width_loop_upper8_remaining16: 2915 AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 2916.width_loop_upper8_end: 2917 lea p_cur, [p_cur + 8 * i_stride] 2918 lea p_ref, [p_ref + 8 * i_stride] 2919 mov i_xcnt, i_xcnt_load 2920 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2921 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2922 add i_xcnt, 16 / xcnt_unit 2923 jz .width_loop_lower8_remaining16 2924.width_loop_lower8: 2925 AVX2_CalcSadSsd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 2926 add i_xcnt, 32 / xcnt_unit 2927 jl .width_loop_lower8 2928 jg .width_loop_lower8_end 2929.width_loop_lower8_remaining16: 2930 AVX2_CalcSadSsd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 2931.width_loop_lower8_end: 2932 lea p_cur, [p_cur + 8 * i_stride] 2933 lea p_ref, [p_ref + 8 * i_stride] 2934%undef i_xcnt_load 2935 pop i_xcnt 2936 %assign push_num push_num - 1 2937 lea p_cur, [p_cur + xcnt_unit * i_xcnt] 2938 lea p_ref, [p_ref + xcnt_unit * i_xcnt] 2939 sub i_ycnt, 1 2940 jnz .height_loop 2941 2942.done: 2943 mov r_tmp, p_sadframe 2944%ifdef X86_32 2945 vmovdqa xmm2, sadframe_acc 2946 vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] 2947%else 2948 vextracti128 xmm2, sadframe_acc, 1 2949 vpaddd xmm2, xsadframe_acc, xmm2 2950%endif 2951 vpunpckhqdq xmm1, xmm2, xmm2 2952 vpaddd xmm2, xmm2, xmm1 2953 vmovd [r_tmp], xmm2 2954 vzeroupper 2955%ifdef X86_32 2956 STACK_DEALLOC 2957%endif 2958 POPM saveregs 2959 POP_XMM 2960 LOAD_5_PARA_POP 2961%undef p_cur 2962%undef p_ref 2963%undef i_xcnt 2964%undef i_ycnt 2965%undef i_stride 2966%undef i_stride3 2967%undef i_stride5 2968%undef r_tmp 2969%undef xcnt_unit 2970%undef sadframe_acc 2971%undef sadframe_acc_addr 2972%undef xsadframe_acc 2973%undef mm_zero 2974%undef saveregs 2975%undef p_sadframe 2976%undef p_sad8x8 2977%undef p_sum16x16 2978%undef p_sqsum16x16 2979%undef p_sqdiff16x16 2980 ret 2981 2982 2983; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8 b_second_blocks=%9 2984%macro AVX2_CalcSadBgd_8Lines 9 2985%define mm_tmp0 %2 2986%define mm_tmp1 %3 2987%define mm_tmp2 %8 2988%define mm_mad %4 2989%define mm_sumcur %5 2990%define mm_sumref %6 2991%define mm_sad %7 2992%define b_second_blocks %9 2993 ; Unroll for better performance on Haswell. 2994 ; Avoid unrolling for the 16 px case so as to reduce the code footprint. 
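    ; Per 8-line pass this accumulates SAD, the byte sums of cur and ref (whose
    ; difference yields the SD result), and a running per-byte maximum of
    ; |cur - ref| from which the per-8x8 MAD is reduced further below.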
2995%ifidni %1, y 2996 lea r_tmp, [5 * i_stride] 2997 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 0 2998 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 2999 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3000 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3001 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3002 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3003 lea r_tmp, [i_stride + 2 * i_stride3] 3004 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3005 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3006 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 3007 add p_cur, %1 %+ mm_width 3008 add p_ref, %1 %+ mm_width 3009%else 3010 vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad 3011 vpxor x %+ mm_sumcur, x %+ mm_sumcur, x %+ mm_sumcur 3012 vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref 3013 vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad 3014 lea r_tmp, [8 * i_stride] 3015 add p_cur, r_tmp 3016 add p_ref, r_tmp 3017 neg r_tmp 3018%%loop: 3019 AVX2_SadSdMad %1 %+ mm_sad, %1 %+ mm_sumcur, %1 %+ mm_sumref, %1 %+ mm_mad, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, %1 %+ mm_tmp2, 1 3020 add r_tmp, i_stride 3021 jl %%loop 3022 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 3023 lea r_tmp, [8 * i_stride - %1 %+ mm_width] 3024 sub p_cur, r_tmp 3025 sub p_ref, r_tmp 3026%endif 3027 mov r_tmp, p_sad8x8 3028 AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks 3029%ifdef X86_32 3030 vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc 3031 vmovdqa sadframe_acc, y %+ mm_tmp1 3032%else 3033 vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad 3034%endif 3035 mov r_tmp, p_sd8x8 3036 vpsubd %1 %+ mm_tmp0, %1 %+ mm_sumcur, %1 %+ mm_sumref 3037 AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_tmp0, mm_tmp1, b_second_blocks 3038 ; Coalesce store and horizontal reduction of MAD accumulator for even and 3039 ; odd iterations so as to enable more parallelism. 
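    ; In the 32-px path the MAD vector is only reduced and written out on every
    ; other iteration, paired with the previous iteration's vector kept in
    ; prev_mad; a leftover unpaired iteration is flushed later by
    ; AVX2_StoreRemainingSingleMad.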
3040%ifidni %1, y 3041 test i_xcnt, 32 / xcnt_unit 3042 jz %%preserve_mad 3043 mov r_tmp, p_mad8x8 3044 AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0 3045 AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks 3046%%preserve_mad: 3047 vmovdqa prev_mad, y %+ mm_mad 3048%else 3049 mov r_tmp, p_mad8x8 3050 AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0 3051 AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks 3052%endif 3053%undef mm_tmp0 3054%undef mm_tmp1 3055%undef mm_tmp2 3056%undef mm_mad 3057%undef mm_sumcur 3058%undef mm_sumref 3059%undef mm_sad 3060%undef b_second_blocks 3061%endmacro 3062 3063; Store remaining MAD accumulator for width & 32 cases. 3064; width/xcnt_unit=%1 mm_tmp=%2,%3 b_second_blocks=%4 3065%macro AVX2_StoreRemainingSingleMad 4 3066 test %1, 32 / xcnt_unit 3067 jz %%skip 3068 mov r_tmp, p_mad8x8 3069 vmovdqa y%2, prev_mad 3070 AVX2_Maxubq y%2, y%2, y%3 3071 AVX2_Store8x8Accb r_tmp + i_xcnt - 8, y, %2, %3, %4 3072%%skip: 3073%endmacro 3074 3075;************************************************************************************************************* 3076;void VAACalcSadBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 3077; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8) 3078;************************************************************************************************************* 3079 3080WELS_EXTERN VAACalcSadBgd_avx2 3081%define p_sadframe arg6 3082%define p_sad8x8 arg7 3083%define p_sd8x8 arg8 3084%define p_mad8x8 arg9 3085%ifdef X86_32 3086%define saveregs r5, r6 3087%else 3088%define saveregs rbx, rbp, r12, r13 3089%endif 3090 3091%assign push_num 0 3092 LOAD_5_PARA 3093 PUSH_XMM 10 3094 SIGN_EXTENSION r2, r2d 3095 SIGN_EXTENSION r3, r3d 3096 SIGN_EXTENSION r4, r4d 3097 PUSHM saveregs 3098 3099%define mm_zero mm0 3100 vpxor x %+ mm_zero, x %+ mm_zero, x %+ mm_zero 3101 3102%ifdef X86_32 3103 STACK_ALLOC r5, 2 * ymm_width, ymm_width 3104 %define sadframe_acc_addr r5 3105 %define sadframe_acc [sadframe_acc_addr] 3106 %define prev_mad [r5 + ymm_width] 3107%else 3108 %define sadframe_acc ymm8 3109 %define xsadframe_acc xmm8 3110 %define prev_mad ymm9 3111%endif 3112 vmovdqa sadframe_acc, y %+ mm_zero 3113 3114 and r2, -16 ; iPicWidth &= -16 3115 jle .done ; bail if iPicWidth < 16 3116 sar r3, 4 ; iPicHeight / 16 3117 jle .done ; bail if iPicHeight < 16 3118 shr r2, 2 ; iPicWidth / 4 3119 3120%define p_cur r0 3121%define p_ref r1 3122%define i_xcnt r2 3123%define i_ycnt ptrword arg4 3124%define i_stride r4 3125%define r_tmp r6 3126%define xcnt_unit 4 3127%ifdef X86_32 3128 mov i_ycnt, r3 3129 %define i_stride3 r3 3130%else 3131 mov rbp, p_sad8x8 3132 mov r12, p_sd8x8 3133 mov r13, p_mad8x8 3134 %undef p_sad8x8 3135 %undef p_sd8x8 3136 %undef p_mad8x8 3137 %define p_sad8x8 rbp 3138 %define p_sd8x8 r12 3139 %define p_mad8x8 r13 3140 %define i_stride3 rbx 3141%endif 3142 lea i_stride3, [3 * i_stride] 3143 3144 ; offset pointers to compensate for the i_xcnt offset below. 3145 mov r_tmp, i_xcnt 3146 and r_tmp, 64 / xcnt_unit - 1 3147 sub p_mad8x8, r_tmp 3148 shl r_tmp, 2 3149 sub p_sad8x8, r_tmp 3150 sub p_sd8x8, r_tmp 3151 3152.height_loop: 3153 push i_xcnt 3154%assign push_num push_num + 1 3155%define i_xcnt_load ptrword [r7] 3156 ; use end-of-line pointers so as to enable use of a negative counter as index. 
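    ; i_xcnt is still positive here: the adds move the output pointers to the end
    ; of the row, the count is then rounded down to a multiple of 64 px for the
    ; paired-MAD main loop, and any 32-px and/or 16-px remainder of the row is
    ; handled via the tail labels below.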
3157 lea r_tmp, [xcnt_unit * i_xcnt] 3158 add p_sad8x8, r_tmp 3159 add p_sd8x8, r_tmp 3160 add p_mad8x8, i_xcnt 3161 and i_xcnt, -(64 / xcnt_unit) 3162 jz .width_loop_upper8_64x_end 3163 ; use a negative loop counter to enable counting toward zero and indexing with the same counter. 3164 neg i_xcnt 3165.width_loop_upper8: 3166 AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 3167 add i_xcnt, 32 / xcnt_unit 3168 jl .width_loop_upper8 3169 jg .width_loop_upper8_32x_end 3170.width_loop_upper8_64x_end: 3171 test i_xcnt_load, 32 / xcnt_unit 3172 jnz .width_loop_upper8 3173.width_loop_upper8_32x_end: 3174 AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0 3175 test i_xcnt_load, 16 / xcnt_unit 3176 jz .width_loop_upper8_end 3177 ; remaining 16. 3178 AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 0 3179.width_loop_upper8_end: 3180 lea p_cur, [p_cur + 8 * i_stride] 3181 lea p_ref, [p_ref + 8 * i_stride] 3182 mov i_xcnt, i_xcnt_load 3183 lea r_tmp, [xcnt_unit * i_xcnt] 3184 sub p_cur, r_tmp 3185 sub p_ref, r_tmp 3186 and i_xcnt, -(64 / xcnt_unit) 3187 jz .width_loop_lower8_64x_end 3188 neg i_xcnt 3189.width_loop_lower8: 3190 AVX2_CalcSadBgd_8Lines y, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 3191 add i_xcnt, 32 / xcnt_unit 3192 jl .width_loop_lower8 3193 jg .width_loop_lower8_32x_end 3194.width_loop_lower8_64x_end: 3195 test i_xcnt_load, 32 / xcnt_unit 3196 jnz .width_loop_lower8 3197.width_loop_lower8_32x_end: 3198 AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1 3199 test i_xcnt_load, 16 / xcnt_unit 3200 jz .width_loop_lower8_end 3201 ; remaining 16. 3202 AVX2_CalcSadBgd_8Lines x, mm1, mm2, mm3, mm4, mm5, mm6, mm7, 1 3203.width_loop_lower8_end: 3204 lea p_cur, [p_cur + 8 * i_stride] 3205 lea p_ref, [p_ref + 8 * i_stride] 3206 pop i_xcnt 3207%undef i_xcnt_load 3208 %assign push_num push_num - 1 3209 lea r_tmp, [xcnt_unit * i_xcnt] 3210 sub p_cur, r_tmp 3211 sub p_ref, r_tmp 3212 sub i_ycnt, 1 3213 jnz .height_loop 3214 3215.done: 3216 mov r_tmp, p_sadframe 3217%ifdef X86_32 3218 vmovdqa xmm2, sadframe_acc 3219 vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] 3220%else 3221 vextracti128 xmm2, sadframe_acc, 1 3222 vpaddd xmm2, xsadframe_acc, xmm2 3223%endif 3224 vpunpckhqdq xmm1, xmm2, xmm2 3225 vpaddd xmm2, xmm2, xmm1 3226 vmovd [r_tmp], xmm2 3227 vzeroupper 3228%ifdef X86_32 3229 STACK_DEALLOC 3230%endif 3231 POPM saveregs 3232 POP_XMM 3233 LOAD_5_PARA_POP 3234%undef p_cur 3235%undef p_ref 3236%undef i_xcnt 3237%undef i_ycnt 3238%undef i_stride 3239%undef i_stride3 3240%undef r_tmp 3241%undef xcnt_unit 3242%undef sadframe_acc 3243%undef sadframe_acc_addr 3244%undef xsadframe_acc 3245%undef prev_mad 3246%undef mm_zero 3247%undef saveregs 3248%undef p_sadframe 3249%undef p_sad8x8 3250%undef p_sd8x8 3251%undef p_mad8x8 3252 ret 3253 3254 3255; x/y-mm_prefix=%1 mm_clobber=%2,%3,%4,%5,%6,%7,%8,%9,%10 b_second_blocks=%11 3256%macro AVX2_CalcSadSsdBgd_8Lines 11 3257%define mm_tmp0 %2 3258%define mm_tmp1 %3 3259%define mm_sad %4 3260%define mm_sum %5 3261%define mm_sumref %6 3262%define mm_mad %7 3263%define mm_sqsum %8 3264%define mm_sqdiff %9 3265%ifidn %10, 0 3266%define tmp2 0 3267%else 3268%define tmp2 %1 %+ %10 3269%endif 3270%define b_second_blocks %11 3271 ; Unroll for better performance on Haswell. 3272 ; Avoid unrolling for the 16 px case so as to reduce the code footprint. 
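    ; Per 8-line pass this accumulates all six quantities at once: SAD, the byte
    ; sums of cur and ref (for SD), the running per-byte maximum of |cur - ref|
    ; (for MAD), the sum of squared cur bytes (sqsum) and the sum of squared
    ; differences (sqdiff), as packed dword (byte-wise for MAD) partial results.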
3273%ifidni %1, y 3274 lea r_tmp, [5 * i_stride] 3275 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur, p_ref, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 0 3276 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride, p_ref + 1 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3277 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride, p_ref + 2 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3278 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 1 * i_stride3, p_ref + 1 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3279 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 4 * i_stride, p_ref + 4 * i_stride, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3280 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3281 lea r_tmp, [i_stride + 2 * i_stride3] 3282 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + 2 * i_stride3, p_ref + 2 * i_stride3, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3283 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3284 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 3285 add p_cur, %1 %+ mm_width 3286 add p_ref, %1 %+ mm_width 3287%else 3288 vpxor x %+ mm_sad, x %+ mm_sad, x %+ mm_sad 3289 vpxor x %+ mm_sum, x %+ mm_sum, x %+ mm_sum 3290 vpxor x %+ mm_sumref, x %+ mm_sumref, x %+ mm_sumref 3291 vpxor x %+ mm_mad, x %+ mm_mad, x %+ mm_mad 3292 vpxor x %+ mm_sqsum, x %+ mm_sqsum, x %+ mm_sqsum 3293 vpxor x %+ mm_sqdiff, x %+ mm_sqdiff, x %+ mm_sqdiff 3294 lea r_tmp, [8 * i_stride] 3295 add p_cur, r_tmp 3296 add p_ref, r_tmp 3297 neg r_tmp 3298%%loop: 3299 AVX2_SadBgdSqdiff %1 %+ mm_sad, %1 %+ mm_sum, %1 %+ mm_sumref, %1 %+ mm_mad, %1 %+ mm_sqdiff, %1 %+ mm_sqsum, p_cur + r_tmp, p_ref + r_tmp, %1 %+ mm_zero, %1 %+ mm_tmp0, %1 %+ mm_tmp1, tmp2, 1 3300 add r_tmp, i_stride 3301 jl %%loop 3302 ; Increment addresses for the next iteration. Doing this early is beneficial on Haswell. 
3303 lea r_tmp, [8 * i_stride - %1 %+ mm_width] 3304 sub p_cur, r_tmp 3305 sub p_ref, r_tmp 3306%endif 3307 mov r_tmp, p_sad8x8 3308 AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sad, mm_tmp1, b_second_blocks 3309%ifdef X86_32 3310 vpaddd y %+ mm_tmp1, y %+ mm_sad, sadframe_acc 3311 vmovdqa sadframe_acc, y %+ mm_tmp1 3312%else 3313 vpaddd sadframe_acc, sadframe_acc, y %+ mm_sad 3314%endif 3315 mov r_tmp, i_xcnt 3316 add r_tmp, p_sum16x16 3317 vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sum, %1 %+ mm_sum 3318 vpaddd %1 %+ mm_tmp0, %1 %+ mm_sum, %1 %+ mm_tmp1 3319 AVX2_Store16x16Accdw r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks 3320 mov r_tmp, p_sd8x8 3321 vpsubd %1 %+ mm_sum, %1 %+ mm_sum, %1 %+ mm_sumref 3322 AVX2_Store8x8Accdw r_tmp + 4 * i_xcnt, %1, mm_sum, mm_tmp0, b_second_blocks 3323 ; Coalesce store and horizontal reduction of MAD accumulator for even and 3324 ; odd iterations so as to enable more parallelism. 3325%ifidni %1, y 3326 test i_xcnt, 32 / xcnt_unit 3327 jz %%preserve_mad 3328 mov r_tmp, p_mad8x8 3329 AVX2_Maxubq2 y %+ mm_mad, y %+ mm_mad, prev_mad, y %+ mm_tmp0 3330 AVX2_Store2x8x8Accb r_tmp + i_xcnt - 8, mm_mad, mm_tmp0, mm_tmp1, b_second_blocks 3331%%preserve_mad: 3332 vmovdqa prev_mad, y %+ mm_mad 3333%else 3334 mov r_tmp, p_mad8x8 3335 AVX2_Maxubq %1 %+ mm_mad, %1 %+ mm_mad, %1 %+ mm_tmp0 3336 AVX2_Store8x8Accb r_tmp + i_xcnt, %1, mm_mad, mm_tmp0, b_second_blocks 3337%endif 3338 vpunpcklqdq %1 %+ mm_tmp0, %1 %+ mm_sqsum, %1 %+ mm_sqdiff 3339 vpunpckhqdq %1 %+ mm_tmp1, %1 %+ mm_sqsum, %1 %+ mm_sqdiff 3340 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 3341 vpshufd %1 %+ mm_tmp1, %1 %+ mm_tmp0, 10110001b 3342 vpaddd %1 %+ mm_tmp0, %1 %+ mm_tmp0, %1 %+ mm_tmp1 3343 AVX2_Store2x16x16Accdw p_sqsum16x16, p_sqdiff16x16, i_xcnt, r_tmp, %1, mm_tmp0, mm_tmp1, b_second_blocks 3344%undef mm_tmp0 3345%undef mm_tmp1 3346%undef mm_sqsum 3347%undef mm_sqdiff 3348%undef mm_mad 3349%undef mm_sum 3350%undef mm_sumref 3351%undef mm_sad 3352%undef tmp2 3353%undef b_second_blocks 3354%endmacro 3355 3356;************************************************************************************************************* 3357;void VAACalcSadSsdBgd_avx2(const uint8_t *cur_data, const uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 3358; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 3359; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8) 3360;************************************************************************************************************* 3361 3362WELS_EXTERN VAACalcSadSsdBgd_avx2 3363%define p_sadframe arg6 3364%define p_sad8x8 arg7 3365%define p_sum16x16 arg8 3366%define p_sqsum16x16 arg9 3367%define p_sqdiff16x16 arg10 3368%define p_sd8x8 arg11 3369%define p_mad8x8 arg12 3370%ifdef X86_32 3371%define saveregs r5, r6 3372%else 3373%define saveregs rbx, rbp, r12, r13, r14, r15 3374%endif 3375 3376%assign push_num 0 3377 LOAD_5_PARA 3378 PUSH_XMM 12 3379 SIGN_EXTENSION r2, r2d 3380 SIGN_EXTENSION r3, r3d 3381 SIGN_EXTENSION r4, r4d 3382 PUSHM saveregs 3383 3384%ifdef X86_32 3385 STACK_ALLOC r5, 3 * ymm_width, ymm_width 3386 %define mm8 0 3387 %define sadframe_acc_addr r5 3388 %define sadframe_acc [sadframe_acc_addr] 3389 %define prev_mad [r5 + ymm_width] 3390 %define ymm_zero [r5 + 2 * ymm_width] 3391 %define xmm_zero ymm_zero 3392 vpxor xmm0, xmm0, xmm0 3393 vmovdqa sadframe_acc, ymm0 3394 vmovdqa ymm_zero, ymm0 3395%else 3396 %define sadframe_acc ymm9 3397 %define xsadframe_acc xmm9 3398 %define prev_mad ymm10 3399 
%define ymm_zero ymm11 3400 %define xmm_zero xmm11 3401 vpxor xmm_zero, xmm_zero, xmm_zero 3402 vpxor xsadframe_acc, xsadframe_acc, xsadframe_acc 3403%endif 3404 3405 and r2, -16 ; iPicWidth &= -16 3406 jle .done ; bail if iPicWidth < 16 3407 sar r3, 4 ; iPicHeight / 16 3408 jle .done ; bail if iPicHeight < 16 3409 shr r2, 2 ; iPicWidth / 4 3410 3411%define p_cur r0 3412%define p_ref r1 3413%define i_xcnt r2 3414%define i_ycnt ptrword arg4 3415%define i_stride r4 3416%define r_tmp r6 3417%define xcnt_unit 4 3418%ifdef X86_32 3419 mov i_ycnt, r3 3420 %define i_stride3 r3 3421%else 3422 mov rbp, p_sad8x8 3423 mov r12, p_sum16x16 3424 mov r13, p_sqsum16x16 3425 mov r14, p_sqdiff16x16 3426 mov r15, p_sd8x8 3427 %undef p_sad8x8 3428 %undef p_sum16x16 3429 %undef p_sqsum16x16 3430 %undef p_sqdiff16x16 3431 %undef p_sd8x8 3432 %define p_sad8x8 rbp 3433 %define p_sum16x16 r12 3434 %define p_sqsum16x16 r13 3435 %define p_sqdiff16x16 r14 3436 %define p_sd8x8 r15 3437 %define i_stride3 rbx 3438%endif 3439 lea i_stride3, [3 * i_stride] 3440 3441 ; offset pointers so as to compensate for the i_xcnt offset below. 3442 mov r_tmp, i_xcnt 3443 and r_tmp, 64 / xcnt_unit - 1 3444 sub p_sum16x16, r_tmp 3445 sub p_sqsum16x16, r_tmp 3446 sub p_sqdiff16x16, r_tmp 3447 sub p_mad8x8, r_tmp 3448 shl r_tmp, 2 3449 sub p_sad8x8, r_tmp 3450 sub p_sd8x8, r_tmp 3451 3452.height_loop: 3453 push i_xcnt 3454%assign push_num push_num + 1 3455%define i_xcnt_load ptrword [r7] 3456 ; use end-of-line pointers so as to enable use of a negative counter as index. 3457 lea r_tmp, [xcnt_unit * i_xcnt] 3458 add p_sad8x8, r_tmp 3459 add p_sum16x16, i_xcnt 3460 add p_sqsum16x16, i_xcnt 3461 add p_sqdiff16x16, i_xcnt 3462 add p_sd8x8, r_tmp 3463 add p_mad8x8, i_xcnt 3464 and i_xcnt, -(64 / xcnt_unit) 3465 jz .width_loop_upper8_64x_end 3466 ; use a negative loop counter to enable counting toward zero and indexing with the same counter. 3467 neg i_xcnt 3468.width_loop_upper8: 3469 AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0 3470 add i_xcnt, 32 / xcnt_unit 3471 jl .width_loop_upper8 3472 jg .width_loop_upper8_32x_end 3473.width_loop_upper8_64x_end: 3474 test i_xcnt_load, 32 / xcnt_unit 3475 jnz .width_loop_upper8 3476.width_loop_upper8_32x_end: 3477 AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 0 3478 test i_xcnt_load, 16 / xcnt_unit 3479 jz .width_loop_upper8_end 3480 ; remaining 16. 3481 AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 0 3482.width_loop_upper8_end: 3483 lea p_cur, [p_cur + 8 * i_stride] 3484 lea p_ref, [p_ref + 8 * i_stride] 3485 mov i_xcnt, i_xcnt_load 3486 lea r_tmp, [xcnt_unit * i_xcnt] 3487 sub p_cur, r_tmp 3488 sub p_ref, r_tmp 3489 and i_xcnt, -(64 / xcnt_unit) 3490 jz .width_loop_lower8_64x_end 3491 neg i_xcnt 3492.width_loop_lower8: 3493 AVX2_CalcSadSsdBgd_8Lines y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1 3494 add i_xcnt, 32 / xcnt_unit 3495 jl .width_loop_lower8 3496 jg .width_loop_lower8_32x_end 3497.width_loop_lower8_64x_end: 3498 test i_xcnt_load, 32 / xcnt_unit 3499 jnz .width_loop_lower8 3500.width_loop_lower8_32x_end: 3501 AVX2_StoreRemainingSingleMad i_xcnt_load, mm1, mm2, 1 3502 test i_xcnt_load, 16 / xcnt_unit 3503 jz .width_loop_lower8_end 3504 ; remaining 16. 
3505 AVX2_CalcSadSsdBgd_8Lines x, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, mm8, 1 3506.width_loop_lower8_end: 3507 lea p_cur, [p_cur + 8 * i_stride] 3508 lea p_ref, [p_ref + 8 * i_stride] 3509 pop i_xcnt 3510%undef i_xcnt_load 3511 %assign push_num push_num - 1 3512 lea r_tmp, [xcnt_unit * i_xcnt] 3513 sub p_cur, r_tmp 3514 sub p_ref, r_tmp 3515 sub i_ycnt, 1 3516 jnz .height_loop 3517 3518.done: 3519 mov r_tmp, p_sadframe 3520%ifdef X86_32 3521 vmovdqa xmm2, sadframe_acc 3522 vpaddd xmm2, xmm2, [sadframe_acc_addr + xmm_width] 3523%else 3524 vextracti128 xmm2, sadframe_acc, 1 3525 vpaddd xmm2, xsadframe_acc, xmm2 3526%endif 3527 vpunpckhqdq xmm1, xmm2, xmm2 3528 vpaddd xmm2, xmm2, xmm1 3529 vmovd [r_tmp], xmm2 3530 vzeroupper 3531%ifdef X86_32 3532 STACK_DEALLOC 3533%endif 3534 POPM saveregs 3535 POP_XMM 3536 LOAD_5_PARA_POP 3537%undef p_cur 3538%undef p_ref 3539%undef i_xcnt 3540%undef i_ycnt 3541%undef i_stride 3542%undef i_stride3 3543%undef r_tmp 3544%undef xcnt_unit 3545%undef mm8 3546%undef sadframe_acc 3547%undef sadframe_acc_addr 3548%undef xsadframe_acc 3549%undef prev_mad 3550%undef ymm_zero 3551%undef xmm_zero 3552%undef saveregs 3553%undef p_sadframe 3554%undef p_sad8x8 3555%undef p_sum16x16 3556%undef p_sqsum16x16 3557%undef p_sqdiff16x16 3558%undef p_sd8x8 3559%undef p_mad8x8 3560 ret 3561 3562%endif 3563 3564