1;*! 2;* \copy 3;* Copyright (c) 2009-2013, Cisco Systems 4;* All rights reserved. 5;* 6;* Redistribution and use in source and binary forms, with or without 7;* modification, are permitted provided that the following conditions 8;* are met: 9;* 10;* * Redistributions of source code must retain the above copyright 11;* notice, this list of conditions and the following disclaimer. 12;* 13;* * Redistributions in binary form must reproduce the above copyright 14;* notice, this list of conditions and the following disclaimer in 15;* the documentation and/or other materials provided with the 16;* distribution. 17;* 18;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  mc_luma.asm
;*
;*  Abstract
;*      sse2 motion compensation
;*
;*  History
;*      17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Local Data (Read Only)
; With X86_32_PICASM the tables are emitted into .text, otherwise into .rodata.
;*******************************************************************************
%ifdef X86_32_PICASM
SECTION .text align=32
%else
SECTION .rodata align=32
%endif

;*******************************************************************************
; Constant tables: signed byte coefficient pairs, byte shuffle masks and
; word/dword rounding values.
; The routines that follow directly reference only h264_w0x10_1 (eight words
; of 16) and h264_mc_hc_32 (eight words of 32).
; NOTE(review): the remaining tables are presumably operands for pmaddubsw /
; pshufb kernels elsewhere in the file -- confirm against their users.
;*******************************************************************************

%ifdef HAVE_AVX2
ALIGN 32
dwm32768_256:                           times 16 dw -32768
maddubsw_m2p10_m40m40_p10m2_p0p0_256:   times  4 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_256:                            times 16 dw -1024
dd32768_256:                            times  8 dd 32768
maddubsw_p1m5_256:                      times 16 db 1, -5
maddubsw_m5p1_256:                      times 16 db -5, 1
db20_256:                               times 32 db 20
maddubsw_m5p20_256:                     times 16 db -5, 20
maddubsw_p20m5_256:                     times 16 db 20, -5
h264_w0x10_256:                         times 16 dw 16
dw32_256:                               times 16 dw 32
%endif ; HAVE_AVX2

ALIGN 16
shufb_32435465768798A9:
    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
shufb_011267784556ABBC:
    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch
maddubsw_p1m5_p1m5_m5p1_m5p1_128:       times 2 db 1, -5, 1, -5, -5, 1, -5, 1
maddubsw_m2p10_m40m40_p10m2_p0p0_128:   times 2 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_128:                            times 8 dw -1024
dd32768_128:                            times 4 dd 32768
maddubsw_p1m5_128:                      times 8 db 1, -5
maddubsw_m5p1_128:                      times 8 db -5, 1
db20_128:                               times 16 db 20
maddubsw_m5p20_128:                     times 8 db -5, 20
maddubsw_p20m5_128:                     times 8 db 20, -5
h264_w0x10_1:                           times 8 dw 16       ; +16 rounding before >> 5
ALIGN 16
h264_mc_hc_32:                          times 8 dw 32       ; +32 rounding before >> 6


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

%ifdef X86_32_PICASM

; MOVEIMM_DW16 reg
;   Fills every 16-bit lane of %1 with 16 without touching memory:
;   all-ones -> logical shift right by 15 gives 1 -> shift left by 4 gives 16.
%macro MOVEIMM_DW16 1
    pcmpeqw     %1, %1
    psrlw       %1, 15
    psllw       %1, 4
%endmacro

%endif

;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
;                              int iSrcStride,
;                              uint8_t *pDst,
;                              int iDstStride,
;                              int iHeight)
;
; Horizontal 6-tap filter, 4 output bytes per row.  With A..F the six source
; bytes at x-2 .. x+3, each output byte is
;     saturate_u8( (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5 )
; Register use below: r0 = source row (pSrc - 2), r1 = source stride,
; r2 = destination row, r3 = destination stride, r4 = rows remaining.
; mm7 = zero, mm6 = rounding constant 16 per word.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    sub         r0, 2                   ; leftmost tap sits two bytes before x
    WELS_Zero   mm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 mm6
%else
    movq        mm6, [h264_w0x10_1]
%endif

.next_row:
    ; widen the six tap vectors from bytes to words
    movd        mm0, [r0]               ; A
    punpcklbw   mm0, mm7
    movd        mm1, [r0+5]             ; F
    punpcklbw   mm1, mm7
    movd        mm2, [r0+1]             ; B
    punpcklbw   mm2, mm7
    movd        mm3, [r0+4]             ; E
    punpcklbw   mm3, mm7
    movd        mm4, [r0+2]             ; C
    punpcklbw   mm4, mm7
    movd        mm5, [r0+3]             ; D
    punpcklbw   mm5, mm7

    paddw       mm2, mm3                ; B+E
    paddw       mm4, mm5                ; C+D
    psllw       mm4, 2
    psubw       mm4, mm2                ; t = 4(C+D) - (B+E)
    paddw       mm0, mm1                ; A+F
    paddw       mm0, mm4                ; A+F + t
    psllw       mm4, 2                  ; 4t
    paddw       mm0, mm4                ; A+F + 5t
    paddw       mm0, mm6                ; + 16
    psraw       mm0, 5
    packuswb    mm0, mm7                ; saturate to unsigned bytes
    movd        [r2], mm0

    add         r0, r1
    add         r2, r3
    dec         r4
    jnz         .next_row

    WELSEMMS
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************


; SSE_LOAD_8P dst, zero, mem
;   Loads 8 bytes from %3 into the low half of %1 and interleaves them with
;   %2, so with %2 == 0 the result is eight zero-extended words.
%macro SSE_LOAD_8P 3
    movq        %1, %3
    punpcklbw   %1, %2
%endmacro

; FILTER_HV_W8 a, b, c, d, e, f, tmp0, tmp1, dst
;   %1..%6 hold six vectors of words (taps A..F); %9 is an 8-byte destination.
;   Stores saturate_u8((A - 5B + 20C + 20D - 5E + F + 16) >> 5) for 8 lanes.
;   Clobbers %1 and %7; %8 is left all-zero on exit.
%macro FILTER_HV_W8 9
    paddw       %1, %6                  ; A+F
    paddw       %1, [pic(h264_w0x10_1)] ; + 16
    movdqa      %7, %2
    paddw       %7, %5                  ; B+E
    movdqa      %8, %3
    paddw       %8, %4                  ; C+D
    psllw       %8, 2
    psubw       %8, %7                  ; t = 4(C+D) - (B+E)
    paddw       %1, %8
    psllw       %8, 2                   ; 4t
    paddw       %1, %8                  ; A+F+16 + 5t
    psraw       %1, 5
    WELS_Zero   %8
    packuswb    %1, %8
    movq        %9, %1
%endmacro


; FILTER_HV_W4 a, b, c, d, e, f, tmp0, tmp1, dst
;   Same arithmetic as FILTER_HV_W8, but only the low 4 result bytes are
;   stored (movd) to %9.  Clobbers %1 and %7; %8 is left all-zero on exit.
%macro FILTER_HV_W4 9
    paddw       %1, %6                  ; A+F
    paddw       %1, [pic(h264_w0x10_1)] ; + 16
    movdqa      %7, %2
    paddw       %7, %5                  ; B+E
    movdqa      %8, %3
    paddw       %8, %4                  ; C+D
    psllw       %8, 2
    psubw       %8, %7                  ; t = 4(C+D) - (B+E)
    paddw       %1, %8
    psllw       %8, 2                   ; 4t
    paddw       %1, %8                  ; A+F+16 + 5t
    psraw       %1, 5
    WELS_Zero   %8
    packuswb    %1, %8
    movd        %9, %1
%endmacro


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
;                       int16_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride
;                       int32_t iHeight
;                       )
;
; Horizontal pass of the two-stage filter: for each row, 8 unrounded 16-bit
; sums  A - 5B + 20C + 20D - 5E + F  are written to [r2] with movdqa, so
; every destination row must be 16-byte aligned.
; NOTE(review): the prototype above is kept as found, but the code reads
; *bytes* from pSrc and stores *words* to pDst -- the pointer types look
; swapped; confirm against the C declaration.
; The source pointer is moved up two rows before the loop (the vertical
; pass consumes 5 rows more than it produces); it is NOT moved left here.
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    pxor        xmm7, xmm7

    sub         r0, r1                  ; start two rows above: 5 extra rows needed
    sub         r0, r1

.next_row:
    SSE_LOAD_8P xmm0, xmm7, [r0]        ; A
    SSE_LOAD_8P xmm1, xmm7, [r0+5]      ; F
    SSE_LOAD_8P xmm2, xmm7, [r0+1]      ; B
    SSE_LOAD_8P xmm3, xmm7, [r0+4]      ; E
    SSE_LOAD_8P xmm4, xmm7, [r0+2]      ; C
    SSE_LOAD_8P xmm5, xmm7, [r0+3]      ; D

    paddw       xmm2, xmm3              ; B+E
    paddw       xmm4, xmm5              ; C+D
    psllw       xmm4, 2
    psubw       xmm4, xmm2              ; t = 4(C+D) - (B+E)
    paddw       xmm0, xmm1              ; A+F
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4              ; A+F + 5t, no rounding / shift here
    movdqa      [r2], xmm0

    add         r0, r1
    add         r2, r3
    dec         r4
    jnz         .next_row
    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight,
;                      );
;
; Horizontal 6-tap filter, 8 output bytes per row:
;     saturate_u8( (A - 5B + 20C + 20D - 5E + F + 16) >> 5 )
; xmm7 = zero, xmm6 = 16 per word.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea         r0, [r0-2]              ; pSrc -= 2

    pxor        xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6
%else
    movdqa      xmm6, [h264_w0x10_1]
%endif

.next_row:
    SSE_LOAD_8P xmm0, xmm7, [r0]        ; A
    SSE_LOAD_8P xmm1, xmm7, [r0+5]      ; F
    SSE_LOAD_8P xmm2, xmm7, [r0+1]      ; B
    SSE_LOAD_8P xmm3, xmm7, [r0+4]      ; E
    SSE_LOAD_8P xmm4, xmm7, [r0+2]      ; C
    SSE_LOAD_8P xmm5, xmm7, [r0+3]      ; D

    paddw       xmm2, xmm3              ; B+E
    paddw       xmm4, xmm5              ; C+D
    psllw       xmm4, 2
    psubw       xmm4, xmm2              ; t = 4(C+D) - (B+E)
    paddw       xmm0, xmm1              ; A+F
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4              ; A+F + 5t
    paddw       xmm0, xmm6              ; + 16
    psraw       xmm0, 5

    packuswb    xmm0, xmm7
    movq        [r2], xmm0

    lea         r2, [r2+r3]
    lea         r0, [r0+r1]
    dec         r4
    jnz  near   .next_row

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight,
;                      );
;
; Horizontal 6-tap filter, 16 output bytes per row, computed as two
; independent 8-byte halves (source offsets +0 and +8).  Each byte is
;     saturate_u8( (A - 5B + 20C + 20D - 5E + F + 16) >> 5 )
; with A..F the source bytes at x-2 .. x+3.
; xmm7 = zero, xmm6 = 16 per word for the whole loop.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea         r0, [r0-2]              ; pSrc -= 2

    pxor        xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6
%else
    movdqa      xmm6, [h264_w0x10_1]
%endif

.next_row:
    ; ---- left half: output bytes 0..7 ----
    SSE_LOAD_8P xmm0, xmm7, [r0]        ; A
    SSE_LOAD_8P xmm1, xmm7, [r0+5]      ; F
    SSE_LOAD_8P xmm2, xmm7, [r0+1]      ; B
    SSE_LOAD_8P xmm3, xmm7, [r0+4]      ; E
    SSE_LOAD_8P xmm4, xmm7, [r0+2]      ; C
    SSE_LOAD_8P xmm5, xmm7, [r0+3]      ; D

    paddw       xmm2, xmm3              ; B+E
    paddw       xmm4, xmm5              ; C+D
    psllw       xmm4, 2
    psubw       xmm4, xmm2              ; t = 4(C+D) - (B+E)
    paddw       xmm0, xmm1              ; A+F
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4              ; A+F + 5t
    paddw       xmm0, xmm6              ; + 16
    psraw       xmm0, 5
    packuswb    xmm0, xmm7
    movq        [r2], xmm0

    ; ---- right half: output bytes 8..15 ----
    SSE_LOAD_8P xmm0, xmm7, [r0+8]      ; A
    SSE_LOAD_8P xmm1, xmm7, [r0+5+8]    ; F
    SSE_LOAD_8P xmm2, xmm7, [r0+1+8]    ; B
    SSE_LOAD_8P xmm3, xmm7, [r0+4+8]    ; E
    SSE_LOAD_8P xmm4, xmm7, [r0+2+8]    ; C
    SSE_LOAD_8P xmm5, xmm7, [r0+3+8]    ; D

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    paddw       xmm0, xmm6
    psraw       xmm0, 5
    packuswb    xmm0, xmm7
    movq        [r2+8], xmm0

    lea         r2, [r2+r3]
    lea         r0, [r0+r1]
    dec         r4
    jnz  near   .next_row

    POP_XMM
    LOAD_5_PARA_POP
    ret


;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight )
;*******************************************************************************
; Vertical 6-tap filter, 8 output bytes per row.  Six consecutive source
; rows (starting two rows above pSrc) are kept widened to words in
; xmm0..xmm7; the loop body is unrolled 8 times so that the register roles
; rotate instead of copying rows.  FILTER_HV_W8 leaves its 8th argument
; zeroed, and that register serves as the zero operand of the next
; SSE_LOAD_8P.  r4 counts the rows still to be produced.
WELS_EXTERN McHorVer02WidthEq8_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r5
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub         r0, r1
    sub         r0, r1                  ; r0 = pSrc - 2 * iSrcStride

    WELS_Zero   xmm7

    ; prime the window with rows -2 .. +3
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

.rotate:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r4
    jz   near   .done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r4
    jz   near   .done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r4
    jz   near   .done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r4
    jz   near   .done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r4
    jz   near   .done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r4
    jz   near   .done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r4
    jz   near   .done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r4
    jz   near   .done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp  near   .rotate                 ; register roles are back where they started

.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
; Code
;***********************************************************************

SECTION .text



;***********************************************************************
; void McHorVer02Height9Or17_sse2(  const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight )
;
; Vertical 6-tap filter processed in strips of 8 columns; the strip count
; is iWidth >> 3 (r4), and r5 counts rows within a strip.  The first output
; row is produced outside the unrolled loop and the window is shifted by
; register copies; after that the 8-way register rotation takes over.
; NOTE(review): every strip restart reloads the ORIGINAL pSrc/pDst and adds
; 8, so only the first two strips are addressed distinctly -- presumably
; iWidth is only ever 8 or 16; confirm with the callers.
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

%ifndef X86_32
    ; keep the original pSrc / pDst / iHeight for the strip restart
    push        r12
    push        r13
    push        r14
    mov         r12, r0
    mov         r13, r2
    mov         r14, r5
%endif

    shr         r4, 3                   ; number of 8-column strips
    sub         r0, r1
    sub         r0, r1                  ; two rows above

.next_column:
    WELS_Zero   xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; first output row, then slide the window down by one row via copies
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6
    add         r2, r3
    sub         r0, r1

.rotate:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp  near   .rotate

.column_done:
    dec         r4
    jz   near   .done
    ; restore the entry values, then step 8 columns to the right
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    sub         r0, r1
    sub         r0, r1
    add         r0, 8
    add         r2, 8
    jmp  near   .next_column

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
; void McHorVer02Height5_sse2(  const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight )
;
; Vertical 6-tap filter processed in strips of 4 columns (strip count is
; iWidth >> 2); identical control flow to the 8-column variant but each row
; stores only 4 bytes through FILTER_HV_W4.
; NOTE(review): the strip restart reloads the original pointers and adds 4,
; so only the first two strips are addressed distinctly -- confirm callers
; never pass iWidth > 8.
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

%ifndef X86_32
    ; keep the original pSrc / pDst / iHeight for the strip restart
    push        r12
    push        r13
    push        r14
    mov         r12, r0
    mov         r13, r2
    mov         r14, r5
%endif

    shr         r4, 2                   ; number of 4-column strips
    sub         r0, r1
    sub         r0, r1                  ; two rows above

.next_column:
    WELS_Zero   xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; first output row, then slide the window down by one row via copies
    FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6
    add         r2, r3
    sub         r0, r1

.rotate:
    FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp  near   .rotate

.column_done:
    dec         r4
    jz   near   .done
    ; restore the entry values, then step 4 columns to the right
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    sub         r0, r1
    sub         r0, r1
    add         r0, 4
    add         r2, 4
    jmp  near   .next_column

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
; void McHorVer20Width9Or17_sse2(       const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight
;                      );
;
; Horizontal 6-tap filter for odd widths.  iWidth == 9 takes the first
; path; any other value takes the 17-wide path.  The odd column is covered
; by two overlapping stores: a 4-byte store of the filter at offset x,
; followed by an 8-byte store of the filter at offset x+1 (which reuses
; the already widened taps and loads only one new vector at +6).
; xmm7 is zero at the top of every row; it is used as scratch in between
; and re-zeroed before the extra load.
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub         r0, 2
    pxor        xmm7, xmm7

    cmp         r4, 9
    jne  near   .row_w17

.row_w9:
    SSE_LOAD_8P xmm0, xmm7, [r0]        ; taps for x   : A
    SSE_LOAD_8P xmm1, xmm7, [r0+5]      ;                F
    SSE_LOAD_8P xmm2, xmm7, [r0+1]      ;                B
    SSE_LOAD_8P xmm3, xmm7, [r0+4]      ;                E
    SSE_LOAD_8P xmm4, xmm7, [r0+2]      ;                C
    SSE_LOAD_8P xmm5, xmm7, [r0+3]      ;                D

    ; filter at x, using xmm6/xmm7 as scratch so xmm1..xmm5 survive
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; B+E
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; C+D
    psllw       xmm6, 2
    psubw       xmm6, xmm7              ; t = 4(C+D) - (B+E)
    paddw       xmm0, xmm1              ; A+F
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6              ; A+F + 5t
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2], xmm0              ; bytes 0..3 (1..3 are rewritten below)

    ; filter at x+1: taps are [r0+1] .. [r0+6]
    pxor        xmm7, xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0+6]

    paddw       xmm4, xmm1              ; [r0+2] + [r0+5]
    paddw       xmm5, xmm3              ; [r0+3] + [r0+4]
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0              ; [r0+1] + [r0+6]
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+1], xmm2            ; bytes 1..8

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .row_w9
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF           ; PIC definitions are still needed below
    ret


.row_w17:
    ; ---- bytes 0..7 ----
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+5]
    SSE_LOAD_8P xmm2, xmm7, [r0+1]
    SSE_LOAD_8P xmm3, xmm7, [r0+4]
    SSE_LOAD_8P xmm4, xmm7, [r0+2]
    SSE_LOAD_8P xmm5, xmm7, [r0+3]

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movq        [r2], xmm0

    ; ---- bytes 8..11 from the filter at x+8 ----
    SSE_LOAD_8P xmm0, xmm7, [r0+8]
    SSE_LOAD_8P xmm1, xmm7, [r0+5+8]
    SSE_LOAD_8P xmm2, xmm7, [r0+1+8]
    SSE_LOAD_8P xmm3, xmm7, [r0+4+8]
    SSE_LOAD_8P xmm4, xmm7, [r0+2+8]
    SSE_LOAD_8P xmm5, xmm7, [r0+3+8]

    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2+8], xmm0


    ; ---- bytes 9..16 from the filter at x+9 ----
    pxor        xmm7, xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0+6+8]

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movq        [r2+9], xmm2
    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .row_w17
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
; void McHorVer20Width5_sse2(       const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t
;                               iHeight
;                      );
;***********************************************************************
; Horizontal 6-tap filter, 5 output bytes per row, written as two
; overlapping 4-byte stores: the filter at x goes to [r2], the filter at
; x+1 goes to [r2+1].  iWidth (r4) is loaded but not otherwise used.
WELS_EXTERN McHorVer20Width5_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub         r0, 2
    pxor        xmm7, xmm7

.next_row:
    SSE_LOAD_8P xmm0, xmm7, [r0]        ; A
    SSE_LOAD_8P xmm1, xmm7, [r0+5]      ; F
    SSE_LOAD_8P xmm2, xmm7, [r0+1]      ; B
    SSE_LOAD_8P xmm3, xmm7, [r0+4]      ; E
    SSE_LOAD_8P xmm4, xmm7, [r0+2]      ; C
    SSE_LOAD_8P xmm5, xmm7, [r0+3]      ; D

    ; filter at x; xmm6/xmm7 are scratch so xmm1..xmm5 stay intact
    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3              ; B+E
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5              ; C+D
    psllw       xmm6, 2
    psubw       xmm6, xmm7              ; t = 4(C+D) - (B+E)
    paddw       xmm0, xmm1              ; A+F
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6              ; A+F + 5t
    paddw       xmm0, [pic(h264_w0x10_1)]
    psraw       xmm0, 5
    packuswb    xmm0, xmm0
    movd        [r2], xmm0              ; bytes 0..3

    ; filter at x+1: taps are [r0+1] .. [r0+6]
    pxor        xmm7, xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0+6]

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    paddw       xmm2, [pic(h264_w0x10_1)]
    psraw       xmm2, 5
    packuswb    xmm2, xmm2
    movd        [r2+1], xmm2            ; bytes 1..4

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .next_row
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
;void McHorVer22HorFirst_sse2
;                       (const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t * pTap,
;                       int32_t iTapStride,
;                       int32_t iWidth,int32_t iHeight);
;
; Horizontal pass of the two-stage filter for widths 9 / 17: every row of
; unrounded 16-bit sums  A - 5B + 20C + 20D - 5E + F  is written to the tap
; buffer at r2 (byte offsets, two bytes per sample).  iWidth == 9 selects
; the first path, anything else the 17-wide path.  The source pointer is
; moved up two rows first; it is not moved left in this routine.
; The 17-wide path stores its first 8 words with movdqa, so each tap row
; must be 16-byte aligned there.
; NOTE(review): pTap is declared uint8_t* above but receives words.
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor        xmm7, xmm7
    sub         r0, r1                  ; start two rows above: 5 extra rows needed
    sub         r0, r1

    cmp         r4, 9
    jne  near   .row_w17

.row_w9:
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+5]
    SSE_LOAD_8P xmm2, xmm7, [r0+1]
    SSE_LOAD_8P xmm3, xmm7, [r0+4]
    SSE_LOAD_8P xmm4, xmm7, [r0+2]
    SSE_LOAD_8P xmm5, xmm7, [r0+3]

    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    movd        [r2], xmm0              ; words 0..1 (word 1 is rewritten below)

    pxor        xmm7, xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0+6]

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+2], xmm2            ; words 1..4
    movhps      [r2+2+8], xmm2          ; words 5..8

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .row_w9
    POP_XMM
    LOAD_6_PARA_POP
    ret


.row_w17:
    ; ---- words 0..7 ----
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+5]
    SSE_LOAD_8P xmm2, xmm7, [r0+1]
    SSE_LOAD_8P xmm3, xmm7, [r0+4]
    SSE_LOAD_8P xmm4, xmm7, [r0+2]
    SSE_LOAD_8P xmm5, xmm7, [r0+3]

    paddw       xmm2, xmm3
    paddw       xmm4, xmm5
    psllw       xmm4, 2
    psubw       xmm4, xmm2
    paddw       xmm0, xmm1
    paddw       xmm0, xmm4
    psllw       xmm4, 2
    paddw       xmm0, xmm4
    movdqa      [r2], xmm0

    ; ---- words 8..9 from the filter at x+8 ----
    SSE_LOAD_8P xmm0, xmm7, [r0+8]
    SSE_LOAD_8P xmm1, xmm7, [r0+5+8]
    SSE_LOAD_8P xmm2, xmm7, [r0+1+8]
    SSE_LOAD_8P xmm3, xmm7, [r0+4+8]
    SSE_LOAD_8P xmm4, xmm7, [r0+2+8]
    SSE_LOAD_8P xmm5, xmm7, [r0+3+8]

    movdqa      xmm7, xmm2
    paddw       xmm7, xmm3
    movdqa      xmm6, xmm4
    paddw       xmm6, xmm5
    psllw       xmm6, 2
    psubw       xmm6, xmm7
    paddw       xmm0, xmm1
    paddw       xmm0, xmm6
    psllw       xmm6, 2
    paddw       xmm0, xmm6
    movd        [r2+16], xmm0


    ; ---- words 9..16 from the filter at x+9 ----
    pxor        xmm7, xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0+6+8]

    paddw       xmm4, xmm1
    paddw       xmm5, xmm3
    psllw       xmm5, 2
    psubw       xmm5, xmm4
    paddw       xmm2, xmm0
    paddw       xmm2, xmm5
    psllw       xmm5, 2
    paddw       xmm2, xmm5
    movq        [r2+18], xmm2
    movhps      [r2+18+8], xmm2

    add         r0, r1
    add         r2, r3
    dec         r5
    jnz         .row_w17
    POP_XMM
    LOAD_6_PARA_POP
    ret


; FILTER_VER a, b, c, d, e, f, tmp0, tmp1, dst
;   %1..%6 hold six rows of 16-bit horizontal sums; %9 is an 8-byte
;   destination.  With s1 = b+e and s2 = c+d the macro computes
;       u = (((a+f - s1) >> 2) + s2 - s1) >> 2
;       out = saturate_u8( (u + s2 + 32) >> 6 )
;   using arithmetic shifts, and stores 8 bytes.  Clobbers %1, %7 and %8.
%macro FILTER_VER 9
    paddw       %1, %6                  ; a+f
    movdqa      %7, %2
    paddw       %7, %5                  ; s1 = b+e
    movdqa      %8, %3
    paddw       %8, %4                  ; s2 = c+d

    psubw       %1, %7
    psraw       %1, 2
    paddw       %1, %8
    psubw       %1, %7
    psraw       %1, 2
    paddw       %8, %1
    paddw       %8, [pic(h264_mc_hc_32)]
    psraw       %8, 6
    packuswb    %8, %8
    movq        %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass over the tap buffer in strips of 8 columns (strip count is
; iWidth >> 3).  Tap rows are read with movdqa, so every row address must
; be 16-byte aligned.  Per strip the tap pointer advances 16 bytes
; (8 words) and the destination 8 bytes.
; NOTE(review): the strip restart reloads the original pointers before
; adding the offsets, so only the first two strips are addressed
; distinctly -- confirm iWidth is at most 16.
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    ; keep the original pTap / pDst / iHeight for the strip restart
    push        r12
    push        r13
    push        r14
    mov         r12, r0
    mov         r13, r2
    mov         r14, r5
%endif

    shr         r4, 3                   ; number of 8-column strips

.next_column:
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    movdqa      xmm3, [r0+r1]
    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    movdqa      xmm5, [r0+r1]

    ; first output row, then slide the window down by one row via copies
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]

    movdqa      xmm0, xmm1
    movdqa      xmm1, xmm2
    movdqa      xmm2, xmm3
    movdqa      xmm3, xmm4
    movdqa      xmm4, xmm5
    movdqa      xmm5, xmm6

    add         r2, r3
    sub         r0, r1

.rotate:
    FILTER_VER  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    movdqa      xmm6, [r0]
    FILTER_VER  xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    movdqa      xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    movdqa      xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    movdqa      xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    movdqa      xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    movdqa      xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec         r5
    jz   near   .column_done

    lea         r0, [r0+2*r1]
    movdqa      xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec         r5
    jz   near   .column_done

    lea         r2, [r2+2*r3]
    movdqa      xmm5, [r0+r1]
    jmp  near   .rotate

.column_done:
    dec         r4
    jz   near   .done
%ifdef X86_32
    mov         r0, arg1
    mov         r2, arg3
    mov         r5, arg6
%else
    mov         r0, r12
    mov         r2, r13
    mov         r5, r14
%endif
    add         r0, 16                  ; 8 taps = 16 bytes
    add         r2, 8
    jmp         .next_column

.done:
%ifndef X86_32
    pop         r14
    pop         r13
    pop         r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
;void
; McHorVer22Width8VerLastUnAlign_sse2(
;                       const uint8_t *pTap,
;                       int32_t iTapStride,
;                       uint8_t * pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight);
;***********************************************************************
; Vertical pass over a tap buffer, processed in column strips.
;
; Registers (set up by LOAD_6_PARA, in prototype order):
;   r0 = pTap       r1 = iTapStride (added directly to the byte address)
;   r2 = pDst       r3 = iDstStride
;   r4 = iWidth, turned into a strip count by ">> 3"
;   r5 = rows still to produce
; Rows are loaded with movdqu, so tap rows need no 16-byte alignment.
; Each strip moves pTap forward 16 bytes and pDst forward 8 bytes.
; FILTER_VER is a macro defined elsewhere in this file; it is handed six
; consecutive rows, two scratch registers and the destination operand.
;
; NOTE(review): the strip restart reloads the *original* pTap/pDst and
; adds one strip offset, so a third strip would repeat the second one;
; the routine is only correct for iWidth >> 3 of 1 or 2.
; NOTE(review): the first "dec r5" is not followed by a zero test, so at
; least two rows are always produced; iHeight == 1 is not handled.

WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
    %assign push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifndef X86_32
    ; 64-bit builds keep the per-strip restart values in r12-r14
    ; (32-bit builds re-read arg1/arg3/arg6 instead).
    push r12
    push r13
    push r14
    mov r12, r0
    mov r13, r2
    mov r14, r5
%endif
    shr r4, 3                       ; r4 = number of 8-column strips

.width_loop:
    ; Prime the six-row window with rows 0..5 of this strip.
    movdqu xmm0, [r0]
    movdqu xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqu xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    movdqu xmm5, [r0+r1]

    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]               ; row 6

    ; Slide the window down one row: xmm0..xmm5 = rows 1..6.
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1

    ; Unrolled x8: instead of copying registers, every step rotates which
    ; xmm register plays which role and loads exactly one new row.
.start:
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm7, [r0+r1]
    FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm0, [r0]
    FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm1, [r0+r1]
    FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm3, [r0+r1]
    FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm5, [r0+r1]
    jmp near .start

.x_loop_dec:
    dec r4
    jz near .exit
    ; Restart at the top of the next strip.
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    add r0, 16
    add r2, 8
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
;void McHorVer22Width5HorFirst_sse2
;                       (const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t * pTap,
;                       int32_t iTapStride,
;                       int32_t iWidth,int32_t iHeight);
;***********************************************************************
; Horizontal 6-tap pass writing unrounded 16-bit sums to pTap.
;   r0 = pSrc  r1 = iSrcStride  r2 = pTap  r3 = iTapStride  r5 = iHeight
; r4 (iWidth) is sign-extended but not read afterwards: each row always
; stores nine 16-bit results (bytes 0..17 of the tap row).
; The source pointer is moved up two rows before the row loop starts.
WELS_EXTERN McHorVer22Width5HorFirst_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    pxor xmm7, xmm7                 ; zero, used to widen bytes to words
    sub r0, r1                      ;;;;;;;;need more 5 lines.
1497sub r0, r1 1498 1499.yloop_width_5: 1500movq xmm0, [r0] 1501punpcklbw xmm0, xmm7 1502movq xmm1, [r0+5] 1503punpcklbw xmm1, xmm7 1504movq xmm2, [r0+1] 1505punpcklbw xmm2, xmm7 1506movq xmm3, [r0+4] 1507punpcklbw xmm3, xmm7 1508movq xmm4, [r0+2] 1509punpcklbw xmm4, xmm7 1510movq xmm5, [r0+3] 1511punpcklbw xmm5, xmm7 1512 1513movdqa xmm7, xmm2 1514paddw xmm7, xmm3 1515movdqa xmm6, xmm4 1516paddw xmm6, xmm5 1517psllw xmm6, 2 1518psubw xmm6, xmm7 1519paddw xmm0, xmm1 1520paddw xmm0, xmm6 1521psllw xmm6, 2 1522paddw xmm0, xmm6 1523movd [r2], xmm0 1524 1525pxor xmm7, xmm7 1526movq xmm0, [r0+6] 1527punpcklbw xmm0, xmm7 1528 1529paddw xmm4, xmm1 1530paddw xmm5, xmm3 1531psllw xmm5, 2 1532psubw xmm5, xmm4 1533paddw xmm2, xmm0 1534paddw xmm2, xmm5 1535psllw xmm5, 2 1536paddw xmm2, xmm5 1537movq [r2+2], xmm2 1538movhps [r2+2+8], xmm2 1539 1540add r0, r1 1541add r2, r3 1542dec r5 1543jnz .yloop_width_5 1544POP_XMM 1545LOAD_6_PARA_POP 1546ret 1547 1548 1549%macro FILTER_VER_4 9 1550paddw %1, %6 1551movdqa %7, %2 1552movdqa %8, %3 1553 1554 1555paddw %7, %5 1556paddw %8, %4 1557 1558psubw %1, %7 1559psraw %1, 2 1560paddw %1, %8 1561psubw %1, %7 1562psraw %1, 2 1563paddw %8, %1 1564paddw %8, [pic(h264_mc_hc_32)] 1565psraw %8, 6 1566packuswb %8, %8 1567movd %9, %8 1568%endmacro 1569 1570 1571;*********************************************************************** 1572;void McHorVer22Width4VerLastAlign_sse2( 1573; const uint8_t *pTap, 1574; int32_t iTapStride, 1575; uint8_t * pDst, 1576; int32_t iDstStride, 1577; int32_t iWidth, 1578; int32_t iHeight); 1579;*********************************************************************** 1580 1581WELS_EXTERN McHorVer22Width4VerLastAlign_sse2 1582%assign push_num 0 1583INIT_X86_32_PIC r6 1584LOAD_6_PARA 1585PUSH_XMM 8 1586SIGN_EXTENSION r1, r1d 1587SIGN_EXTENSION r3, r3d 1588SIGN_EXTENSION r4, r4d 1589SIGN_EXTENSION r5, r5d 1590%ifndef X86_32 1591push r12 1592push r13 1593push r14 1594mov r12, r0 1595mov r13, r2 1596mov r14, r5 1597%endif 
1598 1599shr r4, 2 1600 1601.width_loop: 1602movdqa xmm0, [r0] 1603movdqa xmm1, [r0+r1] 1604lea r0, [r0+2*r1] 1605movdqa xmm2, [r0] 1606movdqa xmm3, [r0+r1] 1607lea r0, [r0+2*r1] 1608movdqa xmm4, [r0] 1609movdqa xmm5, [r0+r1] 1610 1611FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] 1612dec r5 1613lea r0, [r0+2*r1] 1614movdqa xmm6, [r0] 1615 1616movdqa xmm0, xmm1 1617movdqa xmm1, xmm2 1618movdqa xmm2, xmm3 1619movdqa xmm3, xmm4 1620movdqa xmm4, xmm5 1621movdqa xmm5, xmm6 1622 1623add r2, r3 1624sub r0, r1 1625 1626.start: 1627FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] 1628dec r5 1629jz near .x_loop_dec 1630 1631lea r0, [r0+2*r1] 1632movdqa xmm6, [r0] 1633FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] 1634dec r5 1635jz near .x_loop_dec 1636 1637lea r2, [r2+2*r3] 1638movdqa xmm7, [r0+r1] 1639FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] 1640dec r5 1641jz near .x_loop_dec 1642 1643lea r0, [r0+2*r1] 1644movdqa xmm0, [r0] 1645FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] 1646dec r5 1647jz near .x_loop_dec 1648 1649lea r2, [r2+2*r3] 1650movdqa xmm1, [r0+r1] 1651FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] 1652dec r5 1653jz near .x_loop_dec 1654 1655lea r0, [r0+2*r1] 1656movdqa xmm2, [r0] 1657FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] 1658dec r5 1659jz near .x_loop_dec 1660 1661lea r2, [r2+2*r3] 1662movdqa xmm3, [r0+r1] 1663FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] 1664dec r5 1665jz near .x_loop_dec 1666 1667lea r0, [r0+2*r1] 1668movdqa xmm4, [r0] 1669FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] 1670dec r5 1671jz near .x_loop_dec 1672 1673lea r2, [r2+2*r3] 1674movdqa xmm5, [r0+r1] 1675jmp near .start 1676 1677.x_loop_dec: 1678dec r4 1679jz near .exit 1680%ifdef X86_32 1681mov r0, arg1 1682mov r2, arg3 1683mov r5, arg6 1684%else 1685mov r0, r12 1686mov r2, r13 1687mov r5, r14 1688%endif 
1689add r0, 8 1690add r2, 4 1691jmp .width_loop 1692 1693.exit: 1694%ifndef X86_32 1695pop r14 1696pop r13 1697pop r12 1698%endif 1699POP_XMM 1700LOAD_6_PARA_POP 1701DEINIT_X86_32_PIC 1702ret 1703 1704 1705;*********************************************************************** 1706;void McHorVer22Width4VerLastUnAlign_sse2( 1707; const uint8_t *pTap, 1708; int32_t iTapStride, 1709; uint8_t * pDst, 1710; int32_t iDstStride, 1711; int32_t iWidth, 1712; int32_t iHeight); 1713;*********************************************************************** 1714 1715WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2 1716%assign push_num 0 1717INIT_X86_32_PIC r6 1718LOAD_6_PARA 1719PUSH_XMM 8 1720SIGN_EXTENSION r1, r1d 1721SIGN_EXTENSION r3, r3d 1722SIGN_EXTENSION r4, r4d 1723SIGN_EXTENSION r5, r5d 1724%ifndef X86_32 1725push r12 1726push r13 1727push r14 1728mov r12, r0 1729mov r13, r2 1730mov r14, r5 1731%endif 1732shr r4, 2 1733 1734.width_loop: 1735movdqu xmm0, [r0] 1736movdqu xmm1, [r0+r1] 1737lea r0, [r0+2*r1] 1738movdqu xmm2, [r0] 1739movdqu xmm3, [r0+r1] 1740lea r0, [r0+2*r1] 1741movdqu xmm4, [r0] 1742movdqu xmm5, [r0+r1] 1743 1744FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] 1745dec r5 1746lea r0, [r0+2*r1] 1747movdqu xmm6, [r0] 1748 1749movdqa xmm0, xmm1 1750movdqa xmm1, xmm2 1751movdqa xmm2, xmm3 1752movdqa xmm3, xmm4 1753movdqa xmm4, xmm5 1754movdqa xmm5, xmm6 1755 1756add r2, r3 1757sub r0, r1 1758 1759.start: 1760FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] 1761dec r5 1762jz near .x_loop_dec 1763 1764lea r0, [r0+2*r1] 1765movdqu xmm6, [r0] 1766FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3] 1767dec r5 1768jz near .x_loop_dec 1769 1770lea r2, [r2+2*r3] 1771movdqu xmm7, [r0+r1] 1772FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2] 1773dec r5 1774jz near .x_loop_dec 1775 1776lea r0, [r0+2*r1] 1777movdqu xmm0, [r0] 1778FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3] 1779dec r5 
1780jz near .x_loop_dec 1781 1782lea r2, [r2+2*r3] 1783movdqu xmm1, [r0+r1] 1784FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2] 1785dec r5 1786jz near .x_loop_dec 1787 1788lea r0, [r0+2*r1] 1789movdqu xmm2, [r0] 1790FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3] 1791dec r5 1792jz near .x_loop_dec 1793 1794lea r2, [r2+2*r3] 1795movdqu xmm3, [r0+r1] 1796FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2] 1797dec r5 1798jz near .x_loop_dec 1799 1800lea r0, [r0+2*r1] 1801movdqu xmm4, [r0] 1802FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3] 1803dec r5 1804jz near .x_loop_dec 1805 1806lea r2, [r2+2*r3] 1807movdqu xmm5, [r0+r1] 1808jmp near .start 1809 1810.x_loop_dec: 1811dec r4 1812jz near .exit 1813%ifdef X86_32 1814mov r0, arg1 1815mov r2, arg3 1816mov r5, arg6 1817%else 1818mov r0, r12 1819mov r2, r13 1820mov r5, r14 1821%endif 1822add r0, 8 1823add r2, 4 1824jmp .width_loop 1825 1826.exit: 1827%ifndef X86_32 1828pop r14 1829pop r13 1830pop r12 1831%endif 1832POP_XMM 1833LOAD_6_PARA_POP 1834DEINIT_X86_32_PIC 1835ret 1836 1837 1838; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7 1839%macro SSSE3_FilterVertical_8px 7 1840 pmaddubsw %1, %4 1841 movdqa %7, %2 1842 pmaddubsw %7, %5 1843 paddw %1, %7 1844 movdqa %7, %3 1845 pmaddubsw %7, %6 1846 paddw %1, %7 1847 paddw %1, [pic(h264_w0x10_1)] 1848 psraw %1, 5 1849%endmacro 1850 1851; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8 1852%macro SSSE3_FilterVertical2_8px 8 1853 movdqa %8, %2 1854 pxor %7, %7 1855 punpcklbw %1, %7 1856 punpcklbw %8, %7 1857 paddw %1, %8 1858 movdqa %7, %3 1859 pmaddubsw %7, %5 1860 paddw %1, %7 1861 movdqa %7, %4 1862 pmaddubsw %7, %6 1863 paddw %1, %7 1864 paddw %1, [pic(h264_w0x10_1)] 1865 psraw %1, 5 1866%endmacro 1867 1868; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 1869%macro SSSE3_FilterHorizontalbw_8px 6 
1870 movdqa %5, %1 1871 pshufb %1, %2 1872 pshufb %5, %3 1873 pshufd %6, %1, 10110001b 1874 pmaddubsw %1, [pic(db20_128)] 1875 pmaddubsw %5, %4 1876 pmaddubsw %6, %4 1877 paddw %1, %5 1878 paddw %1, %6 1879%endmacro 1880 1881; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6 1882%macro SSSE3_FilterHorizontal_8px 6 1883 SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6 1884 paddw %1, [pic(h264_w0x10_1)] 1885 psraw %1, 5 1886%endmacro 1887 1888; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 1889%macro SSSE3_FilterHorizontalbw_2x4px 7 1890 movdqa %6, %1 1891 movdqa %7, %2 1892 pshufb %1, %3 1893 pshufb %2, %3 1894 punpcklqdq %1, %2 1895 pshufb %6, %4 1896 pshufb %7, %4 1897 punpcklqdq %6, %7 1898 pshufd %7, %1, 10110001b 1899 pmaddubsw %1, [pic(db20_128)] 1900 pmaddubsw %6, %5 1901 pmaddubsw %7, %5 1902 paddw %1, %6 1903 paddw %1, %7 1904%endmacro 1905 1906; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7 1907%macro SSSE3_FilterHorizontal_2x4px 7 1908 SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7 1909 paddw %1, [pic(h264_w0x10_1)] 1910 psraw %1, 5 1911%endmacro 1912 1913; pixels=%1 -32768>>scale=%2 tmp=%3 1914%macro SSSE3_FilterHorizontalbw_2px 3 1915 pmaddubsw %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)] 1916 pmaddwd %1, %2 1917 pshufd %3, %1, 10110001b 1918 paddd %1, %3 1919%endmacro 1920 1921; pixels=%1 tmp=%2 1922%macro SSSE3_FilterHorizontal_2px 2 1923 SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2 1924 paddd %1, [pic(dd32768_128)] 1925%endmacro 1926 1927; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7 1928%macro SSE2_FilterVerticalw_8px 7 1929 paddw %1, %6 1930 movdqa %7, %2 1931 paddw %7, %5 1932 psubw %1, %7 1933 psraw %1, 2 1934 psubw %1, %7 1935 movdqa %7, %3 1936 paddw %7, %4 1937 paddw %1, %7 1938 psraw %1, 2 1939 paddw %7, [pic(h264_mc_hc_32)] 1940 paddw 
%1, %7 1941 psraw %1, 6 1942%endmacro 1943 1944;*********************************************************************** 1945; void McHorVer02_ssse3(const uint8_t *pSrc, 1946; int32_t iSrcStride, 1947; uint8_t *pDst, 1948; int32_t iDstStride, 1949; int32_t iWidth, 1950; int32_t iHeight) 1951;*********************************************************************** 1952 1953WELS_EXTERN McHorVer02_ssse3 1954%define p_src r0 1955%define i_srcstride r1 1956%define p_dst r2 1957%define i_dststride r3 1958%ifdef X86_32_PICASM 1959%define i_width dword arg5 1960%else 1961%define i_width r4 1962%endif 1963%define i_height r5 1964%define i_srcstride3 r6 1965 %assign push_num 0 1966%ifdef X86_32 1967 push r6 1968 %assign push_num 1 1969%endif 1970 LOAD_6_PARA 1971 PUSH_XMM 8 1972 SIGN_EXTENSION r1, r1d 1973 SIGN_EXTENSION r3, r3d 1974 SIGN_EXTENSION r4, r4d 1975 SIGN_EXTENSION r5, r5d 1976 INIT_X86_32_PIC_NOPRESERVE r4 1977 sub p_src, i_srcstride 1978 sub p_src, i_srcstride 1979 lea i_srcstride3, [3 * i_srcstride] 1980 %assign push_num_begin push_num 1981 cmp i_width, 4 1982 jg .width8or16 1983 1984 movd xmm0, [p_src] 1985 movd xmm4, [p_src + i_srcstride] 1986 punpcklbw xmm0, xmm4 1987 movd xmm1, [p_src + 2 * i_srcstride] 1988 punpcklbw xmm4, xmm1 1989 punpcklqdq xmm0, xmm4 1990 movd xmm4, [p_src + i_srcstride3] 1991 lea p_src, [p_src + 4 * i_srcstride] 1992 punpcklbw xmm1, xmm4 1993 movd xmm2, [p_src] 1994 punpcklbw xmm4, xmm2 1995 punpcklqdq xmm1, xmm4 1996 movd xmm4, [p_src + i_srcstride] 1997 lea p_src, [p_src + 2 * i_srcstride] 1998 punpcklbw xmm2, xmm4 1999 movd xmm3, [p_src] 2000 punpcklbw xmm4, xmm3 2001 punpcklqdq xmm2, xmm4 2002 movdqa xmm5, [pic(db20_128)] 2003 SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2004 packuswb xmm0, xmm0 2005 movd [p_dst], xmm0 2006 psrlq xmm0, 32 2007 movd [p_dst + i_dststride], xmm0 2008 lea p_dst, [p_dst + 2 * i_dststride] 2009 movd xmm4, [p_src + i_srcstride] 2010 punpcklbw 
xmm3, xmm4 2011 movd xmm0, [p_src + 2 * i_srcstride] 2012 punpcklbw xmm4, xmm0 2013 punpcklqdq xmm3, xmm4 2014 SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2015 packuswb xmm1, xmm1 2016 movd [p_dst], xmm1 2017 psrlq xmm1, 32 2018 movd [p_dst + i_dststride], xmm1 2019 cmp i_height, 5 2020 jl .width4_height_le5_done 2021 lea p_dst, [p_dst + 2 * i_dststride] 2022 movd xmm4, [p_src + i_srcstride3] 2023 punpcklbw xmm0, xmm4 2024 jg .width4_height_ge8 2025 SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2026 packuswb xmm2, xmm2 2027 movd [p_dst], xmm2 2028.width4_height_le5_done: 2029 DEINIT_X86_32_PIC_KEEPDEF 2030 POP_XMM 2031 LOAD_6_PARA_POP 2032%ifdef X86_32 2033 pop r6 2034%endif 2035 ret 2036.width4_height_ge8: 2037 lea p_src, [p_src + 4 * i_srcstride] 2038 movd xmm1, [p_src] 2039 punpcklbw xmm4, xmm1 2040 punpcklqdq xmm0, xmm4 2041 SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2042 packuswb xmm2, xmm2 2043 movd [p_dst], xmm2 2044 psrlq xmm2, 32 2045 movd [p_dst + i_dststride], xmm2 2046 lea p_dst, [p_dst + 2 * i_dststride] 2047 movd xmm4, [p_src + i_srcstride] 2048 punpcklbw xmm1, xmm4 2049 movd xmm2, [p_src + 2 * i_srcstride] 2050 punpcklbw xmm4, xmm2 2051 punpcklqdq xmm1, xmm4 2052 SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2053 packuswb xmm3, xmm3 2054 movd [p_dst], xmm3 2055 psrlq xmm3, 32 2056 movd [p_dst + i_dststride], xmm3 2057 cmp i_height, 9 2058 jl .width4_height_ge8_done 2059 lea p_dst, [p_dst + 2 * i_dststride] 2060 movd xmm4, [p_src + i_srcstride3] 2061 punpcklbw xmm2, xmm4 2062 SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4 2063 packuswb xmm0, xmm0 2064 movd [p_dst], xmm0 2065.width4_height_ge8_done: 2066 DEINIT_X86_32_PIC_KEEPDEF 2067 POP_XMM 2068 
LOAD_6_PARA_POP 2069%ifdef X86_32 2070 pop r6 2071%endif 2072 ret 2073 2074.width8or16: 2075 %assign push_num push_num_begin 2076 sub i_height, 1 2077 push i_height 2078 %assign push_num push_num + 1 2079%xdefine i_ycnt i_height 2080%define i_height [r7] 2081.xloop: 2082 push p_src 2083 push p_dst 2084 %assign push_num push_num + 2 2085 test i_ycnt, 1 2086 jnz .yloop_begin_even 2087 movq xmm0, [p_src] 2088 movq xmm1, [p_src + i_srcstride] 2089 punpcklbw xmm0, xmm1 2090 movq xmm2, [p_src + 2 * i_srcstride] 2091 movq xmm3, [p_src + i_srcstride3] 2092 lea p_src, [p_src + 4 * i_srcstride] 2093 punpcklbw xmm2, xmm3 2094 movq xmm4, [p_src] 2095 movq xmm5, [p_src + i_srcstride] 2096 lea p_src, [p_src + 2 * i_srcstride] 2097 punpcklbw xmm4, xmm5 2098 SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7 2099 packuswb xmm0, xmm0 2100 movlps [p_dst], xmm0 2101 add p_dst, i_dststride 2102 jmp .yloop 2103.yloop_begin_even: 2104 movq xmm1, [p_src] 2105 movq xmm2, [p_src + i_srcstride] 2106 movq xmm3, [p_src + 2 * i_srcstride] 2107 add p_src, i_srcstride3 2108 punpcklbw xmm2, xmm3 2109 movq xmm4, [p_src] 2110 movq xmm5, [p_src + i_srcstride] 2111 lea p_src, [p_src + 2 * i_srcstride] 2112 punpcklbw xmm4, xmm5 2113.yloop: 2114 movq xmm6, [p_src] 2115 SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7 2116 movq xmm7, [p_src + i_srcstride] 2117 punpcklbw xmm6, xmm7 2118 SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0 2119 packuswb xmm1, xmm2 2120 movlps [p_dst], xmm1 2121 movhps [p_dst + i_dststride], xmm1 2122 lea p_dst, [p_dst + 2 * i_dststride] 2123 movq xmm0, [p_src + 2 * i_srcstride] 2124 SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1 2125 movq xmm1, [p_src + i_srcstride3] 2126 lea p_src, [p_src + 4 * i_srcstride] 2127 
punpcklbw xmm0, xmm1 2128 SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2 2129 packuswb xmm3, xmm4 2130 movlps [p_dst], xmm3 2131 movhps [p_dst + i_dststride], xmm3 2132 cmp i_ycnt, 4 2133 jle .yloop_exit 2134 lea p_dst, [p_dst + 2 * i_dststride] 2135 movq xmm2, [p_src] 2136 SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3 2137 movq xmm3, [p_src + i_srcstride] 2138 punpcklbw xmm2, xmm3 2139 SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4 2140 packuswb xmm5, xmm6 2141 movlps [p_dst], xmm5 2142 movhps [p_dst + i_dststride], xmm5 2143 lea p_dst, [p_dst + 2 * i_dststride] 2144 movq xmm4, [p_src + 2 * i_srcstride] 2145 SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5 2146 movq xmm5, [p_src + i_srcstride3] 2147 lea p_src, [p_src + 4 * i_srcstride] 2148 punpcklbw xmm4, xmm5 2149 SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6 2150 packuswb xmm7, xmm0 2151 movlps [p_dst], xmm7 2152 movhps [p_dst + i_dststride], xmm7 2153 lea p_dst, [p_dst + 2 * i_dststride] 2154 sub i_ycnt, 8 2155 jg .yloop 2156.yloop_exit: 2157 pop p_dst 2158 pop p_src 2159 %assign push_num push_num - 2 2160 sub i_width, 8 2161 jle .width8or16_done 2162 add p_src, 8 2163 add p_dst, 8 2164 mov i_ycnt, i_height 2165 jmp .xloop 2166.width8or16_done: 2167 pop i_ycnt 2168 %assign push_num push_num - 1 2169 DEINIT_X86_32_PIC 2170 POP_XMM 2171 LOAD_6_PARA_POP 2172%ifdef X86_32 2173 pop r6 2174%endif 2175 ret 2176%undef p_src 2177%undef i_srcstride 2178%undef i_srcstride3 2179%undef p_dst 2180%undef i_dststride 2181%undef i_width 2182%undef i_height 2183%undef i_ycnt 2184 2185 2186;******************************************************************************* 2187; void 
McHorVer20_ssse3(const uint8_t *pSrc, 2188; int iSrcStride, 2189; uint8_t *pDst, 2190; int iDstStride, 2191; int iWidth, 2192; int iHeight); 2193;******************************************************************************* 2194 2195WELS_EXTERN McHorVer20_ssse3 2196%define p_src r0 2197%define i_srcstride r1 2198%define p_dst r2 2199%define i_dststride r3 2200%define i_width r4 2201%define i_height r5 2202 %assign push_num 0 2203 INIT_X86_32_PIC r6 2204 LOAD_6_PARA 2205 PUSH_XMM 7 2206 SIGN_EXTENSION r1, r1d 2207 SIGN_EXTENSION r3, r3d 2208 SIGN_EXTENSION r4, r4d 2209 SIGN_EXTENSION r5, r5d 2210 movdqa xmm4, [pic(shufb_32435465768798A9)] 2211 movdqa xmm5, [pic(shufb_011267784556ABBC)] 2212 movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)] 2213 cmp i_width, 8 2214 je .width8_yloop 2215 jg .width16_yloop 2216.width4_yloop: 2217 movdqu xmm0, [p_src - 2] 2218 movdqu xmm1, [p_src + i_srcstride - 2] 2219 lea p_src, [p_src + 2 * i_srcstride] 2220 SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3 2221 packuswb xmm0, xmm0 2222 movd [p_dst], xmm0 2223 psrlq xmm0, 32 2224 movd [p_dst + i_dststride], xmm0 2225 lea p_dst, [p_dst + 2 * i_dststride] 2226 sub i_height, 2 2227 jg .width4_yloop 2228 POP_XMM 2229 LOAD_6_PARA_POP 2230 DEINIT_X86_32_PIC_KEEPDEF 2231 ret 2232.width8_yloop: 2233 movdqu xmm0, [p_src - 2] 2234 movdqu xmm1, [p_src + i_srcstride - 2] 2235 lea p_src, [p_src + 2 * i_srcstride] 2236 SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3 2237 SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3 2238 packuswb xmm0, xmm1 2239 movlps [p_dst], xmm0 2240 movhps [p_dst + i_dststride], xmm0 2241 lea p_dst, [p_dst + 2 * i_dststride] 2242 sub i_height, 2 2243 jg .width8_yloop 2244 POP_XMM 2245 LOAD_6_PARA_POP 2246 DEINIT_X86_32_PIC_KEEPDEF 2247 ret 2248.width16_yloop: 2249 movdqu xmm0, [p_src - 2] 2250 movdqu xmm1, [p_src + 6] 2251 add p_src, i_srcstride 2252 SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3 2253 
SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3 2254 packuswb xmm0, xmm1 2255 MOVDQ [p_dst], xmm0 2256 add p_dst, i_dststride 2257 sub i_height, 1 2258 jg .width16_yloop 2259 POP_XMM 2260 LOAD_6_PARA_POP 2261 DEINIT_X86_32_PIC 2262 ret 2263%undef p_src 2264%undef i_srcstride 2265%undef p_dst 2266%undef i_dststride 2267%undef i_width 2268%undef i_height 2269 2270 2271;*********************************************************************** 2272; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc, 2273; int32_t iSrcStride, 2274; uint8_t *pDst, 2275; int32_t iDstStride, 2276; int32_t iWidth, 2277; int32_t iHeight); 2278;*********************************************************************** 2279 2280WELS_EXTERN McHorVer20Width5Or9Or17_ssse3 2281%define p_src r0 2282%define i_srcstride r1 2283%define p_dst r2 2284%define i_dststride r3 2285%define i_width r4 2286%define i_height r5 2287 %assign push_num 0 2288 INIT_X86_32_PIC r6 2289 LOAD_6_PARA 2290 PUSH_XMM 8 2291 SIGN_EXTENSION r1, r1d 2292 SIGN_EXTENSION r3, r3d 2293 SIGN_EXTENSION r4, r4d 2294 SIGN_EXTENSION r5, r5d 2295 movdqa xmm5, [pic(shufb_32435465768798A9)] 2296 movdqa xmm6, [pic(shufb_011267784556ABBC)] 2297 movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)] 2298 cmp i_width, 9 2299 je .width9_yloop 2300 jg .width17_yloop 2301.width5_yloop: 2302 movdqu xmm0, [p_src - 2] 2303 add p_src, i_srcstride 2304 SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2 2305 packuswb xmm0, xmm0 2306 movdqa xmm1, xmm0 2307 psrlq xmm1, 8 2308 movd [p_dst], xmm0 2309 movd [p_dst + 1], xmm1 2310 add p_dst, i_dststride 2311 sub i_height, 1 2312 jg .width5_yloop 2313 POP_XMM 2314 LOAD_6_PARA_POP 2315 DEINIT_X86_32_PIC_KEEPDEF 2316 ret 2317.width9_yloop: 2318 movdqu xmm0, [p_src - 2] 2319 movdqu xmm4, [p_src + i_srcstride - 2] 2320 lea p_src, [p_src + 2 * i_srcstride] 2321 movdqa xmm3, xmm0 2322 punpckhqdq xmm3, xmm4 2323 SSSE3_FilterHorizontal_2px xmm3, xmm2 2324 SSSE3_FilterHorizontal_8px xmm0, xmm5, 
xmm6, xmm7, xmm1, xmm2 2325 packuswb xmm3, xmm0 2326 movd [p_dst + 5], xmm3 2327 movhps [p_dst], xmm3 2328 add p_dst, i_dststride 2329 SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2 2330 packuswb xmm4, xmm4 2331 psrldq xmm3, 4 2332 movd [p_dst + 5], xmm3 2333 movlps [p_dst], xmm4 2334 add p_dst, i_dststride 2335 sub i_height, 2 2336 jg .width9_yloop 2337 POP_XMM 2338 LOAD_6_PARA_POP 2339 DEINIT_X86_32_PIC_KEEPDEF 2340 ret 2341.width17_yloop: 2342 movdqu xmm0, [p_src - 2] 2343 movdqu xmm3, [p_src + 6] 2344 add p_src, i_srcstride 2345 movdqa xmm4, xmm3 2346 SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2 2347 SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2 2348 packuswb xmm0, xmm3 2349 movdqu xmm1, [p_src - 2] 2350 movdqu xmm3, [p_src + 6] 2351 add p_src, i_srcstride 2352 punpckhqdq xmm4, xmm3 2353 SSSE3_FilterHorizontal_2px xmm4, xmm2 2354 packuswb xmm4, xmm4 2355 movd [p_dst + 13], xmm4 2356 MOVDQ [p_dst], xmm0 2357 add p_dst, i_dststride 2358 psrldq xmm4, 4 2359 movd [p_dst + 13], xmm4 2360 SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2 2361 SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2 2362 packuswb xmm1, xmm3 2363 MOVDQ [p_dst], xmm1 2364 add p_dst, i_dststride 2365 sub i_height, 2 2366 jg .width17_yloop 2367 POP_XMM 2368 LOAD_6_PARA_POP 2369 DEINIT_X86_32_PIC 2370 ret 2371%undef p_src 2372%undef i_srcstride 2373%undef p_dst 2374%undef i_dststride 2375%undef i_width 2376%undef i_height 2377 2378 2379;******************************************************************************* 2380; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc, 2381; int iSrcStride, 2382; int16_t *pDst, 2383; int iHeight); 2384;******************************************************************************* 2385 2386WELS_EXTERN McHorVer20Width4U8ToS16_ssse3 2387%define p_src r0 2388%define i_srcstride r1 2389%define p_dst r2 2390%define i_height r3 2391 %assign push_num 0 2392 INIT_X86_32_PIC r4 2393 LOAD_4_PARA 
2394 PUSH_XMM 7 2395 SIGN_EXTENSION r1, r1d 2396 SIGN_EXTENSION r3, r3d 2397 sub p_src, i_srcstride 2398 sub p_src, i_srcstride 2399 movdqa xmm4, [pic(shufb_32435465768798A9)] 2400 movdqa xmm5, [pic(shufb_011267784556ABBC)] 2401 movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)] 2402 sub i_height, 1 2403.yloop: 2404 movdqu xmm0, [p_src - 2] 2405 movdqu xmm1, [p_src + i_srcstride - 2] 2406 lea p_src, [p_src + 2 * i_srcstride] 2407 SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3 2408 movdqa [p_dst], xmm0 2409 add p_dst, 16 2410 sub i_height, 2 2411 jg .yloop 2412 ; Height % 2 remainder. 2413 movdqu xmm0, [p_src - 2] 2414 SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3 2415 movlps [p_dst], xmm0 2416 POP_XMM 2417 LOAD_4_PARA_POP 2418 DEINIT_X86_32_PIC 2419 ret 2420%undef p_src 2421%undef i_srcstride 2422%undef p_dst 2423%undef i_height 2424 2425 2426;*********************************************************************** 2427; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc, 2428; uint8_t *pDst, 2429; int32_t iDstStride, 2430; int32_t iHeight); 2431;*********************************************************************** 2432 2433WELS_EXTERN McHorVer02Width4S16ToU8_ssse3 2434%define p_src r0 2435%define p_dst r1 2436%define i_dststride r2 2437%define i_height r3 2438%define i_srcstride 8 2439 %assign push_num 0 2440 INIT_X86_32_PIC r4 2441 LOAD_4_PARA 2442 PUSH_XMM 8 2443 SIGN_EXTENSION r2, r2d 2444 SIGN_EXTENSION r3, r3d 2445 movdqa xmm0, [p_src + 0 * i_srcstride] 2446 movdqu xmm1, [p_src + 1 * i_srcstride] 2447 movdqa xmm2, [p_src + 2 * i_srcstride] 2448 movdqu xmm3, [p_src + 3 * i_srcstride] 2449 movdqa xmm4, [p_src + 4 * i_srcstride] 2450 movdqu xmm5, [p_src + 5 * i_srcstride] 2451 movdqa xmm6, [p_src + 6 * i_srcstride] 2452 SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7 2453 packuswb xmm0, xmm0 2454 movd [p_dst], xmm0 2455 psrlq xmm0, 32 2456 movd [p_dst + i_dststride], xmm0 2457 lea p_dst, [p_dst 
+ 2 * i_dststride] 2458 movdqu xmm7, [p_src + 7 * i_srcstride] 2459 movdqa xmm0, [p_src + 8 * i_srcstride] 2460 SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1 2461 packuswb xmm2, xmm2 2462 movd [p_dst], xmm2 2463 psrlq xmm2, 32 2464 movd [p_dst + i_dststride], xmm2 2465 cmp i_height, 4 2466 jle .done 2467 lea p_dst, [p_dst + 2 * i_dststride] 2468 movdqu xmm1, [p_src + 9 * i_srcstride] 2469 movdqa xmm2, [p_src + 10 * i_srcstride] 2470 SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3 2471 packuswb xmm4, xmm4 2472 movd [p_dst], xmm4 2473 psrlq xmm4, 32 2474 movd [p_dst + i_dststride], xmm4 2475 lea p_dst, [p_dst + 2 * i_dststride] 2476 movdqu xmm3, [p_src + 11 * i_srcstride] 2477 SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5 2478 packuswb xmm6, xmm6 2479 movd [p_dst], xmm6 2480 psrlq xmm6, 32 2481 movd [p_dst + i_dststride], xmm6 2482.done: 2483 POP_XMM 2484 LOAD_4_PARA_POP 2485 DEINIT_X86_32_PIC 2486 ret 2487%undef p_src 2488%undef p_dst 2489%undef i_dststride 2490%undef i_height 2491%undef i_srcstride 2492 2493 2494;*********************************************************************** 2495; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc, 2496; int16_t iSrcStride, 2497; int16_t *pDst, 2498; int32_t iDstStride, 2499; int32_t iHeight); 2500;*********************************************************************** 2501 2502WELS_EXTERN McHorVer20Width8U8ToS16_ssse3 2503%define p_src r0 2504%define i_srcstride r1 2505%define p_dst r2 2506%define i_dststride r3 2507%define i_height r4 2508 %assign push_num 0 2509 INIT_X86_32_PIC r5 2510 LOAD_5_PARA 2511 PUSH_XMM 7 2512 SIGN_EXTENSION r1, r1d 2513 SIGN_EXTENSION r3, r3d 2514 SIGN_EXTENSION r4, r4d 2515 sub p_src, i_srcstride 2516 sub p_src, i_srcstride 2517 movdqa xmm4, [pic(shufb_32435465768798A9)] 2518 movdqa xmm5, [pic(shufb_011267784556ABBC)] 2519 movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)] 2520 sub i_height, 1 2521.yloop: 2522 movdqu xmm0, [p_src 
- 2]                                    ; (tail of the load that begins on the previous line)
    movdqu xmm1, [p_src + i_srcstride - 2]
    lea p_src, [p_src + 2 * i_srcstride]
    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ [p_dst], xmm0
    add p_dst, i_dststride
    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ [p_dst], xmm1
    add p_dst, i_dststride
    sub i_height, 2
    jg .yloop
    jl .done
    ; i_height reached exactly 0: one more row is still pending.
    movdqu xmm0, [p_src - 2]
    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ [p_dst], xmm0
.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height


;***********************************************************************
; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,
;                                    int32_t iTapStride,
;                                    uint8_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight);
;
; Vertical filter pass over rows of 16-bit intermediates, packed to bytes.
; Each output row is written as two overlapping 4-byte stores at
; [p_dst + 1] and [p_dst], i.e. 5 bytes per row.
; The code is fully unrolled: 5 rows are always produced, and 4 more when
; i_height > 5 (so it handles heights 5 and 9 only).
; Source rows are loaded with movdqa, so p_src and the stride must keep
; every row 16-byte aligned.
; NOTE(review): the exact taps/rounding live in SSE2_FilterVerticalw_8px,
; which is defined outside this part of the file.
;***********************************************************************

WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_height     r4
%define i_srcstride3 r5
    %assign push_num 0
%ifdef X86_32
    push r5
    %assign push_num 1
%endif
    INIT_X86_32_PIC r6
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    lea i_srcstride3, [3 * i_srcstride]
    ; Prime the six-row window: rows 0..5 in xmm0..xmm5.
    movdqa xmm0, [p_src]
    movdqa xmm1, [p_src + i_srcstride]
    movdqa xmm2, [p_src + 2 * i_srcstride]
    movdqa xmm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    movdqa xmm4, [p_src]
    movdqa xmm5, [p_src + i_srcstride]
    ; Output row 0. The register window rotates by one for every row below.
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movdqa xmm6, [p_src + 2 * i_srcstride]
    packuswb xmm0, xmm0
    movdqa xmm7, xmm0
    psrlq xmm7, 8                       ; bytes 1..4 moved down to the low dword
    movd [p_dst + 1], xmm7
    movd [p_dst], xmm0
    add p_dst, i_dststride
    ; Output row 1.
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movdqa xmm7, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    packuswb xmm1, xmm1
    movdqa xmm0, xmm1
    psrlq xmm0, 8
    movd [p_dst + 1], xmm0
    movd [p_dst], xmm1
    add p_dst, i_dststride
    ; Output row 2.
    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
    movdqa xmm0, [p_src]
    packuswb xmm2, xmm2
    movdqa xmm1, xmm2
    psrlq xmm1, 8
    movd [p_dst + 1], xmm1
    movd [p_dst], xmm2
    add p_dst, i_dststride
    ; Output row 3.
    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
    packuswb xmm3, xmm3
    movdqa xmm2, xmm3
    psrlq xmm2, 8
    movd [p_dst + 1], xmm2
    movd [p_dst], xmm3
    add p_dst, i_dststride
    ; Output row 4.
    movdqa xmm1, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
    packuswb xmm4, xmm4
    movdqa xmm3, xmm4
    psrlq xmm3, 8
    movd [p_dst + 1], xmm3
    movd [p_dst], xmm4
    cmp i_height, 5
    jle .done
    ; Taller block: four further rows (5..8).
    add p_dst, i_dststride
    movdqa xmm2, [p_src + 2 * i_srcstride]
    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
    movdqa xmm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    packuswb xmm5, xmm5
    movdqa xmm4, xmm5
    psrlq xmm4, 8
    movd [p_dst + 1], xmm4
    movd [p_dst], xmm5
    add p_dst, i_dststride
    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm4, [p_src]
    packuswb xmm6, xmm6
    movdqa xmm5, xmm6
    psrlq xmm5, 8
    movd [p_dst + 1], xmm5
    movd [p_dst], xmm6
    add p_dst, i_dststride
    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    packuswb xmm7, xmm7
    movdqa xmm6, xmm7
    psrlq xmm6, 8
    movd [p_dst + 1], xmm6
    movd [p_dst], xmm7
    add p_dst, i_dststride
    movdqa xmm5, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    packuswb xmm0, xmm0
    movdqa xmm7, xmm0
    psrlq xmm7, 8
    movd [p_dst + 1], xmm7
    movd [p_dst], xmm0
.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r5
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3


;***********************************************************************
; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,
;                                        int32_t iSrcStride,
;                                        int16_t *pDst,
;                                        int32_t iDstStride,
;                                        int32_t iWidth,
;                                        int32_t iHeight);
;
; Horizontal filter pass from bytes to 16-bit values, two rows per
; iteration. p_src is first moved up by two rows. Any iWidth other than 9
; takes the 17-wide path. iDstStride is added to the byte address of pDst.
; The extra (9th / 17th) column of a row pair is produced by one
; SSSE3_FilterHorizontalbw_2px call and stored with movlps/movhps BEFORE
; the full 16-byte store of that row, which then overwrites the
; overlapping bytes -- do not reorder those stores.
;***********************************************************************

WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    pcmpeqw xmm4, xmm4
    psllw xmm4, 15 ; dw -32768
    movdqa xmm5, [pic(shufb_32435465768798A9)]
    movdqa xmm6, [pic(shufb_011267784556ABBC)]
    movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp i_width, 9
    jne .width17_yloop

.width9_yloop:
    movdqu xmm0, [p_src - 2]
    movdqa xmm3, xmm0                   ; keep the raw row for the extra column
    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    movdqu xmm2, [p_src + i_srcstride - 2]
    lea p_src, [p_src + 2 * i_srcstride]
    punpckhqdq xmm3, xmm2               ; high halves of both rows side by side
    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1
    movlps [p_dst + 10], xmm3
    MOVDQ [p_dst], xmm0
    add p_dst, i_dststride
    movhps [p_dst + 10], xmm3
    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
    MOVDQ [p_dst], xmm2
    add p_dst, i_dststride
    sub i_height, 2
    jg .width9_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret

.width17_yloop:
    movdqu xmm0, [p_src - 2]
    movdqu xmm3, [p_src + 6]
    add p_src, i_srcstride
    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    MOVDQ [p_dst], xmm0
    movdqa xmm0, xmm3                   ; keep the raw right half for the extra column
    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
    movdqu xmm2, [p_src + 6]
    punpckhqdq xmm0, xmm2
    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1
    movdqu xmm1, [p_src - 2]
    add p_src, i_srcstride
    movlps [p_dst + 26], xmm0
    MOVDQ [p_dst + 16], xmm3
    add p_dst, i_dststride
    movhps [p_dst + 26], xmm0
    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
    MOVDQ [p_dst], xmm1
    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
    MOVDQ [p_dst + 16], xmm2
    add p_dst, i_dststride
    sub i_height, 2
    jg .width17_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;***********************************************************************
; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,
;                                      int32_t iSrcStride,
;                                      uint8_t *pDst,
;                                      int32_t iDstStride,
;                                      int32_t iWidth,
;                                      int32_t iHeight);
;
; Vertical filter pass over rows of 16-bit intermediates, packed to bytes.
; Layout of the routine:
;   * i_height is decremented once and pushed; [r7] is that saved value and
;     is reloaded at the start of every column.
;   * If iWidth is odd, the last column is handled first by the "unalign"
;     code, which gathers that column's values across rows into one
;     register; i_width is then reduced by 1.
;   * The remaining width is processed in 8-pixel columns (.width_loop),
;     using aligned 16-byte row loads.
; Every way out of a column's row loop goes through .x_loop_dec so that
; the remaining columns are processed. This includes the height == 5 path
; (.store_xmm4_exit), which previously fell straight into .done and
; skipped them whenever more than one 8-pixel column was requested.
; In X86_32_PICASM builds i_width lives in its stack slot (arg5) because
; r4 is handed to INIT_X86_32_PIC_NOPRESERVE; that slot is addressed via
; push_num, so the %assign bookkeeping must track every push/pop.
;***********************************************************************

WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%ifdef X86_32_PICASM
%define i_width      dword arg5
%else
%define i_width      r4
%endif
%define i_height     r5
%define i_srcstride3 r6
    %assign push_num 0
%ifdef X86_32
    push r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub i_height, 1
    push i_height                       ; saved copy, reloaded through [r7]
    %assign push_num push_num + 1
    lea i_srcstride3, [3 * i_srcstride]
    test i_width, 1
    jz .width_loop
    ; ---- odd width: handle the last column on its own ----
    push p_src
    push p_dst
    %assign push_num push_num + 2
%ifdef X86_32_PICASM
    add p_src, i_width
    add p_src, i_width
    sub p_src, 2
%else
    lea p_src, [p_src + 2 * i_width - 2]
%endif
    add p_dst, i_width
    ; Gather the column: one 16-bit value per row, packed across lanes.
    movd xmm0, [p_src]
    punpcklwd xmm0, [p_src + i_srcstride]
    movd xmm1, [p_src + 2 * i_srcstride]
    add p_src, i_srcstride3
    punpcklwd xmm1, [p_src]
    punpckldq xmm0, xmm1
    movd xmm1, [p_src + i_srcstride]
    cmp i_height, 4
    je .filter5_unalign
    punpcklwd xmm1, [p_src + 2 * i_srcstride]
    movd xmm2, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    punpcklwd xmm2, [p_src]
    punpckldq xmm1, xmm2
    punpcklqdq xmm0, xmm1
.height_loop_unalign:
    ; Each palignr shifts the row window by one row (2 bytes).
    movd xmm1, [p_src + i_srcstride]
    palignr xmm1, xmm0, 2
    movd xmm2, [p_src + 2 * i_srcstride]
    palignr xmm2, xmm1, 2
    movd xmm3, [p_src + i_srcstride3]
    palignr xmm3, xmm2, 2
    lea p_src, [p_src + 4 * i_srcstride]
    movd xmm4, [p_src]
    palignr xmm4, xmm3, 2
    movd xmm5, [p_src + i_srcstride]
    palignr xmm5, xmm4, 2
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    packuswb xmm0, xmm0
    ; Scatter the eight results down the column. Each store ends at the
    ; column byte (p_dst - 1); the shifts move the wanted byte to the top of
    ; the stored dword / qword.
    movdqa xmm6, xmm0
    pslld xmm6, 24
    movd [p_dst - 4], xmm6
    movlps [p_dst + 4 * i_dststride - 8], xmm6
    add p_dst, i_dststride
    movdqa xmm6, xmm0
    pslld xmm6, 16
    movd [p_dst - 4], xmm6
    movlps [p_dst + 4 * i_dststride - 8], xmm6
    add p_dst, i_dststride
    movdqa xmm6, xmm0
    pslld xmm6, 8
    movd [p_dst - 4], xmm6
    movd [p_dst + i_dststride - 4], xmm0
    lea p_dst, [p_dst + 4 * i_dststride]
    movlps [p_dst - 8], xmm6
    movlps [p_dst + i_dststride - 8], xmm0
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jle .height_loop_unalign_exit
    movd xmm1, [p_src + 2 * i_srcstride]
    palignr xmm1, xmm5, 2
    movd xmm0, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    punpcklwd xmm0, [p_src]
    palignr xmm0, xmm1, 4
    jmp .height_loop_unalign
.height_loop_unalign_exit:
    movddup xmm6, [p_src + 2 * i_srcstride - 6]
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    packuswb xmm1, xmm1
    movlps [p_dst - 8], xmm1
    jmp .unalign_done
.filter5_unalign:
    ; Five-row variant of the odd-column code (i_height == 4 here).
    pslldq xmm0, 8
    palignr xmm1, xmm0, 2
    movd xmm2, [p_src + 2 * i_srcstride]
    palignr xmm2, xmm1, 2
    movd xmm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    palignr xmm3, xmm2, 2
    movd xmm4, [p_src]
    palignr xmm4, xmm3, 2
    movd xmm5, [p_src + i_srcstride]
    palignr xmm5, xmm4, 2
    movd xmm6, [p_src + 2 * i_srcstride]
    palignr xmm6, xmm5, 2
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    packuswb xmm1, xmm1
    movdqa xmm0, xmm1
    psrlq xmm1, 8
    movdqa xmm2, xmm0
    psrlq xmm2, 16
    movdqa xmm3, xmm0
    psrlq xmm3, 24
    movd [p_dst - 4], xmm0
    movd [p_dst + i_dststride - 4], xmm1
    lea p_dst, [p_dst + 2 * i_dststride]
    movd [p_dst - 4], xmm2
    movd [p_dst + i_dststride - 4], xmm3
    movlps [p_dst + 2 * i_dststride - 8], xmm0
.unalign_done:
    pop p_dst
    pop p_src
    %assign push_num push_num - 2
    mov i_height, [r7]
    sub i_width, 1
    ; ---- 8-pixel columns ----
.width_loop:
    push p_src
    push p_dst
    %assign push_num push_num + 2
    movdqa xmm0, [p_src]
    movdqa xmm1, [p_src + i_srcstride]
    movdqa xmm2, [p_src + 2 * i_srcstride]
    movdqa xmm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    movdqa xmm4, [p_src]
.height_loop:
    movdqa xmm5, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movdqa xmm6, [p_src + 2 * i_srcstride]
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movdqa xmm7, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    packuswb xmm0, xmm1
    movlps [p_dst], xmm0
    movhps [p_dst + i_dststride], xmm0
    lea p_dst, [p_dst + 2 * i_dststride]
    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
    movdqa xmm0, [p_src]
    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
    packuswb xmm2, xmm3
    movlps [p_dst], xmm2
    movhps [p_dst + i_dststride], xmm2
    cmp i_height, 4
    jl .x_loop_dec
    lea p_dst, [p_dst + 2 * i_dststride]
    movdqa xmm1, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
    je .store_xmm4_exit                 ; still the flags of "cmp i_height, 4" (lea/SSE leave them intact)
    movdqa xmm2, [p_src + 2 * i_srcstride]
    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
    movdqa xmm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    packuswb xmm4, xmm5
    movlps [p_dst], xmm4
    movhps [p_dst + i_dststride], xmm4
    lea p_dst, [p_dst + 2 * i_dststride]
    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa xmm4, [p_src]
    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    packuswb xmm6, xmm7
    movlps [p_dst], xmm6
    movhps [p_dst + i_dststride], xmm6
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jg .height_loop
    jl .x_loop_dec
    ; Exactly one row left in this column.
    movdqa xmm5, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    packuswb xmm0, xmm0
    movlps [p_dst], xmm0
.x_loop_dec:
    pop p_dst
    pop p_src
    %assign push_num push_num - 2
    sub i_width, 8
    jle .done
    mov i_height, [r7]
    add p_src, 16                       ; next 8 int16 source columns
    add p_dst, 8                        ; next 8 output bytes
    jmp .width_loop
.store_xmm4_exit:
    ; Fifth and last row of a height-5 column. Go through the shared column
    ; advance (which pops p_dst/p_src) instead of leaving the function, so
    ; that any remaining 8-pixel columns are still processed.
    packuswb xmm4, xmm4
    movlps [p_dst], xmm4
    jmp .x_loop_dec
.done:
    pop i_height
    %assign push_num push_num - 1
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3


%ifdef HAVE_AVX2

; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
; Result (unrounded 16-bit sums) is left in %1.
%macro AVX2_FilterHorizontalbw_16px 6
    vpshufb %5, %1, %3
    vpshufb %1, %1, %2
    vpshufd %6, %1, 10110001b
    vpmaddubsw %1, %1, [pic(db20_256)]
    vpmaddubsw %5, %5, %4
    vpmaddubsw %6, %6, %4
    vpaddw %1, %1, %5
    vpaddw %1, %1, %6
%endmacro

; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 db20=%4 tmp=%5,%6
; Same as AVX2_FilterHorizontalbw_16px, then adds 16 and shifts right by 5.
%macro AVX2_FilterHorizontal_16px 6
    AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
    vpaddw %1, %1, [pic(h264_w0x10_256)]
    vpsraw %1, %1, 5
%endmacro

; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
; Combines the low quadwords of the shuffled %1 and %2; result is left in %1.
%macro AVX2_FilterHorizontalbw_4x4px 7
    vpshufb %6, %1, %4
    vpshufb %7, %2, %4
    vpshufb %1, %1, %3
    vpshufb %2, %2, %3
    vpunpcklqdq %1, %1, %2
    vpunpcklqdq %6, %6, %7
    vpshufd %7, %1, 10110001b
    vpmaddubsw %1, %1, [pic(db20_256)]
    vpmaddubsw %6, %6, %5
    vpmaddubsw %7, %7, %5
    vpaddw %1, %1, %6
    vpaddw %1, %1, %7
%endmacro

; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 db20=%5 tmp=%6,%7
; Same as AVX2_FilterHorizontalbw_4x4px, then adds 16 and shifts right by 5.
%macro AVX2_FilterHorizontal_4x4px 7
    AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
    vpaddw %1, %1, [pic(h264_w0x10_256)]
    vpsraw %1, %1, 5
%endmacro

; pixels=%1 -32768>>scale=%2 tmp=%3
%macro AVX2_FilterHorizontalbw_4px 3
    vpmaddubsw %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
    vpmaddwd %1, %1, %2
    vpshufd %3, %1, 10110001b
    vpaddd %1, %1, %3
%endmacro

; pixels=%1 tmp=%2
%macro AVX2_FilterHorizontal_4px 2
    AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
    vpaddd %1, %1, [pic(dd32768_256)]
%endmacro

; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
; Inputs are byte-interleaved row pairs; the result ((sum + 16) >> 5) is left in %1.
%macro AVX2_FilterVertical_16px 7
    vpmaddubsw %1, %1, %4
    vpmaddubsw %7, %2, %5
    vpaddw %1, %1, %7
    vpmaddubsw %7, %3, %6
    vpaddw %1, %1, %7
    vpaddw %1, %1, [pic(h264_w0x10_256)]
    vpsraw %1, %1, 5
%endmacro

; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
; %1 and %2 are plain (non-interleaved) rows and are widened with a zero
; register; %3 and %4 are interleaved row pairs. Result is left in %1.
%macro AVX2_FilterVertical2_16px 8
    vpxor %7, %7, %7
    vpunpcklbw %1, %1, %7
    vpunpcklbw %8, %2, %7
    vpaddw %1, %1, %8
    vpmaddubsw %7, %3, %5
    vpaddw %1, %1, %7
    vpmaddubsw %7, %4, %6
    vpaddw %1, %1, %7
    vpaddw %1, %1, [pic(h264_w0x10_256)]
    vpsraw %1, %1, 5
%endmacro

; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
; 16-bit vertical pass over six rows of intermediates; result is left in %1.
%macro AVX2_FilterVerticalw_16px 7
    vpaddw %1, %1, %6
    vpaddw %7, %2, %5
    vpsubw %1, %1, %7
    vpsraw %1, %1, 2
    vpsubw %1, %1, %7
    vpaddw %7, %3, %4
    vpaddw %1, %1, %7
    vpsraw %1, %1, 2
    vpaddw %7, %7, [pic(dw32_256)]
    vpaddw %1, %1, %7
    vpsraw %1, %1, 6
%endmacro

;***********************************************************************
; void McHorVer02_avx2(const uint8_t *pSrc,
;                      int32_t iSrcStride,
;                      uint8_t *pDst,
;                      int32_t iDstStride,
;                      int32_t iWidth,
;                      int32_t iHeight)
;
; Vertical filter pass, bytes in and bytes out. p_src is first moved up
; by two rows. Dispatch on iWidth: == 8 -> .width8, > 8 -> .width16,
; otherwise the 4-wide code that follows the dispatch.
; Rows are kept as byte-interleaved pairs so that vpmaddubsw can apply two
; taps at once. The 4- and 8-wide paths pack two row pairs into the two
; 128-bit lanes of one ymm register.
; The 16-wide path stores with vmovdqa, so p_dst rows must be 16-byte
; aligned there.
;***********************************************************************

WELS_EXTERN McHorVer02_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%ifdef X86_32_PICASM
%define i_width      dword arg5
%else
%define i_width      r4
%endif
%define i_height     r5
%define i_srcstride3 r6
    %assign push_num 0
%ifdef X86_32
    push r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    lea i_srcstride3, [3 * i_srcstride]
    cmp i_width, 8
    je .width8
    jg .width16
; .width4:
    ; Build interleaved row pairs; ymm0/ymm1/ymm2 each end up holding four
    ; consecutive pairs (two per 128-bit lane).
    vmovd xmm0, [p_src]
    vpbroadcastd xmm5, [p_src + i_srcstride]
    vpunpcklbw xmm0, xmm0, xmm5
    vpbroadcastd ymm1, [p_src + 2 * i_srcstride]
    vpunpcklbw xmm5, xmm5, xmm1
    vpblendd xmm0, xmm0, xmm5, 1100b
    vpbroadcastd ymm5, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw ymm1, ymm1, ymm5
    vpbroadcastd ymm2, [p_src]
    vpunpcklbw ymm5, ymm5, ymm2
    vpblendd ymm1, ymm1, ymm5, 11001100b
    vpblendd ymm0, ymm0, ymm1, 11110000b
    vpbroadcastd ymm5, [p_src + i_srcstride]
    lea p_src, [p_src + 2 * i_srcstride]
    vpunpcklbw ymm2, ymm2, ymm5
    vpbroadcastd ymm3, [p_src]
    vpunpcklbw ymm5, ymm5, ymm3
    vpblendd ymm2, ymm2, ymm5, 11001100b
    vpblendd ymm1, ymm1, ymm2, 11110000b
    vpbroadcastd ymm5, [p_src + i_srcstride]
    vpunpcklbw ymm3, ymm3, ymm5
    vpbroadcastd ymm4, [p_src + 2 * i_srcstride]
    vpunpcklbw ymm5, ymm5, ymm4
    vpblendd ymm3, ymm3, ymm5, 11001100b
    vpblendd ymm2, ymm2, ymm3, 11110000b
    vbroadcasti128 ymm6, [pic(db20_128)]
    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
    vpackuswb ymm0, ymm0, ymm0
    ; Rows 0..3: one dword each.
    vmovd [p_dst], xmm0
    vpsrlq xmm5, xmm0, 32
    vmovd [p_dst + i_dststride], xmm5
    lea p_dst, [p_dst + 2 * i_dststride]
    vextracti128 xmm0, ymm0, 1
    vmovd [p_dst], xmm0
    vpsrlq xmm5, xmm0, 32
    vmovd [p_dst + i_dststride], xmm5
    cmp i_height, 5
    jl .width4_done
    lea p_dst, [p_dst + 2 * i_dststride]
    vpbroadcastd ymm5, [p_src + i_srcstride3]
    vpunpcklbw ymm4, ymm4, ymm5
    jg .width4_height_ge8               ; still the flags of "cmp i_height, 5"
    ; i_height == 5: one more row.
    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
    vpackuswb xmm2, xmm2, xmm2
    vmovd [p_dst], xmm2
    jmp .width4_done
.width4_height_ge8:
    lea p_src, [p_src + 4 * i_srcstride]
    vpbroadcastd ymm1, [p_src]
    vpunpcklbw ymm5, ymm5, ymm1
    vpblendd ymm4, ymm4, ymm5, 11001100b
    vpblendd ymm3, ymm3, ymm4, 11110000b
    vpbroadcastd ymm5, [p_src + i_srcstride]
    vpunpcklbw ymm1, ymm1, ymm5
    vpbroadcastd ymm0, [p_src + 2 * i_srcstride]
    vpunpcklbw ymm5, ymm5, ymm0
    vpblendd ymm1, ymm1, ymm5, 11001100b
    vpblendd ymm4, ymm4, ymm1, 11110000b
    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
    vpackuswb ymm2, ymm2, ymm2
    ; Rows 4..7.
    vmovd [p_dst], xmm2
    vpsrlq xmm5, xmm2, 32
    vmovd [p_dst + i_dststride], xmm5
    lea p_dst, [p_dst + 2 * i_dststride]
    vextracti128 xmm2, ymm2, 1
    vmovd [p_dst], xmm2
    vpsrlq xmm5, xmm2, 32
    vmovd [p_dst + i_dststride], xmm5
    cmp i_height, 9
    jl .width4_done
    ; Ninth row.
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovd xmm5, [p_src + i_srcstride3]
    vpunpcklbw xmm0, xmm0, xmm5
    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
    vpackuswb xmm4, xmm4, xmm4
    vmovd [p_dst], xmm4
.width4_done:
    vzeroupper
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

.width8:
    sub i_height, 1
    ; Each ymm holds two interleaved row pairs, one per 128-bit lane.
    vmovq xmm0, [p_src]
    vmovq xmm4, [p_src + i_srcstride]
    vpunpcklbw xmm0, xmm0, xmm4
    vmovq xmm1, [p_src + 2 * i_srcstride]
    vpunpcklbw xmm4, xmm4, xmm1
    vinserti128 ymm0, ymm0, xmm4, 1
    vmovq xmm4, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw xmm1, xmm1, xmm4
    vmovq xmm6, [p_src]
    vpunpcklbw xmm4, xmm4, xmm6
    vinserti128 ymm1, ymm1, xmm4, 1
.width8_yloop:
    vmovq xmm4, [p_src + i_srcstride]
    vpunpcklbw xmm2, xmm6, xmm4
    vmovq xmm3, [p_src + 2 * i_srcstride]
    vpunpcklbw xmm4, xmm4, xmm3
    vinserti128 ymm2, ymm2, xmm4, 1
    vbroadcasti128 ymm5, [pic(db20_128)]
    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vmovq xmm4, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw xmm3, xmm3, xmm4
    vmovq xmm6, [p_src]
    vpunpcklbw xmm4, xmm4, xmm6
    vinserti128 ymm3, ymm3, xmm4, 1
    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb ymm0, ymm0, ymm1
    ; Four rows out: low/high qwords of each lane.
    vmovlps [p_dst], xmm0
    vextracti128 xmm1, ymm0, 1
    vmovlps [p_dst + i_dststride], xmm1
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovhps [p_dst], xmm0
    vmovhps [p_dst + i_dststride], xmm1
    cmp i_height, 4
    jl .width8_done
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovq xmm4, [p_src + i_srcstride]
    vpunpcklbw xmm0, xmm6, xmm4
    jg .width8_height_ge8               ; still the flags of "cmp i_height, 4"
    ; i_height == 4 (five rows requested): one more row.
    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
    vpackuswb xmm2, xmm2, xmm2
    vmovlps [p_dst], xmm2
    jmp .width8_done
.width8_height_ge8:
    vmovq xmm1, [p_src + 2 * i_srcstride]
    vpunpcklbw xmm4, xmm4, xmm1
    vinserti128 ymm0, ymm0, xmm4, 1
    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vmovq xmm4, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw xmm1, xmm1, xmm4
    vmovq xmm6, [p_src]
    vpunpcklbw xmm4, xmm4, xmm6
    vinserti128 ymm1, ymm1, xmm4, 1
    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb ymm2, ymm2, ymm3
    vmovlps [p_dst], xmm2
    vextracti128 xmm3, ymm2, 1
    vmovlps [p_dst + i_dststride], xmm3
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovhps [p_dst], xmm2
    vmovhps [p_dst + i_dststride], xmm3
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jg .width8_yloop
    jl .width8_done
    ; Exactly one row left.
    vmovq xmm4, [p_src + i_srcstride]
    vpunpcklbw xmm2, xmm6, xmm4
    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
    vpackuswb xmm0, xmm0, xmm0
    vmovlps [p_dst], xmm0
.width8_done:
    vzeroupper
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret

.width16:
    sub i_height, 1
    test i_height, 1
    jnz .width16_yloop_begin_even
    ; (i_height - 1) is even: emit one leading row, then join the 8-row loop.
    ; Each 16-byte row is loaded as two qwords, one per 128-bit lane.
    vmovq xmm0, [p_src]
    vpbroadcastq ymm1, [p_src + 8]
    vpblendd ymm0, ymm0, ymm1, 11110000b
    vmovq xmm1, [p_src + i_srcstride]
    vpbroadcastq ymm2, [p_src + i_srcstride + 8]
    vpblendd ymm1, ymm1, ymm2, 11110000b
    vpunpcklbw ymm0, ymm0, ymm1
    vmovq xmm2, [p_src + 2 * i_srcstride]
    vpbroadcastq ymm3, [p_src + 2 * i_srcstride + 8]
    vpblendd ymm2, ymm2, ymm3, 11110000b
    vmovq xmm3, [p_src + i_srcstride3]
    vpbroadcastq ymm4, [p_src + i_srcstride3 + 8]
    lea p_src, [p_src + 4 * i_srcstride]
    vpblendd ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw ymm2, ymm2, ymm3
    vmovq xmm4, [p_src]
    vpbroadcastq ymm5, [p_src + 8]
    vpblendd ymm4, ymm4, ymm5, 11110000b
    vmovq xmm5, [p_src + i_srcstride]
    vpbroadcastq ymm6, [p_src + i_srcstride + 8]
    lea p_src, [p_src + 2 * i_srcstride]
    vpblendd ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw ymm4, ymm4, ymm5
    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 1000b
    vmovdqa [p_dst], xmm0
    add p_dst, i_dststride
    jmp .width16_yloop
.width16_yloop_begin_even:
    vmovq xmm1, [p_src]
    vpbroadcastq ymm2, [p_src + 8]
    vpblendd ymm1, ymm1, ymm2, 11110000b
    vmovq xmm2, [p_src + i_srcstride]
    vpbroadcastq ymm3, [p_src + i_srcstride + 8]
    vpblendd ymm2, ymm2, ymm3, 11110000b
    vmovq xmm3, [p_src + 2 * i_srcstride]
    vpbroadcastq ymm4, [p_src + 2 * i_srcstride + 8]
    add p_src, i_srcstride3
    vpblendd ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw ymm2, ymm2, ymm3
    vmovq xmm4, [p_src]
    vpbroadcastq ymm5, [p_src + 8]
    vpblendd ymm4, ymm4, ymm5, 11110000b
    vmovq xmm5, [p_src + i_srcstride]
    vpbroadcastq ymm6, [p_src + i_srcstride + 8]
    lea p_src, [p_src + 2 * i_srcstride]
    vpblendd ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw ymm4, ymm4, ymm5
.width16_yloop:
    ; Eight rows per iteration, produced two at a time: the first row of each
    ; pair via AVX2_FilterVertical2_16px, the second via AVX2_FilterVertical_16px.
    vmovq xmm6, [p_src]
    vpbroadcastq ymm7, [p_src + 8]
    vpblendd ymm6, ymm6, ymm7, 11110000b
    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
    vmovq xmm7, [p_src + i_srcstride]
    vpbroadcastq ymm0, [p_src + i_srcstride + 8]
    vpblendd ymm7, ymm7, ymm0, 11110000b
    vpunpcklbw ymm6, ymm6, ymm7
    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
    vpackuswb ymm1, ymm1, ymm2
    vpermq ymm1, ymm1, 11011000b        ; regroup: one row per 128-bit lane
    vmovdqa [p_dst], xmm1
    vextracti128 [p_dst + i_dststride], ymm1, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovq xmm0, [p_src + 2 * i_srcstride]
    vpbroadcastq ymm1, [p_src + 2 * i_srcstride + 8]
    vpblendd ymm0, ymm0, ymm1, 11110000b
    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
    vmovq xmm1, [p_src + i_srcstride3]
    vpbroadcastq ymm2, [p_src + i_srcstride3 + 8]
    lea p_src, [p_src + 4 * i_srcstride]
    vpblendd ymm1, ymm1, ymm2, 11110000b
    vpunpcklbw ymm0, ymm0, ymm1
    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
    vpackuswb ymm3, ymm3, ymm4
    vpermq ymm3, ymm3, 11011000b
    vmovdqa [p_dst], xmm3
    vextracti128 [p_dst + i_dststride], ymm3, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovq xmm2, [p_src]
    vpbroadcastq ymm3, [p_src + 8]
    vpblendd ymm2, ymm2, ymm3, 11110000b
    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
    vmovq xmm3, [p_src + i_srcstride]
    vpbroadcastq ymm4, [p_src + i_srcstride + 8]
    vpblendd ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw ymm2, ymm2, ymm3
    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb ymm5, ymm5, ymm6
    vpermq ymm5, ymm5, 11011000b
    vmovdqa [p_dst], xmm5
    vextracti128 [p_dst + i_dststride], ymm5, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    vmovq xmm4, [p_src + 2 * i_srcstride]
    vpbroadcastq ymm5, [p_src + 2 * i_srcstride + 8]
    vpblendd ymm4, ymm4, ymm5, 11110000b
    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
    vmovq xmm5, [p_src + i_srcstride3]
    vpbroadcastq ymm6, [p_src + i_srcstride3 + 8]
    lea p_src, [p_src + 4 * i_srcstride]
    vpblendd ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw ymm4, ymm4, ymm5
    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
    vpackuswb ymm7, ymm7, ymm0
    vpermq ymm7, ymm7, 11011000b
    vmovdqa [p_dst], xmm7
    vextracti128 [p_dst + i_dststride], ymm7, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jg .width16_yloop
    vzeroupper
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt


;*******************************************************************************
; void
;      McHorVer20_avx2(const uint8_t *pSrc,
;                      int iSrcStride,
;                      uint8_t *pDst,
;                      int iDstStride,
;                      int iWidth,
;                      int iHeight);
;
; Horizontal filter pass, bytes in and bytes out.
; Dispatch on iWidth: == 8 -> 8-wide loop, > 8 -> 16-wide loop, otherwise
; the 4-wide loop. Rows per iteration: 4 (4- and 8-wide), 2 (16-wide).
; Once the width has been dispatched, the 4- and 8-wide paths reuse the
; i_width register to hold 3 * i_srcstride.
; The 16-wide path stores with vmovdqa, so p_dst rows must be 16-byte
; aligned there.
;*******************************************************************************

WELS_EXTERN McHorVer20_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    ; Shuffle masks and tap table, replicated into both 128-bit lanes.
    vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp i_width, 8
    je .w8_setup
    jg .w16_rows
    ; ---- 4 pixels wide, 4 rows per iteration ----
%xdefine i_srcstride3 i_width
%undef i_width
    lea i_srcstride3, [3 * i_srcstride]
.w4_rows:
    ; Rows 0/1 in the low lanes, rows 2/3 in the high lanes.
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm1, [p_src + i_srcstride - 2]
    vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    lea p_src, [p_src + 4 * i_srcstride]
    AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    vpackuswb ymm0, ymm0, ymm0
    vmovd [p_dst], xmm0
    vpsrlq xmm1, xmm0, 32
    vmovd [p_dst + i_dststride], xmm1
    lea p_dst, [p_dst + 2 * i_dststride]
    vextracti128 xmm0, ymm0, 1
    vmovd [p_dst], xmm0
    vpsrlq xmm1, xmm0, 32
    vmovd [p_dst + i_dststride], xmm1
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 4
    jg .w4_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
    ; ---- 8 pixels wide, 4 rows per iteration ----
.w8_setup:
    lea i_srcstride3, [3 * i_srcstride]
.w8_rows:
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm1, [p_src + i_srcstride - 2]
    vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    lea p_src, [p_src + 4 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    vpackuswb ymm0, ymm0, ymm1
    vmovlps [p_dst], xmm0
    vmovhps [p_dst + i_dststride], xmm0
    lea p_dst, [p_dst + 2 * i_dststride]
    vextracti128 xmm0, ymm0, 1
    vmovlps [p_dst], xmm0
    vmovhps [p_dst + i_dststride], xmm0
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 4
    jg .w8_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
%undef i_srcstride3
    ; ---- 16 pixels wide, 2 rows per iteration ----
.w16_rows:
    ; Left halves of both rows in ymm0, right halves in ymm1.
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm1, [p_src + 6]
    vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128 ymm1, ymm1, [p_src + i_srcstride + 6], 1
    lea p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    vpackuswb ymm0, ymm0, ymm1
    vmovdqa [p_dst], xmm0
    vextracti128 [p_dst + i_dststride], ymm0, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 2
    jg .w16_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;***********************************************************************
; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,
;                                   int32_t iSrcStride,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iWidth,
;                                   int32_t iHeight);
;
; Horizontal filter pass for widths 5, 9 and 17, bytes in and bytes out.
; Dispatch on iWidth: == 9 -> 9-wide loop, > 9 -> 17-wide loop, otherwise
; the 5-wide loop. Rows per iteration: 2 (5-wide), 4 (9- and 17-wide).
; In the 9- and 17-wide paths the trailing pixels of a row are written
; with a 4-byte vmovd at [p_dst + 5] / [p_dst + 13] BEFORE the 8/16-byte
; store of the same row, which then overwrites the overlapping bytes --
; do not reorder those stores.
; The 17-wide path uses vmovdqa for the 16-byte stores, so p_dst rows must
; be 16-byte aligned there.
;***********************************************************************

WELS_EXTERN McHorVer20Width5Or9Or17_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp i_width, 9
    je .w9_setup
    jg .w17_setup
    ; ---- 5 pixels wide, 2 rows per iteration ----
.w5_rows:
    vmovdqu xmm0, [p_src - 2]
    vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
    lea p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb ymm0, ymm0, ymm0
    ; Two overlapping dword stores give 5 bytes per row.
    vpsrlq xmm1, xmm0, 8
    vmovd [p_dst + 1], xmm1
    vmovd [p_dst], xmm0
    add p_dst, i_dststride
    vextracti128 xmm0, ymm0, 1
    vpsrlq xmm1, xmm0, 8
    vmovd [p_dst + 1], xmm1
    vmovd [p_dst], xmm0
    add p_dst, i_dststride
    sub i_height, 2
    jg .w5_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
    ; ---- 9 pixels wide, 4 rows per iteration ----
.w9_setup:
    ; The width has been dispatched; its register now holds 3 * i_srcstride.
%xdefine i_srcstride3 i_width
%undef i_width
    lea i_srcstride3, [3 * i_srcstride]
.w9_rows:
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm4, [p_src + i_srcstride - 2]
    vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128 ymm4, ymm4, [p_src + i_srcstride3 - 2], 1
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpckhqdq ymm3, ymm0, ymm4        ; high halves of all four rows
    AVX2_FilterHorizontal_4px ymm3, ymm2
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb ymm3, ymm3, ymm0
    vmovd [p_dst + 5], xmm3
    vmovhps [p_dst], xmm3
    add p_dst, i_dststride
    AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb ymm4, ymm4, ymm4
    vpsrlq xmm2, xmm3, 32
    vmovd [p_dst + 5], xmm2
    vmovlps [p_dst], xmm4
    add p_dst, i_dststride
    vextracti128 xmm3, ymm3, 1
    vextracti128 xmm4, ymm4, 1
    vmovd [p_dst + 5], xmm3
    vmovhps [p_dst], xmm3
    add p_dst, i_dststride
    vpsrlq xmm2, xmm3, 32
    vmovd [p_dst + 5], xmm2
    vmovlps [p_dst], xmm4
    add p_dst, i_dststride
    sub i_height, 4
    jg .w9_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
    ; ---- 17 pixels wide, 4 rows per iteration ----
.w17_setup:
    lea i_srcstride3, [3 * i_srcstride]
.w17_rows:
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm3, [p_src + 6]
    vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
    vmovdqa ymm4, ymm3                  ; keep the raw right halves of rows 0/1
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb ymm0, ymm0, ymm3
    vmovdqu xmm1, [p_src + 2 * i_srcstride - 2]
    vmovdqu xmm3, [p_src + 2 * i_srcstride + 6]
    vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    vinserti128 ymm3, ymm3, [p_src + i_srcstride3 + 6], 1
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpckhqdq ymm4, ymm4, ymm3        ; trailing bytes of all four rows
    AVX2_FilterHorizontal_4px ymm4, ymm2
    vpackuswb ymm4, ymm4, ymm4
    vmovd [p_dst + 13], xmm4
    vmovdqa [p_dst], xmm0
    add p_dst, i_dststride
    vextracti128 xmm2, ymm4, 1
    vmovd [p_dst + 13], xmm2
    vextracti128 [p_dst], ymm0, 1
    add p_dst, i_dststride
    vpsrlq xmm4, xmm4, 32
    vmovd [p_dst + 13], xmm4            ; ymm4 is free for use as a temporary after this store
    AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4
    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4
    vpackuswb ymm1, ymm1, ymm3
    vmovdqa [p_dst], xmm1
    add p_dst, i_dststride
    vpsrlq xmm2, xmm2, 32
    vmovd [p_dst + 13], xmm2
    vextracti128 [p_dst], ymm1, 1
    add p_dst, i_dststride
    sub i_height, 4
    jg .w17_rows
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef i_srcstride3
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;*******************************************************************************
; void
;      McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,
;                                   int iSrcStride,
;                                   int16_t *pDst,
;                                   int iHeight);
;
; Horizontal filter pass from 8-bit pixels to 16-bit results, 4 results
; (8 bytes) per row, rows stored back to back at pDst.
;   - Reading starts two rows above pSrc and 2 bytes left of each row start.
;   - Rows are produced in groups of four, followed by one unconditional
;     single row, so 4 * k + 1 rows are written (k >= 1).
;   - The 4-row store is an aligned 32-byte store (vmovdqa).
; NOTE(review): the arithmetic lives in AVX2_FilterHorizontalbw_4x4px and
; AVX2_FilterHorizontalbw_16px, defined outside this excerpt; presumably the
; H.264 6-tap half-pel kernel -- confirm there.
;*******************************************************************************

WELS_EXTERN McHorVer20Width4U8ToS16_avx2
; Symbolic names: r0..r3 carry the four arguments in prototype order,
; r4 is a scratch register holding 3 * iSrcStride, and i_dststride is the
; fixed output row size in bytes.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
%define i_srcstride3 r4
%define i_dststride  8
    %assign push_num 0
%ifdef X86_32
    ; r4 is saved by hand on 32-bit builds; push_num records the extra push.
    push r4
    %assign push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; Step back two rows: p_src -= 2 * i_srcstride.
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    lea i_srcstride3, [3 * i_srcstride]
    ; Loop-invariant shuffle masks and multiplier table, one copy per lane.
    vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias the counter so the 4-row loop stops with one row left for the tail.
    sub i_height, 3
.yloop:
    ; ymm0 = rows 0 (low lane) and 2 (high lane); ymm1 = rows 1 and 3.
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm1, [p_src + i_srcstride - 2]
    vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    lea p_src, [p_src + 4 * i_srcstride]
    AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; 32 bytes = four 8-byte output rows; aligned store.
    vmovdqa [p_dst], ymm0
    add p_dst, 4 * i_dststride
    sub i_height, 4
    jg .yloop
    ; Height % 4 remaining single.
    vmovdqu xmm0, [p_src - 2]
    AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    ; Only the low 8 bytes (one 4-value row) are stored.
    vmovlps [p_dst], xmm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r4
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_srcstride3
%undef i_dststride


;***********************************************************************
; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical filter pass from 16-bit rows (4 values = 8 bytes per row, stored
; back to back) to 8-bit pixels, 4 pixels per output row.
; Writes 4 rows when iHeight <= 4, otherwise 8 rows.
; Each 32-byte load covers four consecutive source rows, so one filter
; invocation yields four output rows.
; NOTE(review): ymm6 (source rows 6..9) is loaded before the height check,
; so the 4-row case reads one source row beyond the rows 0..8 that its
; outputs use -- presumably fine for the caller's buffer; confirm.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this excerpt;
; its result is taken from its first operand below, and its last operand is
; presumably scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width4S16ToU8_avx2
; r0..r3 carry the four arguments in prototype order; r4 holds
; 3 * iDstStride; i_srcstride is the fixed source row size in bytes.
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_dststride3 r4
%define i_srcstride  8
    %assign push_num 0
%ifdef X86_32
    push r4
    %assign push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    lea i_dststride3, [3 * i_dststride]
    ; ymmN = source rows N..N+3 (unaligned loads: offsets are multiples of 8).
    vmovdqu ymm0, [p_src + 0 * i_srcstride]
    vmovdqu ymm1, [p_src + 1 * i_srcstride]
    vmovdqu ymm2, [p_src + 2 * i_srcstride]
    vmovdqu ymm3, [p_src + 3 * i_srcstride]
    vmovdqu ymm4, [p_src + 4 * i_srcstride]
    vmovdqu ymm5, [p_src + 5 * i_srcstride]
    vmovdqu ymm6, [p_src + 6 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    ; After packing, each lane holds two 4-byte rows: low lane = rows 0,1;
    ; high lane = rows 2,3.
    vpackuswb ymm0, ymm0, ymm0
    vmovd [p_dst], xmm0
    vpsrlq xmm7, xmm0, 32
    vmovd [p_dst + i_dststride], xmm7
    vextracti128 xmm0, ymm0, 1
    vmovd [p_dst + 2 * i_dststride], xmm0
    vpsrlq xmm7, xmm0, 32
    vmovd [p_dst + i_dststride3], xmm7
    cmp i_height, 4
    jle .done
    ; Second group of four rows: reuse ymm4..ymm6, load source rows 7..12.
    lea p_dst, [p_dst + 4 * i_dststride]
    vmovdqu ymm7, [p_src + 7 * i_srcstride]
    vmovdqu ymm0, [p_src + 8 * i_srcstride]
    vmovdqu ymm1, [p_src + 9 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vpackuswb ymm4, ymm4, ymm4
    vmovd [p_dst], xmm4
    vpsrlq xmm3, xmm4, 32
    vmovd [p_dst + i_dststride], xmm3
    vextracti128 xmm4, ymm4, 1
    vmovd [p_dst + 2 * i_dststride], xmm4
    vpsrlq xmm3, xmm4, 32
    vmovd [p_dst + i_dststride3], xmm3
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r4
%endif
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride
%undef i_dststride3


;*******************************************************************************
; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,
;                                   int iSrcStride,
;                                   int16_t *pDst,
;                                   int iHeight);
;
; Horizontal filter pass from 8-bit pixels to 16-bit results, 8 results
; (16 bytes) per row, rows stored back to back at pDst.
; Reading starts two rows above pSrc and 2 bytes left of each row start.
; Two rows are produced per loop iteration; an odd iHeight gets one extra
; single-row step at the end.
; NOTE(review): AVX2_FilterHorizontalbw_16px is defined outside this excerpt.
;*******************************************************************************

WELS_EXTERN McHorVer20Width8U8ToS16_avx2
; r0..r3 carry the four arguments in prototype order; i_dststride is the
; fixed output row size in bytes.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
%define i_dststride  16
    %assign push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 6
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; Step back two rows: p_src -= 2 * i_srcstride.
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    vbroadcasti128 ymm3, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm4, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias so the counter ends at 0 for odd heights and at -1 for even ones.
    sub i_height, 1
.yloop:
    ; Low lane = current row, high lane = next row.
    vmovdqu xmm0, [p_src - 2]
    vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
    lea p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2
    ; 32 bytes = two output rows; unaligned store.
    vmovdqu [p_dst], ymm0
    add p_dst, 2 * i_dststride
    sub i_height, 2
    jg .yloop
    ; Negative counter: height was even, nothing left.
    jl .done
    ; Zero counter: one last row.
    vmovdqu xmm0, [p_src - 2]
    AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2
    ; Aligned 16-byte store: p_dst must be 16-byte aligned here.
    vmovdqa [p_dst], xmm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
; End of McHorVer20Width8U8ToS16_avx2: release its symbolic names.
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_dststride


;***********************************************************************
; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical filter pass from 16-bit rows (16 bytes per row, stored back to
; back) to 8-bit pixels, 5 pixels per output row.
; Fully unrolled: writes 5 rows when iHeight <= 5, otherwise 9 rows.
; Each 32-byte load covers two consecutive source rows, so one filter
; invocation yields two output rows (low lane / high lane).
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this excerpt;
; its result is taken from its first operand below, and its last operand is
; presumably scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width5S16ToU8_avx2
; r0..r3 carry the four arguments in prototype order; i_srcstride is the
; fixed source row size in bytes.
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_srcstride  16
    %assign push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    ; Even row pairs come from memory: ymm0 = rows 0,1; ymm2 = rows 2,3; ...
    vmovdqu ymm0, [p_src + 0 * i_srcstride]
    vmovdqu ymm2, [p_src + 2 * i_srcstride]
    vmovdqu ymm4, [p_src + 4 * i_srcstride]
    vmovdqu ymm6, [p_src + 6 * i_srcstride]
    ; Odd row pairs are stitched from neighbours: high lane of the first
    ; source with low lane of the second (ymm1 = rows 1,2; ymm3 = rows 3,4;
    ; ymm5 = rows 5,6).
    vperm2i128 ymm1, ymm0, ymm2, 00100001b
    vperm2i128 ymm3, ymm2, ymm4, 00100001b
    vperm2i128 ymm5, ymm4, ymm6, 00100001b
    ; Output rows 0 and 1.
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    vpackuswb ymm0, ymm0, ymm0
    ; Five pixels are written as two overlapping 4-byte stores
    ; (bytes 1..4, then bytes 0..3).
    vpsrlq xmm7, xmm0, 8
    vmovd [p_dst + 1], xmm7
    vmovd [p_dst], xmm0
    add p_dst, i_dststride
    vextracti128 xmm0, ymm0, 1
    vpsrlq xmm7, xmm0, 8
    vmovd [p_dst + 1], xmm7
    vmovd [p_dst], xmm0
    add p_dst, i_dststride
    ; Output rows 2 and 3.
    vmovdqu ymm7, [p_src + 7 * i_srcstride]
    vmovdqu ymm0, [p_src + 8 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
    vpackuswb ymm2, ymm2, ymm2
    vpsrlq xmm1, xmm2, 8
    vmovd [p_dst + 1], xmm1
    vmovd [p_dst], xmm2
    add p_dst, i_dststride
    vextracti128 xmm2, ymm2, 1
    vpsrlq xmm1, xmm2, 8
    vmovd [p_dst + 1], xmm1
    vmovd [p_dst], xmm2
    add p_dst, i_dststride
    ; Output rows 4 and 5; row 5 is only stored when iHeight > 5.
    vmovdqu ymm1, [p_src + 9 * i_srcstride]
    vmovdqu ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vpackuswb ymm4, ymm4, ymm4
    vpsrlq xmm3, xmm4, 8
    vmovd [p_dst + 1], xmm3
    vmovd [p_dst], xmm4
    cmp i_height, 5
    jle .done
    add p_dst, i_dststride
    vextracti128 xmm4, ymm4, 1
    vpsrlq xmm3, xmm4, 8
    vmovd [p_dst + 1], xmm3
    vmovd [p_dst], xmm4
    add p_dst, i_dststride
    ; Output rows 6 and 7. xmm4 only needs source row 12 (16-byte load).
    vmovdqu ymm3, [p_src + 11 * i_srcstride]
    vmovdqu xmm4, [p_src + 12 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
    vpackuswb ymm6, ymm6, ymm6
    vpsrlq xmm5, xmm6, 8
    vmovd [p_dst + 1], xmm5
    vmovd [p_dst], xmm6
    add p_dst, i_dststride
    vextracti128 xmm6, ymm6, 1
    vpsrlq xmm5, xmm6, 8
    vmovd [p_dst + 1], xmm5
    vmovd [p_dst], xmm6
    add p_dst, i_dststride
    ; Output row 8: single row, 128-bit operands (low lanes = rows 8..13).
    vmovdqu xmm5, [p_src + 13 * i_srcstride]
    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    vpackuswb xmm0, xmm0, xmm0
    vpsrlq xmm7, xmm0, 8
    vmovd [p_dst + 1], xmm7
    vmovd [p_dst], xmm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride


;***********************************************************************
; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical filter pass from 16-bit rows (16 bytes per row, stored back to
; back) to 8-bit pixels, 8 pixels per output row.
; Output is written four rows at a time; each loop iteration covers eight
; rows and leaves early after the first four when the remaining count is
; <= 4.
; Even row pairs are loaded with vmovdqa, so pSrc must be 32-byte aligned;
; odd row pairs use vmovdqu.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this excerpt;
; its result is taken from its first operand below, and its last operand is
; presumably scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width8S16ToU8_avx2
; r0..r3 carry the four arguments in prototype order; r4 holds
; 3 * iDstStride; i_srcstride is the fixed source row size in bytes.
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_dststride3 r4
%define i_srcstride  16
    %assign push_num 0
%ifdef X86_32
    push r4
    %assign push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    lea i_dststride3, [3 * i_dststride]
    ; Prime the window: ymm0 = rows 0,1; ymm2 = rows 2,3; ymm4 = rows 4,5;
    ; ymm1 / ymm3 = rows 1,2 / 3,4 stitched from their neighbours.
    vmovdqa ymm0, [p_src + 0 * i_srcstride]
    vmovdqa ymm2, [p_src + 2 * i_srcstride]
    vmovdqa ymm4, [p_src + 4 * i_srcstride]
    vperm2i128 ymm1, ymm0, ymm2, 00100001b
    vperm2i128 ymm3, ymm2, ymm4, 00100001b
.yloop:
    vmovdqa ymm6, [p_src + 6 * i_srcstride]
    vperm2i128 ymm5, ymm4, ymm6, 00100001b
    ; Output rows 0,1 (in ymm0) and 2,3 (in ymm2).
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    vmovdqu ymm7, [p_src + 7 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
    ; Lane-wise pack: xmm1 = rows 0 | 2, high lane (xmm2 below) = rows 1 | 3.
    vpackuswb ymm1, ymm0, ymm2
    vmovdqa ymm0, [p_src + 8 * i_srcstride]
    vextracti128 xmm2, ymm1, 1
    vmovlps [p_dst], xmm1
    vmovlps [p_dst + i_dststride], xmm2
    vmovhps [p_dst + 2 * i_dststride], xmm1
    vmovhps [p_dst + i_dststride3], xmm2
    cmp i_height, 4
    jle .done
    lea p_dst, [p_dst + 4 * i_dststride]
    ; Output rows 4..7, same scheme with the register window rotated.
    vmovdqu ymm1, [p_src + 9 * i_srcstride]
    vmovdqa ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vmovdqu ymm3, [p_src + 11 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
    vpackuswb ymm5, ymm4, ymm6
    vmovdqa ymm4, [p_src + 12 * i_srcstride]
    ; Advance eight source rows; ymm0..ymm4 now hold the next window.
    add p_src, 8 * i_srcstride
    vextracti128 xmm6, ymm5, 1
    vmovlps [p_dst], xmm5
    vmovlps [p_dst + i_dststride], xmm6
    vmovhps [p_dst + 2 * i_dststride], xmm5
    vmovhps [p_dst + i_dststride3], xmm6
    lea p_dst, [p_dst + 4 * i_dststride]
    sub i_height, 8
    jg .yloop
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r4
%endif
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_dststride3
%undef i_srcstride


;*******************************************************************************
; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iHeight);
;
; Horizontal filter pass from 8-bit pixels to 16-bit results, 16 results
; (32 bytes) per row, rows stored back to back at pDst.
; Reading starts two rows above pSrc and 2 bytes left of each row start.
; Two rows are produced per loop iteration; an odd iHeight gets one extra
; single-row step at the end.
; All stores are aligned 32-byte stores (vmovdqa), so pDst must be 32-byte
; aligned.
; NOTE(review): AVX2_FilterHorizontalbw_16px is defined outside this excerpt.
;*******************************************************************************

WELS_EXTERN McHorVer20Width16U8ToS16_avx2
; r0..r3 carry the four arguments in prototype order; i_dststride is the
; fixed output row size in bytes.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
%define i_dststride  32
    %assign push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; Step back two rows: p_src -= 2 * i_srcstride.
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias so the counter ends at 0 for odd heights and at -1 for even ones.
    sub i_height, 1
.yloop:
    ; One row per register: low lane from [row - 2], high lane from [row + 6].
    vmovdqu xmm0, [p_src - 2]
    vinserti128 ymm0, ymm0, [p_src + 6], 1
    vmovdqu xmm1, [p_src + i_srcstride - 2]
    vinserti128 ymm1, ymm1, [p_src + i_srcstride + 6], 1
    lea p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    vmovdqa [p_dst], ymm0
    AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    vmovdqa [p_dst + i_dststride], ymm1
    add p_dst, 2 * i_dststride
    sub i_height, 2
    jg .yloop
    ; Negative counter: height was even, nothing left.
    jl .done
    ; Zero counter: one last row.
    vmovdqu xmm0, [p_src - 2]
    vinserti128 ymm0, ymm0, [p_src + 6], 1
    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2
    vmovdqa [p_dst], ymm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_dststride


;***********************************************************************
; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical filter pass from 16-bit rows (32 bytes per row, stored back to
; back) to 8-bit pixels, 9 pixels per output row.
; Each loop iteration covers eight rows and leaves early after five rows
; when the biased counter (iHeight - 1) is <= 4; after the loop one more
; single row is written.
; All loads are aligned 32-byte loads (vmovdqa), so pSrc must be 32-byte
; aligned.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this excerpt;
; its result is taken from its first operand below, and its last operand is
; presumably scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width9S16ToU8_avx2
; r0..r3 carry the four arguments in prototype order; i_srcstride is the
; fixed source row size in bytes (one ymm register per row).
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_srcstride  32
    %assign push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    ; Prime the window with source rows 0..4.
    vmovdqa ymm0, [p_src + 0 * i_srcstride]
    vmovdqa ymm1, [p_src + 1 * i_srcstride]
    vmovdqa ymm2, [p_src + 2 * i_srcstride]
    vmovdqa ymm3, [p_src + 3 * i_srcstride]
    vmovdqa ymm4, [p_src + 4 * i_srcstride]
    sub i_height, 1
.height_loop:
    ; Output rows 0 and 1.
    vmovdqa ymm5, [p_src + 5 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vmovdqa ymm6, [p_src + 6 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
    vmovdqa ymm7, [p_src + 7 * i_srcstride]
    ; Lane-wise pack: xmm0 = pixels 0..7 of both rows (low / high qword),
    ; high lane = pixels 8..15 of both rows.
    vpackuswb ymm0, ymm0, ymm1
    vextracti128 xmm1, ymm0, 1
    ; Move pixel 8 into the top byte of each qword so that the 8-byte store
    ; at [p_dst + 1] drops it at p_dst + 8. The following store at [p_dst]
    ; then overwrites bytes 0..7 with pixels 0..7 -- store order matters.
    vpsllq xmm1, xmm1, 56
    vmovlps [p_dst + 1], xmm1
    vmovlps [p_dst], xmm0
    add p_dst, i_dststride
    vmovhps [p_dst + 1], xmm1
    vmovhps [p_dst], xmm0
    add p_dst, i_dststride
    ; Output rows 2 and 3.
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
    vmovdqa ymm0, [p_src + 8 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vextracti128 xmm3, ymm2, 1
    vpsllq xmm3, xmm3, 56
    vmovlps [p_dst + 1], xmm3
    vmovlps [p_dst], xmm2
    add p_dst, i_dststride
    vmovhps [p_dst + 1], xmm3
    vmovhps [p_dst], xmm2
    add p_dst, i_dststride
    ; Output rows 4 and 5; row 5 is only stored when more rows remain.
    vmovdqa ymm1, [p_src + 9 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
    vmovdqa ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
    vmovdqa ymm3, [p_src + 11 * i_srcstride]
    vpackuswb ymm4, ymm4, ymm5
    vextracti128 xmm5, ymm4, 1
    vpsllq xmm5, xmm5, 56
    vmovlps [p_dst + 1], xmm5
    vmovlps [p_dst], xmm4
    cmp i_height, 4
    jle .done
    add p_dst, i_dststride
    vmovhps [p_dst + 1], xmm5
    vmovhps [p_dst], xmm4
    add p_dst, i_dststride
    ; Output rows 6 and 7.
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
    vmovdqa ymm4, [p_src + 12 * i_srcstride]
    ; Advance eight source rows; ymm0..ymm4 now hold the next window.
    add p_src, 8 * i_srcstride
    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
    vpackuswb ymm6, ymm6, ymm7
    vextracti128 xmm7, ymm6, 1
    vpsllq xmm7, xmm7, 56
    vmovlps [p_dst + 1], xmm7
    vmovlps [p_dst], xmm6
    add p_dst, i_dststride
    vmovhps [p_dst + 1], xmm7
    vmovhps [p_dst], xmm6
    add p_dst, i_dststride
    sub i_height, 8
    jg .height_loop
    ; One trailing row after the 8-row groups.
    vmovdqa ymm5, [p_src + 5 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vpackuswb ymm0, ymm0, ymm0
    vextracti128 xmm1, ymm0, 1
    vpsllq xmm1, xmm1, 56
    vmovlps [p_dst + 1], xmm1
    vmovlps [p_dst], xmm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height


;*******************************************************************************
; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iHeight);
;
; Horizontal filter pass from 8-bit pixels to 16-bit results, 17 results per
; row, with a fixed output row pitch of 64 bytes.
; Reading starts two rows above pSrc and 2 bytes left of each row start.
; Rows are produced in groups of four, followed by one unconditional group
; of two, so 4 * k + 2 rows are written (k >= 1).
; Results 0..15 of a row come from two 8-result halves; result 16 comes from
; a separate 4-result filter whose 8-byte store at byte offset 26 is issued
; BEFORE the aligned store of results 8..15, which then overwrites bytes
; 26..31 and leaves only result 16 (bytes 32..33). Store order matters.
; NOTE(review): AVX2_FilterHorizontalbw_16px / AVX2_FilterHorizontalbw_4px
; are defined outside this excerpt.
;*******************************************************************************

WELS_EXTERN McHorVer20Width17U8ToS16_avx2
; r0..r3 carry the four arguments in prototype order; r4 holds
; 3 * iSrcStride; i_dststride is the fixed output row pitch in bytes.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
%define i_srcstride3 r4
%define i_dststride  64
    %assign push_num 0
%ifdef X86_32
    push r4
    %assign push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ; Step back two rows: p_src -= 2 * i_srcstride.
    sub p_src, i_srcstride
    sub p_src, i_srcstride
    lea i_srcstride3, [3 * i_srcstride]
    vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
    vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
    vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias the counter so the 4-row loop stops with two rows left.
    sub i_height, 3
.yloop:
    ; ymm0 = left halves of rows 0 / 1, ymm3 = right halves of rows 0 / 1.
    vmovdqu xmm0, [p_src - 2]
    vmovdqu xmm3, [p_src + 6]
    vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
    ; Keep the unfiltered right halves for the result-16 computation.
    vmovdqa ymm4, ymm3
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    vmovdqa [p_dst], xmm0
    vextracti128 [p_dst + i_dststride], ymm0, 1
    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    ; ymm1 = left halves of rows 2 / 3, ymm0 = right halves of rows 2 / 3.
    vmovdqu xmm1, [p_src + 2 * i_srcstride - 2]
    vmovdqu xmm0, [p_src + 2 * i_srcstride + 6]
    vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    vinserti128 ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
    lea p_src, [p_src + 4 * i_srcstride]
    ; Gather the upper 8 source bytes of the right halves of all four rows
    ; (rows 0,2 in the low lane, rows 1,3 in the high lane).
    vpunpckhqdq ymm4, ymm4, ymm0
    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
    ; Row 0: tail store first, then results 8..15 on top of it.
    vmovlps [p_dst + 26], xmm4
    vmovdqa [p_dst + 16], xmm3
    ; Row 1.
    vextracti128 xmm2, ymm4, 1
    vmovlps [p_dst + i_dststride + 26], xmm2
    vextracti128 [p_dst + i_dststride + 16], ymm3, 1
    ; Row 2: tail store first, then both halves.
    vmovhps [p_dst + 2 * i_dststride + 26], xmm4
    AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4
    vmovdqa [p_dst + 2 * i_dststride], xmm1
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4
    vmovdqa [p_dst + 2 * i_dststride + 16], xmm0
    ; Row 3.
    vextracti128 [p_dst + 3 * i_dststride], ymm1, 1
    vmovhps [p_dst + 3 * i_dststride + 26], xmm2
    vextracti128 [p_dst + 3 * i_dststride + 16], ymm0, 1
    add p_dst, 4 * i_dststride
    sub i_height, 4
    jg .yloop
    ; Handle remaining 2 lines after 4x unrolled loop.
    ; Here each register holds one whole row: low lane from [row - 2],
    ; high lane from [row + 6].
    vmovdqu xmm0, [p_src - 2]
    vinserti128 ymm0, ymm0, [p_src + 6], 1
    vmovdqu xmm3, [p_src + i_srcstride - 2]
    vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
    vpunpckhqdq ymm4, ymm0, ymm3
    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    ; The high lane of ymm4 carries the row tails (low qword = first row,
    ; high qword = second row); tail stores again precede the row stores.
    vextracti128 xmm4, ymm4, 1
    vmovlps [p_dst + 26], xmm4
    vmovdqa [p_dst], ymm0
    vmovhps [p_dst + i_dststride + 26], xmm4
    vmovdqa [p_dst + i_dststride], ymm3
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r4
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3


;***********************************************************************
; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,
;                                        int32_t iSrcStride,
;                                        uint8_t *pDst,
;                                        int32_t iDstStride,
;                                        int32_t iWidth,
;                                        int32_t iHeight);
;
; Vertical filter pass from 16-bit rows to 8-bit pixels, 16 or 17 pixels per
; output row. iSrcStride is used as a byte offset between source rows.
;   - Odd iWidth: the extra rightmost column is filtered first as a vertical
;     strip, eight rows per pass plus one final row. Each result byte is
;     written as the LAST byte of a 4- or 8-byte store; the other bytes of
;     those stores land on columns to the left, which the 16-wide pass
;     rewrites afterwards.
;   - Then (always): a 16-wide pass, eight rows per loop iteration, plus one
;     trailing row when the biased counter (iHeight - 1) reaches exactly 0.
; The 16-wide pass uses aligned accesses (vmovdqa): 32-byte loads from each
; source row and 16-byte stores to each destination row.
; NOTE(review): AVX2_FilterVerticalw_16px is defined outside this excerpt;
; its result is taken from its first operand below, and its last operand is
; presumably scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2
; r0..r5 carry the six arguments in prototype order, r6 holds
; 3 * iSrcStride. Under X86_32_PICASM the width is read from its stack slot
; instead, because r4 is handed to INIT_X86_32_PIC_NOPRESERVE below.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%ifdef X86_32_PICASM
%define i_width      dword arg5
%else
%define i_width      r4
%endif
%define i_height     r5
%define i_srcstride3 r6
    %assign push_num 0
%ifdef X86_32
    push r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub i_height, 1
    lea i_srcstride3, [3 * i_srcstride]
    ; Even width: go straight to the 16-wide pass.
    test i_width, 1
    jz .align_begin
    ; Odd width: handle the extra column first. Save the values the 16-wide
    ; pass needs; push_num tracks the extra stack slots.
    push i_height
    push p_src
    push p_dst
    %assign push_num push_num + 3
    ; p_src -> last 16-bit sample of the row (index iWidth - 1);
    ; p_dst -> one byte past the last output pixel of the row.
%ifdef X86_32_PICASM
    add p_src, i_width
    add p_src, i_width
    sub p_src, 2
%else
    lea p_src, [p_src + 2 * i_width - 2]
%endif
    add p_dst, i_width
    ; Gather that column's samples from source rows 0..7 into the eight
    ; words of xmm0; p_src ends up on source row 7.
    vmovd xmm0, [p_src]
    vpunpcklwd xmm0, xmm0, [p_src + i_srcstride]
    vmovd xmm1, [p_src + 2 * i_srcstride]
    add p_src, i_srcstride3
    vpunpcklwd xmm1, xmm1, [p_src]
    vpunpckldq xmm0, xmm0, xmm1
    vmovd xmm1, [p_src + i_srcstride]
    vpunpcklwd xmm1, xmm1, [p_src + 2 * i_srcstride]
    vmovd xmm2, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklwd xmm2, xmm2, [p_src]
    vpunpckldq xmm1, xmm1, xmm2
    vpunpcklqdq xmm0, xmm0, xmm1
.height_loop_unalign:
    ; Build the five shifted-by-one-row windows: each vpalignr drops the
    ; oldest sample and appends the newly loaded one.
    vmovd xmm1, [p_src + i_srcstride]
    vpalignr xmm1, xmm1, xmm0, 2
    vmovd xmm2, [p_src + 2 * i_srcstride]
    vpalignr xmm2, xmm2, xmm1, 2
    vmovd xmm3, [p_src + i_srcstride3]
    vpalignr xmm3, xmm3, xmm2, 2
    lea p_src, [p_src + 4 * i_srcstride]
    vmovd xmm4, [p_src]
    vpalignr xmm4, xmm4, xmm3, 2
    vmovd xmm5, [p_src + i_srcstride]
    vpalignr xmm5, xmm5, xmm4, 2
    ; Word i of the result = output row i of this 8-row group.
    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    vpackuswb xmm0, xmm0, xmm0
    ; Shifting each dword left moves one result byte into the dword's top
    ; byte; dword 0 serves rows 0..3 and dword 1 serves rows 4..7.
    ; Rows 0 and 4.
    vpslld xmm6, xmm0, 24
    vmovd [p_dst - 4], xmm6
    vmovlps [p_dst + 4 * i_dststride - 8], xmm6
    add p_dst, i_dststride
    ; Rows 1 and 5.
    vpslld xmm6, xmm0, 16
    vmovd [p_dst - 4], xmm6
    vmovlps [p_dst + 4 * i_dststride - 8], xmm6
    add p_dst, i_dststride
    ; Rows 2 and 3 (row 3 needs no shift: its byte is already on top).
    vpslld xmm6, xmm0, 8
    vmovd [p_dst - 4], xmm6
    vmovd [p_dst + i_dststride - 4], xmm0
    ; Rows 6 and 7.
    lea p_dst, [p_dst + 4 * i_dststride]
    vmovlps [p_dst - 8], xmm6
    vmovlps [p_dst + i_dststride - 8], xmm0
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jle .height_loop_unalign_exit
    ; Rebuild xmm0 for the next group from xmm5 plus three new samples.
    vmovd xmm1, [p_src + 2 * i_srcstride]
    vpalignr xmm1, xmm1, xmm5, 2
    vmovd xmm0, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpunpcklwd xmm0, xmm0, [p_src]
    vpalignr xmm0, xmm0, xmm1, 4
    jmp .height_loop_unalign
.height_loop_unalign_exit:
    ; Final row of the strip: the broadcast puts the sample at
    ; [p_src + 2 * i_srcstride] into word 7 of xmm6, and only byte 7 of the
    ; packed result (the last byte of the 8-byte store) lands on the extra
    ; column.
    vpbroadcastq xmm6, [p_src + 2 * i_srcstride - 6]
    AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    vpackuswb xmm1, xmm1, xmm1
    vmovlps [p_dst - 8], xmm1
    pop p_dst
    pop p_src
    pop i_height
    %assign push_num push_num - 3
.align_begin:
    ; 16-wide pass. Prime the window with source rows 0..4; p_src ends up
    ; on row 4.
    vmovdqa ymm0, [p_src]
    vmovdqa ymm1, [p_src + i_srcstride]
    vmovdqa ymm2, [p_src + 2 * i_srcstride]
    vmovdqa ymm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vmovdqa ymm4, [p_src]
.height_loop:
    ; Output rows 0 and 1.
    vmovdqa ymm5, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vmovdqa ymm6, [p_src + 2 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
    vmovdqa ymm7, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    ; The pack works per lane, leaving the two rows interleaved by qword;
    ; vpermq (qword order 0,2,1,3) puts the first row in the low lane and
    ; the second row in the high lane.
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 11011000b
    vmovdqa [p_dst], xmm0
    vextracti128 [p_dst + i_dststride], ymm0, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    ; Output rows 2 and 3.
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
    vmovdqa ymm0, [p_src]
    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vpermq ymm2, ymm2, 11011000b
    vmovdqa [p_dst], xmm2
    vextracti128 [p_dst + i_dststride], ymm2, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    ; Output rows 4 and 5.
    vmovdqa ymm1, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
    vmovdqa ymm2, [p_src + 2 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
    vmovdqa ymm3, [p_src + i_srcstride3]
    lea p_src, [p_src + 4 * i_srcstride]
    vpackuswb ymm4, ymm4, ymm5
    vpermq ymm4, ymm4, 11011000b
    vmovdqa [p_dst], xmm4
    vextracti128 [p_dst + i_dststride], ymm4, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    ; Output rows 6 and 7; ymm0..ymm4 then hold the next window.
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
    vmovdqa ymm4, [p_src]
    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
    vpackuswb ymm6, ymm6, ymm7
    vpermq ymm6, ymm6, 11011000b
    vmovdqa [p_dst], xmm6
    vextracti128 [p_dst + i_dststride], ymm6, 1
    lea p_dst, [p_dst + 2 * i_dststride]
    sub i_height, 8
    jg .height_loop
    ; Negative counter: all rows done. Zero: one trailing row remains.
    jl .done
    vmovdqa ymm5, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 11011000b
    vmovdqa [p_dst], xmm0
.done:
    vzeroupper
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3

%endif ; HAVE_AVX2