;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
30;* 31;* 32;* upsampling.asm 33;* 34;* Abstract 35;* SIMD for pixel domain down sampling 36;* 37;* History 38;* 10/22/2009 Created 39;* 40;*************************************************************************/ 41%include "asm_inc.asm" 42 43%ifdef __NASM_VER__ 44 %use smartalign 45%endif 46 47;*********************************************************************** 48; Macros and other preprocessor constants 49;*********************************************************************** 50 51 52;*********************************************************************** 53; Some constants 54;*********************************************************************** 55 56;*********************************************************************** 57; Local Data (Read Only) 58;*********************************************************************** 59 60%ifdef X86_32_PICASM 61SECTION .text align=32 62%else 63SECTION .rodata align=32 64%endif 65 66;*********************************************************************** 67; Various memory constants (trigonometric values or rounding values) 68;*********************************************************************** 69 70ALIGN 32 71%ifndef X86_32_PICASM 72db80h_256: 73 times 32 db 80h 74shufb_0000000088888888: 75 times 8 db 0 76 times 8 db 8 77shufb_000044448888CCCC: 78 times 4 db 0 79 times 4 db 4 80 times 4 db 8 81 times 4 db 12 82%endif 83shufb_mask_low: 84 db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h 85shufb_mask_high: 86 db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h 87add_extra_half: 88 dd 16384,0,0,0 89 90shufb_mask_quarter: 91db 00h, 04h, 08h, 0ch, 80h, 80h, 80h, 80h, 01h, 05h, 09h, 0dh, 80h, 80h, 80h, 80h 92 93shufb_mask_onethird_low_1: 94db 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h 95shufb_mask_onethird_low_2: 96db 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh, 80h, 80h, 80h, 80h, 80h 
97shufb_mask_onethird_low_3: 98db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 01h, 04h, 07h, 0ah, 0dh 99 100shufb_mask_onethird_high_1: 101db 01h, 04h, 07h, 0ah, 0dh, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h 102shufb_mask_onethird_high_2: 103db 80h, 80h, 80h, 80h, 80h, 00h, 03h, 06h, 09h, 0ch, 0fh, 80h, 80h, 80h, 80h, 80h 104shufb_mask_onethird_high_3: 105db 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 02h, 05h, 08h, 0bh, 0eh 106 107;*********************************************************************** 108; Code 109;*********************************************************************** 110 111SECTION .text 112 113;*********************************************************************** 114; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride, 115; unsigned char* pSrc, const int iSrcStride, 116; const int iSrcWidth, const int iSrcHeight ); 117;*********************************************************************** 118WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse 119%ifdef X86_32 120 push r6 121 %assign push_num 1 122%else 123 %assign push_num 0 124%endif 125 LOAD_6_PARA 126 SIGN_EXTENSION r1, r1d 127 SIGN_EXTENSION r3, r3d 128 SIGN_EXTENSION r4, r4d 129 SIGN_EXTENSION r5, r5d 130 131%ifndef X86_32 132 push r12 133 mov r12, r4 134%endif 135 sar r5, $01 ; iSrcHeight >> 1 136 137.yloops1: 138%ifdef X86_32 139 mov r4, arg5 140%else 141 mov r4, r12 142%endif 143 sar r4, $01 ; iSrcWidth >> 1 144 mov r6, r4 ; iDstWidth restored at ebx 145 sar r4, $04 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb 146 neg r6 ; - (iSrcWidth >> 1) 147 ; each loop = source bandwidth: 32 bytes 148.xloops1: 149 ; 1st part horizonal loop: x16 bytes 150 ; mem hi<- ->lo 151 ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E 152 ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M 153 ;=> target: 154 ;: H G F E D C B A, P O N M L K J I 155 ;: h g f e d c b a, p o n m l k j i 156 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 157 movq mm0, [r2] ; 1st pSrc line 158 movq mm1, [r2+8] ; 1st pSrc line + 8 159 movq mm2, [r2+r3] ; 2nd pSrc line 160 movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 161 162 ; to handle mm0, mm1, mm2, mm3 163 pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B 164 pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B 165 punpcklbw mm4, mm5 ; d c D C b a B A 166 pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 167 168 pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B 169 pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B 170 punpcklbw mm5, mm6 ; h g H G f e F E 171 pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 172 173 pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B 174 pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B 175 punpcklbw mm6, mm7 ; l k L K j i J I 176 pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 177 178 pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B 179 pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B 180 punpcklbw mm7, mm0 ; p o P O n m N M 181 pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 182 183 ; to handle mm4, mm5, mm6, mm7 184 movq mm0, mm4 ; 185 punpckldq mm0, mm5 ; H G F E D C B A 186 punpckhdq mm4, mm5 ; h g f e d c b a 187 188 movq mm1, mm6 189 punpckldq mm1, mm7 ; P O N M L K J I 190 punpckhdq mm6, mm7 ; p o n m l k j i 191 192 ; avg within MB horizon width (16 x 2 lines) 193 pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 194 pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 195 pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once 196 197 ; 2nd part horizonal loop: x16 bytes 198 ; mem hi<- ->lo 199 ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E 200 ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M 201 ;=> target: 202 ;: H G F E D C B A, P O N M L K J I 203 ;: h g f e d c b a, p o n m l k j i 204 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 205 movq mm1, 
[r2+16] ; 1st pSrc line + 16 206 movq mm2, [r2+24] ; 1st pSrc line + 24 207 movq mm3, [r2+r3+16] ; 2nd pSrc line + 16 208 movq mm4, [r2+r3+24] ; 2nd pSrc line + 24 209 210 ; to handle mm1, mm2, mm3, mm4 211 pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B 212 pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B 213 punpcklbw mm5, mm6 ; d c D C b a B A 214 pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5 215 216 pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B 217 pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B 218 punpcklbw mm6, mm7 ; h g H G f e F E 219 pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6 220 221 pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B 222 pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B 223 punpcklbw mm7, mm1 ; l k L K j i J I 224 pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7 225 226 pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B 227 pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B 228 punpcklbw mm1, mm2 ; p o P O n m N M 229 pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1 230 231 ; to handle mm5, mm6, mm7, mm1 232 movq mm2, mm5 233 punpckldq mm2, mm6 ; H G F E D C B A 234 punpckhdq mm5, mm6 ; h g f e d c b a 235 236 movq mm3, mm7 237 punpckldq mm3, mm1 ; P O N M L K J I 238 punpckhdq mm7, mm1 ; p o n m l k j i 239 240 ; avg within MB horizon width (16 x 2 lines) 241 pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 242 pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 243 pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part 244 245 movq [r0 ], mm0 246 movq [r0+8], mm2 247 248 ; next SMB 249 lea r2, [r2+32] 250 lea r0, [r0+16] 251 252 dec r4 253 jg near .xloops1 254 255 ; next line 256 lea r2, [r2+2*r3] ; next end of lines 257 lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] 258 lea r0, [r0+r1] 259 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 260 261 dec r5 262 jg near .yloops1 263 264 WELSEMMS 265%ifndef X86_32 266 pop r12 
267%endif 268 LOAD_6_PARA_POP 269%ifdef X86_32 270 pop r6 271%endif 272 ret 273 274;*********************************************************************** 275; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride, 276; unsigned char* pSrc, const int iSrcStride, 277; const int iSrcWidth, const int iSrcHeight ); 278;*********************************************************************** 279WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse 280%ifdef X86_32 281 push r6 282 %assign push_num 1 283%else 284 %assign push_num 0 285%endif 286 LOAD_6_PARA 287 SIGN_EXTENSION r1, r1d 288 SIGN_EXTENSION r3, r3d 289 SIGN_EXTENSION r4, r4d 290 SIGN_EXTENSION r5, r5d 291 292%ifndef X86_32 293 push r12 294 mov r12, r4 295%endif 296 sar r5, $01 ; iSrcHeight >> 1 297 298.yloops2: 299%ifdef X86_32 300 mov r4, arg5 301%else 302 mov r4, r12 303%endif 304 sar r4, $01 ; iSrcWidth >> 1 305 mov r6, r4 ; iDstWidth restored at ebx 306 sar r4, $03 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb 307 neg r6 ; - (iSrcWidth >> 1) 308 ; each loop = source bandwidth: 16 bytes 309.xloops2: 310 ; 1st part horizonal loop: x16 bytes 311 ; mem hi<- ->lo 312 ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E 313 ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M 314 ;=> target: 315 ;: H G F E D C B A, P O N M L K J I 316 ;: h g f e d c b a, p o n m l k j i 317 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 318 movq mm0, [r2] ; 1st pSrc line 319 movq mm1, [r2+8] ; 1st pSrc line + 8 320 movq mm2, [r2+r3] ; 2nd pSrc line 321 movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 322 323 ; to handle mm0, mm1, mm2, mm3 324 pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B 325 pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B 326 punpcklbw mm4, mm5 ; d c D C b a B A 327 pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4 328 329 pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B 330 pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B 331 punpcklbw mm5, mm6 ; h g H G f e F E 
332 pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5 333 334 pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B 335 pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B 336 punpcklbw mm6, mm7 ; l k L K j i J I 337 pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6 338 339 pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B 340 pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B 341 punpcklbw mm7, mm0 ; p o P O n m N M 342 pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7 343 344 ; to handle mm4, mm5, mm6, mm7 345 movq mm0, mm4 ; 346 punpckldq mm0, mm5 ; H G F E D C B A 347 punpckhdq mm4, mm5 ; h g f e d c b a 348 349 movq mm1, mm6 350 punpckldq mm1, mm7 ; P O N M L K J I 351 punpckhdq mm6, mm7 ; p o n m l k j i 352 353 ; avg within MB horizon width (16 x 2 lines) 354 pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 355 pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 356 pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once 357 358 movq [r0 ], mm0 359 360 ; next SMB 361 lea r2, [r2+16] 362 lea r0, [r0+8] 363 364 dec r4 365 jg near .xloops2 366 367 ; next line 368 lea r2, [r2+2*r3] ; next end of lines 369 lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] 370 lea r0, [r0+r1] 371 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 372 373 dec r5 374 jg near .yloops2 375 376 WELSEMMS 377%ifndef X86_32 378 pop r12 379%endif 380 LOAD_6_PARA_POP 381%ifdef X86_32 382 pop r6 383%endif 384 ret 385 386;*********************************************************************** 387; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride, 388; unsigned char* pSrc, const int iSrcStride, 389; const int iSrcWidth, const int iSrcHeight ); 390;*********************************************************************** 391WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse 392%ifdef X86_32 393 push r6 394 %assign push_num 1 395%else 396 %assign push_num 0 
397%endif 398 LOAD_6_PARA 399 SIGN_EXTENSION r1, r1d 400 SIGN_EXTENSION r3, r3d 401 SIGN_EXTENSION r4, r4d 402 SIGN_EXTENSION r5, r5d 403 404%ifndef X86_32 405 push r12 406 mov r12, r4 407%endif 408 sar r5, $01 ; iSrcHeight >> 1 409 410.yloops3: 411%ifdef X86_32 412 mov r4, arg5 413%else 414 mov r4, r12 415%endif 416 sar r4, $01 ; iSrcWidth >> 1 417 mov r6, r4 ; iDstWidth restored at ebx 418 sar r4, $02 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb 419 neg r6 ; - (iSrcWidth >> 1) 420 ; each loop = source bandwidth: 8 bytes 421.xloops3: 422 ; 1st part horizonal loop: x8 bytes 423 ; mem hi<- ->lo 424 ;1st Line Src: mm0: d D c C b B a A 425 ;2nd Line Src: mm1: h H g G f F e E 426 ;=> target: 427 ;: H G F E D C B A 428 ;: h g f e d c b a 429 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 430 movq mm0, [r2] ; 1st pSrc line 431 movq mm1, [r2+r3] ; 2nd pSrc line 432 433 ; to handle mm0, mm1, mm2, mm3 434 pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B 435 pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B 436 punpcklbw mm2, mm3 ; d c D C b a B A 437 pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4 438 439 pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B 440 pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B 441 punpcklbw mm4, mm5 ; h g H G f e F E 442 pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5 443 444 ; to handle mm2, mm4 445 movq mm0, mm2 ; 446 punpckldq mm0, mm4 ; H G F E D C B A 447 punpckhdq mm2, mm4 ; h g f e d c b a 448 449 ; avg within MB horizon width (16 x 2 lines) 450 pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2 451 pshufw mm1, mm0, 04eh ; 01001110 B 452 pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once 453 454 movd [r0], mm0 455 456 ; next unit 457 lea r2, [r2+8] 458 lea r0, [r0+4] 459 460 dec r4 461 jg near .xloops3 462 463 ; next line 464 lea r2, [r2+2*r3] ; next end of lines 465 lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] 466 lea r0, 
[r0+r1] 467 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 468 469 dec r5 470 jg near .yloops3 471 472 WELSEMMS 473%ifndef X86_32 474 pop r12 475%endif 476 LOAD_6_PARA_POP 477%ifdef X86_32 478 pop r6 479%endif 480 ret 481 482 483 484;*********************************************************************** 485; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride, 486; unsigned char* pSrc, const int iSrcStride, 487; const int iSrcWidth, const int iSrcHeight ); 488;*********************************************************************** 489WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3 490%ifdef X86_32 491 push r6 492 %assign push_num 1 493%else 494 %assign push_num 0 495%endif 496 LOAD_6_PARA 497 PUSH_XMM 4 498 SIGN_EXTENSION r1, r1d 499 SIGN_EXTENSION r3, r3d 500 SIGN_EXTENSION r4, r4d 501 SIGN_EXTENSION r5, r5d 502 503%ifndef X86_32 504 push r12 505 mov r12, r4 506%endif 507 sar r5, $01 ; iSrcHeight >> 1 508 509 WELS_DB1 xmm3 510 WELS_Zero xmm2 511 sar r4, $01 ; iSrcWidth >> 1 512 add r0, r4 ; pDst += iSrcWidth >> 1 513 514.yloops4: 515%ifdef X86_32 516 mov r4, arg5 517%else 518 mov r4, r12 519%endif 520 sar r4, $01 ; iSrcWidth >> 1 521 neg r4 ; -(iSrcWidth >> 1) 522 mov r6, r4 523 align 16 524 ; each loop = source bandwidth: 32 bytes 525.xloops4: 526 movdqa xmm0, [r2+r3] 527 movdqa xmm1, [r2+r3+16] 528 pavgb xmm0, [r2] ; avg vertical pixels 0-15 529 pavgb xmm1, [r2+16] ; avg vertical pixels 16-31 530 add r2, 32 ; pSrc += 32 531 pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels 0-15 532 pmaddubsw xmm1, xmm3 ; pairwise horizontal sum neighboring pixels 16-31 533 pavgw xmm0, xmm2 ; (sum + 1) >> 1 534 pavgw xmm1, xmm2 ; (sum + 1) >> 1 535 packuswb xmm0, xmm1 ; pack words to bytes 536 movdqa [r0+r4], xmm0 ; store results 537 add r4, 16 538 jl .xloops4 539 540 ; next line 541 lea r2, [r2+2*r3] ; next end of lines 542 lea r2, [r2+2*r6] ; reset to base 0 [- 2 * iDstWidth] 543 lea r0, [r0+r1] 544 545 sub r5, 1 546 jg 
.yloops4 547 548%ifndef X86_32 549 pop r12 550%endif 551 552 POP_XMM 553 LOAD_6_PARA_POP 554%ifdef X86_32 555 pop r6 556%endif 557 ret 558 559;*********************************************************************** 560; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride, 561; unsigned char* pSrc, const int iSrcStride, 562; const int iSrcWidth, const int iSrcHeight ); 563;*********************************************************************** 564WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3 565%ifdef X86_32 566 push r6 567 %assign push_num 1 568%else 569 %assign push_num 0 570%endif 571 LOAD_6_PARA 572 PUSH_XMM 4 573 SIGN_EXTENSION r1, r1d 574 SIGN_EXTENSION r3, r3d 575 SIGN_EXTENSION r4, r4d 576 SIGN_EXTENSION r5, r5d 577 578%ifndef X86_32 579 push r12 580 mov r12, r4 581%endif 582 sar r5, $01 ; iSrcHeight >> 1 583 WELS_DB1 xmm3 584 WELS_Zero xmm2 585 add r2, r4 ; pSrc += iSrcWidth 586 sar r4, $01 ; iSrcWidth >> 1 587 add r0, r4 ; pDst += iSrcWidth >> 1 588 589.yloops5: 590%ifdef X86_32 591 mov r4, arg5 592%else 593 mov r4, r12 594%endif 595 sar r4, $01 ; iSrcWidth >> 1 596 neg r4 ; -(iSrcWidth >> 1) 597 lea r6, [r2+r3] ; pSrc + iSrcStride 598 align 16 599 ; each loop = source bandwidth: 16 bytes 600.xloops5: 601 movdqa xmm0, [r2+2*r4] 602 pavgb xmm0, [r6+2*r4] ; avg vertical pixels 603 pmaddubsw xmm0, xmm3 ; pairwise horizontal sum neighboring pixels 604 pavgw xmm0, xmm2 ; (sum + 1) >> 1 605 packuswb xmm0, xmm0 ; pack words to bytes 606 movlps [r0+r4], xmm0 ; store results 607 add r4, 8 608 jl .xloops5 609 610 ; next line 611 lea r2, [r2+2*r3] ; next end of lines 612 lea r0, [r0+r1] 613 614 sub r5, 1 615 jg .yloops5 616 617%ifndef X86_32 618 pop r12 619%endif 620 621 POP_XMM 622 LOAD_6_PARA_POP 623%ifdef X86_32 624 pop r6 625%endif 626 ret 627 628 629%ifdef X86_32 630;************************************************************************************************************** 631;int 
GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, 632; unsigned char* pSrc, const int iSrcStride, 633; unsigned int uiScaleX, unsigned int uiScaleY ); 634;{ 635;************************************************************************************************************** 636 637WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 638 push ebp 639 push esi 640 push edi 641 push ebx 642%define pushsize 16 643%define localsize 16 644%define pDstData esp + pushsize + localsize + 4 645%define dwDstStride esp + pushsize + localsize + 8 646%define dwDstWidth esp + pushsize + localsize + 12 647%define dwDstHeight esp + pushsize + localsize + 16 648%define pSrcData esp + pushsize + localsize + 20 649%define dwSrcStride esp + pushsize + localsize + 24 650%define uiScaleX esp + pushsize + localsize + 28 651%define uiScaleY esp + pushsize + localsize + 32 652%define tmpHeight esp + 0 653%define yInverse esp + 4 654%define xInverse esp + 8 655%define dstStep esp + 12 656 sub esp, localsize 657 658 pxor xmm0, xmm0 659 mov eax, [uiScaleX] 660 and eax, 32767 661 mov ebx, eax 662 neg ebx 663 and ebx, 32767 664 movd xmm1, eax ; uinc(uiScaleX mod 32767) 665 movd xmm2, ebx ; -uinc 666 psllq xmm1, 32 667 por xmm1, xmm2 ; 0 0 uinc -uinc (dword) 668 pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc 669 670 mov eax, [uiScaleY] 671 and eax, 32767 672 mov ebx, eax 673 neg ebx 674 and ebx, 32767 675 movd xmm6, eax ; vinc(uiScaleY mod 32767) 676 movd xmm2, ebx ; -vinc 677 psllq xmm6, 32 678 por xmm6, xmm2 ; 0 0 vinc -vinc (dword) 679 pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc 680 681 mov edx, 40003fffh 682 movd xmm5, edx 683 punpcklwd xmm5, xmm0 ; 16384 16383 684 pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 685 686 687DOWNSAMPLE: 688 689 mov eax, [dwDstHeight] 690 mov edi, [pDstData] 691 mov edx, [dwDstStride] 692 mov ecx, [dwDstWidth] 693 sub edx, ecx 694 mov [dstStep], edx ; 
stride - width 695 dec eax 696 mov [tmpHeight], eax 697 mov eax, 16384 698 mov [yInverse], eax 699 700 pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 701 702HEIGHT: 703 mov eax, [yInverse] 704 mov esi, [pSrcData] 705 shr eax, 15 706 mul dword [dwSrcStride] 707 add esi, eax ; get current row address 708 mov ebp, esi 709 add ebp, [dwSrcStride] 710 711 mov eax, 16384 712 mov [xInverse], eax 713 mov ecx, [dwDstWidth] 714 dec ecx 715 716 movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 717 718WIDTH: 719 mov eax, [xInverse] 720 shr eax, 15 721 722 movd xmm1, [esi+eax] ; xxxxxxba 723 movd xmm2, [ebp+eax] ; xxxxxxdc 724 pxor xmm0, xmm0 725 punpcklwd xmm1, xmm2 ; xxxxdcba 726 punpcklbw xmm1, xmm0 ; 0d0c0b0a 727 punpcklwd xmm1, xmm0 ; 000d000c000b000a 728 729 movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv 730 pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 731 movdqa xmm0, xmm2 732 pmuludq xmm2, xmm1 733 psrlq xmm0, 32 734 psrlq xmm1, 32 735 pmuludq xmm0, xmm1 736 paddq xmm2, xmm0 737 pshufd xmm1, xmm2, 00001110b 738 paddq xmm2, xmm1 739 psrlq xmm2, 29 740 741 movd eax, xmm2 742 inc eax 743 shr eax, 1 744 mov [edi], al 745 inc edi 746 747 mov eax, [uiScaleX] 748 add [xInverse], eax 749 750 paddw xmm3, xmm7 ; inc u 751 psllw xmm3, 1 752 psrlw xmm3, 1 753 754 loop WIDTH 755 756WIDTH_END: 757 mov eax, [xInverse] 758 shr eax, 15 759 mov cl, [esi+eax] 760 mov [edi], cl 761 inc edi 762 763 mov eax, [uiScaleY] 764 add [yInverse], eax 765 add edi, [dstStep] 766 767 paddw xmm4, xmm6 ; inc v 768 psllw xmm4, 1 769 psrlw xmm4, 1 770 771 dec dword [tmpHeight] 772 jg HEIGHT 773 774 775LAST_ROW: 776 mov eax, [yInverse] 777 mov esi, [pSrcData] 778 shr eax, 15 779 mul dword [dwSrcStride] 780 add esi, eax ; get current row address 781 782 mov eax, 16384 783 mov [xInverse], eax 784 mov ecx, [dwDstWidth] 785 786LAST_ROW_WIDTH: 787 mov eax, [xInverse] 788 shr eax, 15 789 790 mov al, [esi+eax] 791 mov [edi], al 792 inc edi 793 794 mov eax, [uiScaleX] 795 add [xInverse], 
eax 796 797 loop LAST_ROW_WIDTH 798 799LAST_ROW_END: 800 801 add esp, localsize 802 pop ebx 803 pop edi 804 pop esi 805 pop ebp 806%undef pushsize 807%undef localsize 808%undef pSrcData 809%undef dwSrcWidth 810%undef dwSrcHeight 811%undef dwSrcStride 812%undef pDstData 813%undef dwDstWidth 814%undef dwDstHeight 815%undef dwDstStride 816%undef uiScaleX 817%undef uiScaleY 818%undef tmpHeight 819%undef yInverse 820%undef xInverse 821%undef dstStep 822 ret 823 824 825 826 827;************************************************************************************************************** 828;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, 829; unsigned char* pSrc, const int iSrcStride, 830; unsigned int uiScaleX, unsigned int uiScaleY ); 831;{ 832;************************************************************************************************************** 833 834WELS_EXTERN GeneralBilinearFastDownsampler_sse2 835 push ebp 836 push esi 837 push edi 838 push ebx 839%define pushsize 16 840%define localsize 16 841%define pDstData esp + pushsize + localsize + 4 842%define dwDstStride esp + pushsize + localsize + 8 843%define dwDstWidth esp + pushsize + localsize + 12 844%define dwDstHeight esp + pushsize + localsize + 16 845%define pSrcData esp + pushsize + localsize + 20 846%define dwSrcStride esp + pushsize + localsize + 24 847%define uiScaleX esp + pushsize + localsize + 28 848%define uiScaleY esp + pushsize + localsize + 32 849%define tmpHeight esp + 0 850%define yInverse esp + 4 851%define xInverse esp + 8 852%define dstStep esp + 12 853 sub esp, localsize 854 855 pxor xmm0, xmm0 856 mov edx, 65535 857 mov eax, [uiScaleX] 858 and eax, edx 859 mov ebx, eax 860 neg ebx 861 and ebx, 65535 862 movd xmm1, eax ; uinc(uiScaleX mod 65536) 863 movd xmm2, ebx ; -uinc 864 psllq xmm1, 32 865 por xmm1, xmm2 ; 0 uinc 0 -uinc 866 pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc 867 868 mov eax, 
[uiScaleY] 869 and eax, 32767 870 mov ebx, eax 871 neg ebx 872 and ebx, 32767 873 movd xmm6, eax ; vinc(uiScaleY mod 32767) 874 movd xmm2, ebx ; -vinc 875 psllq xmm6, 32 876 por xmm6, xmm2 ; 0 vinc 0 -vinc 877 pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc 878 879 mov edx, 80007fffh ; 32768 32767 880 movd xmm5, edx 881 pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767 882 mov ebx, 16384 883 884 885FAST_DOWNSAMPLE: 886 887 mov eax, [dwDstHeight] 888 mov edi, [pDstData] 889 mov edx, [dwDstStride] 890 mov ecx, [dwDstWidth] 891 sub edx, ecx 892 mov [dstStep], edx ; stride - width 893 dec eax 894 mov [tmpHeight], eax 895 mov eax, 16384 896 mov [yInverse], eax 897 898 pshuflw xmm4, xmm5, 01010000b 899 psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383 900 901FAST_HEIGHT: 902 mov eax, [yInverse] 903 mov esi, [pSrcData] 904 shr eax, 15 905 mul dword [dwSrcStride] 906 add esi, eax ; get current row address 907 mov ebp, esi 908 add ebp, [dwSrcStride] 909 910 mov eax, 32768 911 mov [xInverse], eax 912 mov ecx, [dwDstWidth] 913 dec ecx 914 915 movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767 916 917FAST_WIDTH: 918 mov eax, [xInverse] 919 shr eax, 16 920 921 movd xmm1, [esi+eax] ; xxxxxxba 922 movd xmm2, [ebp+eax] ; xxxxxxdc 923 punpcklwd xmm1, xmm2 ; xxxxdcba 924 punpcklbw xmm1, xmm0 ; 0d0c0b0a 925 926 movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv 927 pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 928 pmaddwd xmm2, xmm1 929 pshufd xmm1, xmm2, 00000001b 930 paddd xmm2, xmm1 931 movd xmm1, ebx 932 paddd xmm2, xmm1 933 psrld xmm2, 15 934 935 packuswb xmm2, xmm0 936 movd eax, xmm2 937 mov [edi], al 938 inc edi 939 940 mov eax, [uiScaleX] 941 add [xInverse], eax 942 943 paddw xmm3, xmm7 ; inc u 944 945 loop FAST_WIDTH 946 947FAST_WIDTH_END: 948 mov eax, [xInverse] 949 shr eax, 16 950 mov cl, [esi+eax] 951 mov [edi], cl 952 inc edi 953 954 mov eax, [uiScaleY] 955 add [yInverse], eax 956 add edi, [dstStep] 957 958 paddw xmm4, xmm6 ; inc v 959 psllw xmm4, 1 
960 psrlw xmm4, 1 961 962 dec dword [tmpHeight] 963 jg FAST_HEIGHT 964 965 966FAST_LAST_ROW: 967 mov eax, [yInverse] 968 mov esi, [pSrcData] 969 shr eax, 15 970 mul dword [dwSrcStride] 971 add esi, eax ; get current row address 972 973 mov eax, 32768 974 mov [xInverse], eax 975 mov ecx, [dwDstWidth] 976 977FAST_LAST_ROW_WIDTH: 978 mov eax, [xInverse] 979 shr eax, 16 980 981 mov al, [esi+eax] 982 mov [edi], al 983 inc edi 984 985 mov eax, [uiScaleX] 986 add [xInverse], eax 987 988 loop FAST_LAST_ROW_WIDTH 989 990FAST_LAST_ROW_END: 991 992 add esp, localsize 993 pop ebx 994 pop edi 995 pop esi 996 pop ebp 997%undef pushsize 998%undef localsize 999%undef pSrcData 1000%undef dwSrcWidth 1001%undef dwSrcHeight 1002%undef dwSrcStride 1003%undef pDstData 1004%undef dwDstStride 1005%undef uiScaleX 1006%undef uiScaleY 1007%undef tmpHeight 1008%undef yInverse 1009%undef xInverse 1010%undef dstStep 1011 ret 1012 1013%elifdef WIN64 1014 1015;************************************************************************************************************** 1016;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight, 1017; unsigned char* pSrc, const int iSrcStride, 1018; unsigned int uiScaleX, unsigned int uiScaleY ); 1019;{ 1020;************************************************************************************************************** 1021 1022WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2 1023 push r12 1024 push r13 1025 push r14 1026 push r15 1027 push rsi 1028 push rdi 1029 push rbx 1030 push rbp 1031 %assign push_num 8 1032 LOAD_7_PARA 1033 PUSH_XMM 8 1034 SIGN_EXTENSION r1, r1d 1035 SIGN_EXTENSION r2, r2d 1036 SIGN_EXTENSION r3, r3d 1037 SIGN_EXTENSION r5, r5d 1038 SIGN_EXTENSION r6, r6d 1039 1040 pxor xmm0, xmm0 1041 mov r12d, r6d 1042 and r12d, 32767 1043 mov r13d, r12d 1044 neg r13d 1045 and r13d, 32767 1046 movd xmm1, r12d ; uinc(uiScaleX mod 32767) 1047 movd xmm2, r13d ; -uinc 1048 psllq 
xmm1, 32 1049 por xmm1, xmm2 ; 0 0 uinc -uinc (dword) 1050 pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc 1051 1052 mov r12, arg8 1053 SIGN_EXTENSION r12, r12d 1054 mov rbp, r12 1055 and r12d, 32767 1056 mov r13d, r12d 1057 neg r13d 1058 and r13d, 32767 1059 movd xmm6, r12d ; vinc(uiScaleY mod 32767) 1060 movd xmm2, r13d ; -vinc 1061 psllq xmm6, 32 1062 por xmm6, xmm2 ; 0 0 vinc -vinc (dword) 1063 pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc 1064 1065 mov r12d, 40003fffh 1066 movd xmm5, r12d 1067 punpcklwd xmm5, xmm0 ; 16384 16383 1068 pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383 1069 1070DOWNSAMPLE: 1071 sub r1, r2 ; stride - width 1072 dec r3 1073 mov r14,16384 1074 pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383 1075 1076HEIGHT: 1077 ;mov r12, r4 1078 mov r12, r14 1079 shr r12, 15 1080 imul r12, r5 1081 add r12, r4 ; get current row address 1082 mov r13, r12 1083 add r13, r5 1084 1085 mov r15, 16384 1086 mov rsi, r2 1087 dec rsi 1088 movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383 1089 1090WIDTH: 1091 mov rdi, r15 1092 shr rdi, 15 1093 1094 movd xmm1, [r12+rdi] ; xxxxxxba 1095 movd xmm2, [r13+rdi] ; xxxxxxdc 1096 pxor xmm0, xmm0 1097 punpcklwd xmm1, xmm2 ; xxxxdcba 1098 punpcklbw xmm1, xmm0 ; 0d0c0b0a 1099 punpcklwd xmm1, xmm0 ; 000d000c000b000a 1100 1101 movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv 1102 pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2 1103 movdqa xmm0, xmm2 1104 pmuludq xmm2, xmm1 1105 psrlq xmm0, 32 1106 psrlq xmm1, 32 1107 pmuludq xmm0, xmm1 1108 paddq xmm2, xmm0 1109 pshufd xmm1, xmm2, 00001110b 1110 paddq xmm2, xmm1 1111 psrlq xmm2, 29 1112 1113 movd ebx, xmm2 1114 inc ebx 1115 shr ebx, 1 1116 mov [r0], bl 1117 inc r0 1118 1119 add r15, r6 1120 paddw xmm3, xmm7 ; inc u 1121 psllw xmm3, 1 1122 psrlw xmm3, 1 1123 1124 dec rsi 1125 jg WIDTH 1126 1127WIDTH_END: 1128 shr r15, 15 1129 mov bl, [r12+r15] 1130 mov [r0],bl 1131 inc r0 1132 add r14, rbp 1133 add r0, r1 1134 1135 paddw 
xmm4, xmm6                          ; (operands of the preceding paddw) v += vinc / -vinc per lane
    psllw xmm4, 1                   ; shift out bit 15 so v stays a 15-bit fraction
    psrlw xmm4, 1

    dec r3
    jg HEIGHT

LAST_ROW:
    ; final destination row: nearest-neighbour copy from the source
    shr r14, 15                     ; integer part of y position (Q15)
    imul r14, r5                    ; * iSrcStride
    add r4, r14                     ; r4 -> last source row
    mov r15, 16384                  ; x accumulator, Q15

LAST_ROW_WIDTH:
    mov rdi, r15
    shr rdi, 15                     ; integer part of x position
    mov bl, [r4+rdi]
    mov [r0], bl
    inc r0

    add r15, r6                     ; x += uiScaleX
    dec r2
    jg LAST_ROW_WIDTH

LAST_ROW_END:

    POP_XMM
    pop rbp
    pop rbx
    pop rdi
    pop rsi
    pop r15
    pop r14
    pop r13
    pop r12
    ret

;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                                         unsigned char* pSrc, const int iSrcStride,
;                                         unsigned int uiScaleX, unsigned int uiScaleY );
;
; Lower-precision bilinear downsampler (16-bit u fraction, pmulhuw weighting).
; X86_32 variant: rsi/rdi are used as loop scratch, so they are saved here.
;**************************************************************************************************************
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push r12
    push r13
    push r14
    push r15
    push rsi
    push rdi
    push rbx
    push rbp
    %assign push_num 8
    LOAD_7_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor xmm0, xmm0
    mov r12d, r6d
    and r12d, 65535
    mov r13d, r12d
    neg r13d
    and r13d, 65535
    movd xmm1, r12d                 ; uinc (uiScaleX mod 65536)
    movd xmm2, r13d                 ; -uinc
    psllq xmm1, 32
    por xmm1, xmm2                  ; 0 uinc 0 -uinc
    pshuflw xmm7, xmm1, 10001000b   ; xmm7: uinc -uinc uinc -uinc

    mov r12, arg8
    SIGN_EXTENSION r12, r12d
    mov rbp, r12                    ; rbp = uiScaleY, per-row y increment (Q15)
    and r12d, 32767
    mov r13d, r12d
    neg r13d
    and r13d, 32767
    movd xmm6, r12d                 ; vinc (uiScaleY mod 32767)
    movd xmm2, r13d                 ; -vinc
    psllq xmm6, 32
    por xmm6, xmm2                  ; 0 vinc 0 -vinc
    pshuflw xmm6, xmm6, 10100000b   ; xmm6: vinc vinc -vinc -vinc

    mov r12d, 80007fffh             ; 32768 32767
    movd xmm5, r12d
    pshuflw xmm5, xmm5, 01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub r1, r2                      ; iDstStride - iDstWidth (dst row advance after the loop)
    dec r3                          ; last row is produced separately (LAST_ROW path)
    mov r14, 16384                  ; y accumulator, Q15

    pshuflw xmm4, xmm5, 01010000b
    psrlw xmm4, 1                   ; initial v: 16384 16384 16383 16383

FAST_HEIGHT:
    mov r12, r14
    shr r12, 15                     ; integer part of y
    imul r12, r5
    add r12, r4                     ; r12 -> current source row
    mov r13, r12
    add r13, r5                     ; r13 -> next source row

    mov r15, 32768                  ; x accumulator, Q16
    mov rsi, r2
    dec rsi                         ; last column handled separately below

    movdqa xmm3, xmm5               ; initial u: 32768 32767 32768 32767

FAST_WIDTH:
    mov rdi, r15
    shr rdi, 16                     ; integer part of x

    movd xmm1, [r12+rdi]            ; xxxxxxba  (two pixels, top row)
    movd xmm2, [r13+rdi]            ; xxxxxxdc  (two pixels, bottom row)
    punpcklwd xmm1, xmm2            ; xxxxdcba
    punpcklbw xmm1, xmm0            ; 0d0c0b0a

    movdqa xmm2, xmm4               ; xmm2: v v (1-v) (1-v)
    pmulhuw xmm2, xmm3              ; * u (1-u) u (1-u) -> per-tap bilinear weights
    pmaddwd xmm2, xmm1              ; weights * pixels, two partial sums
    pshufd xmm1, xmm2, 00000001b
    paddd xmm2, xmm1                ; combine partial sums
    movdqa xmm1, [add_extra_half]
    paddd xmm2, xmm1                ; + rounding bias
    psrld xmm2, 15

    packuswb xmm2, xmm0
    movd ebx, xmm2
    mov [r0], bl                    ; store one output pixel
    inc r0

    add r15, r6                     ; x += uiScaleX

    paddw xmm3, xmm7                ; u += uinc (wraps mod 65536)
    dec rsi
    jg FAST_WIDTH

FAST_WIDTH_END:
    ; last column: nearest sample from the current top row
    shr r15, 16
    mov bl, [r12+r15]
    mov [r0], bl
    inc r0
    add r14, rbp                    ; y += uiScaleY
    add r0, r1                      ; dst to start of next row

    paddw xmm4, xmm6                ; v += vinc
    psllw xmm4, 1                   ; keep v a 15-bit fraction
    psrlw xmm4, 1

    dec r3
    jg FAST_HEIGHT


FAST_LAST_ROW:
    ; final row: nearest-neighbour copy
    shr r14, 15
    imul r14, r5
    add r4, r14
    mov r15, 32768

FAST_LAST_ROW_WIDTH:
    mov rdi, r15
    shr rdi, 16
    mov bl, [r4+rdi]
    mov [r0], bl
    inc r0

    add r15, r6
    dec r2
    jg FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    POP_XMM
    pop rbp
    pop rbx
    pop rdi
    pop rsi
    pop r15
    pop r14
    pop r13
    pop r12
    ret

%elifdef UNIX64
;**************************************************************************************************************
;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                                             unsigned char* pSrc, const int iSrcStride,
;                                             unsigned int uiScaleX, unsigned int uiScaleY );
;{
; Full-precision bilinear downsampler (15-bit fractions, 32x32-bit products via pmuludq).
; UNIX64 variant: rax/r11 serve as scratch, so fewer registers need saving.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
    push r12
    push r13
    push r14
    push r15
    push rbx
    push rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor xmm0, xmm0
    mov r12d, r6d
    and r12d, 32767
    mov r13d, r12d
    neg r13d
    and r13d, 32767
    movd xmm1, r12d                 ; uinc (uiScaleX mod 32767)
    movd xmm2, r13d                 ; -uinc
    psllq xmm1, 32
    por xmm1, xmm2                  ; 0 0 uinc -uinc (dword)
    pshufd xmm7, xmm1, 01000100b    ; xmm7: uinc -uinc uinc -uinc

    mov r12, arg8
    SIGN_EXTENSION r12, r12d
    mov rbp, r12                    ; rbp = uiScaleY, per-row y increment (Q15)
    and r12d, 32767
    mov r13d, r12d
    neg r13d
    and r13d, 32767
    movd xmm6, r12d                 ; vinc (uiScaleY mod 32767)
    movd xmm2, r13d                 ; -vinc
    psllq xmm6, 32
    por xmm6, xmm2                  ; 0 0 vinc -vinc (dword)
    pshufd xmm6, xmm6, 01010000b    ; xmm6: vinc vinc -vinc -vinc

    mov r12d, 40003fffh
    movd xmm5, r12d
    punpcklwd xmm5, xmm0            ; 16384 16383
    pshufd xmm5, xmm5, 01000100b    ; xmm5: 16384 16383 16384 16383

DOWNSAMPLE:
    sub r1, r2                      ; iDstStride - iDstWidth
    dec r3                          ; last row produced separately
    mov r14, 16384                  ; y accumulator, Q15
    pshufd xmm4, xmm5, 01010000b    ; initial v: 16384 16384 16383 16383

HEIGHT:
    ;mov r12, r4
    mov r12, r14
    shr r12, 15                     ; integer part of y
    imul r12, r5
    add r12, r4                     ; r12 -> current source row
    mov r13, r12
    add r13, r5                     ; r13 -> next source row

    mov r15, 16384                  ; x accumulator, Q15
    mov rax, r2
    dec rax                         ; last column handled separately
    movdqa xmm3, xmm5               ; initial u: 16384 16383 16384 16383

WIDTH:
    mov r11, r15
    shr r11, 15                     ; integer part of x

    movd xmm1, [r12+r11]            ; xxxxxxba
    movd xmm2, [r13+r11]            ; xxxxxxdc
    pxor xmm0, xmm0
    punpcklwd xmm1, xmm2            ; xxxxdcba
    punpcklbw xmm1, xmm0            ; 0d0c0b0a
    punpcklwd xmm1, xmm0            ; 000d000c000b000a

    movdqa xmm2, xmm4               ; xmm2: v v (1-v) (1-v)
    pmaddwd xmm2, xmm3              ; * u (1-u) u (1-u) -> 30-bit weights
    movdqa xmm0, xmm2
    pmuludq xmm2, xmm1              ; weight * pixel, even dwords
    psrlq xmm0, 32
    psrlq xmm1, 32
    pmuludq xmm0, xmm1              ; weight * pixel, odd dwords
    paddq xmm2, xmm0
    pshufd xmm1, xmm2, 00001110b
    paddq xmm2, xmm1                ; total weighted sum
    psrlq xmm2, 29

    movd ebx, xmm2
    inc ebx
    shr ebx, 1                      ; round to nearest
    mov [r0], bl
    inc r0

    add r15, r6                     ; x += uiScaleX
    paddw xmm3, xmm7                ; u += uinc
    psllw xmm3, 1                   ; keep u a 15-bit fraction
    psrlw xmm3, 1

    dec rax
    jg WIDTH

WIDTH_END:
    shr r15, 15
    mov bl, [r12+r15]               ; last column: nearest sample
    mov [r0], bl
    inc r0
    add r14, rbp                    ; y += uiScaleY
    add r0, r1

    paddw xmm4, xmm6                ; v += vinc
    psllw xmm4, 1
    psrlw xmm4, 1

    dec r3
    jg HEIGHT

LAST_ROW:
    ; final row: nearest-neighbour copy
    shr r14, 15
    imul r14, r5
    add r4, r14
    mov r15, 16384

LAST_ROW_WIDTH:
    mov r11, r15
    shr r11, 15
    mov bl, [r4+r11]
    mov [r0], bl
    inc r0

    add r15, r6
    dec r2
    jg LAST_ROW_WIDTH

LAST_ROW_END:

    pop rbp
    pop rbx
    pop r15
    pop r14
    pop r13
    pop r12
    ret

;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
;                                         unsigned char* pSrc, const int iSrcStride,
;                                         unsigned int uiScaleX, unsigned int uiScaleY );
;{
; Lower-precision bilinear downsampler (16-bit u fraction, pmulhuw weighting). UNIX64 variant.
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_sse2
    push r12
    push r13
    push r14
    push r15
    push rbx
    push rbp
    %assign push_num 6
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

    pxor xmm0, xmm0
    mov r12d, r6d
    and r12d, 65535
    mov r13d, r12d
    neg r13d
    and r13d, 65535
    movd xmm1, r12d                 ; uinc (uiScaleX mod 65536)
    movd xmm2, r13d                 ; -uinc
    psllq xmm1, 32
    por xmm1, xmm2                  ; 0 uinc 0 -uinc
    pshuflw xmm7, xmm1, 10001000b   ; xmm7: uinc -uinc uinc -uinc

    mov r12, arg8
    SIGN_EXTENSION r12, r12d
    mov rbp, r12                    ; rbp = uiScaleY (Q15 row increment)
    and r12d, 32767
    mov r13d, r12d
    neg r13d
    and r13d, 32767
    movd xmm6, r12d                 ; vinc (uiScaleY mod 32767)
    movd xmm2, r13d                 ; -vinc
    psllq xmm6, 32
    por xmm6, xmm2                  ; 0 vinc 0 -vinc
    pshuflw xmm6, xmm6, 10100000b   ; xmm6: vinc vinc -vinc -vinc

    mov r12d, 80007fffh             ; 32768 32767
    movd xmm5, r12d
    pshuflw xmm5, xmm5, 01000100b   ; 32768 32767 32768 32767

FAST_DOWNSAMPLE:
    sub r1, r2                      ; iDstStride - iDstWidth
    dec r3
    mov r14, 16384                  ; y accumulator, Q15

    pshuflw xmm4, xmm5, 01010000b
    psrlw xmm4, 1                   ; initial v: 16384 16384 16383 16383

FAST_HEIGHT:
    mov r12, r14
    shr r12, 15
    imul r12, r5
    add r12, r4                     ; r12 -> current source row
    mov r13, r12
    add r13, r5                     ; r13 -> next source row

    mov r15, 32768                  ; x accumulator, Q16
    mov rax, r2
    dec rax

    movdqa xmm3, xmm5               ; initial u: 32768 32767 32768 32767

FAST_WIDTH:
    mov r11, r15
    shr r11, 16

    movd xmm1, [r12+r11]            ; xxxxxxba
    movd xmm2, [r13+r11]            ; xxxxxxdc
    punpcklwd xmm1, xmm2            ; xxxxdcba
    punpcklbw xmm1, xmm0            ; 0d0c0b0a

    movdqa xmm2, xmm4               ; xmm2: v v (1-v) (1-v)
    pmulhuw xmm2, xmm3              ; * u (1-u) u (1-u) -> per-tap weights
    pmaddwd xmm2, xmm1
    pshufd xmm1, xmm2, 00000001b
    paddd xmm2, xmm1
    movdqa xmm1, [add_extra_half]
    paddd xmm2, xmm1                ; + rounding bias
    psrld xmm2, 15

    packuswb xmm2, xmm0
    movd ebx, xmm2
    mov [r0], bl
    inc r0

    add r15, r6                     ; x += uiScaleX

    paddw xmm3, xmm7                ; u += uinc (wraps mod 65536)
    dec rax
    jg FAST_WIDTH

FAST_WIDTH_END:
    shr r15, 16
    mov bl, [r12+r15]               ; last column: nearest sample
    mov [r0], bl
    inc r0
    add r14, rbp                    ; y += uiScaleY
    add r0, r1

    paddw xmm4, xmm6                ; v += vinc
    psllw xmm4, 1
    psrlw xmm4, 1

    dec r3
    jg FAST_HEIGHT


FAST_LAST_ROW:
    ; final row: nearest-neighbour copy
    shr r14, 15
    imul r14, r5
    add r4, r14
    mov r15, 32768

FAST_LAST_ROW_WIDTH:
    mov r11, r15
    shr r11, 16
    mov bl, [r4+r11]
    mov [r0], bl
    inc r0

    add r15, r6
    dec r2
    jg FAST_LAST_ROW_WIDTH

FAST_LAST_ROW_END:

    pop rbp
    pop rbx
    pop r15
    pop r14
    pop r13
    pop r12
    ret
%endif

;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_ssse3( unsigned char* pDst, const int iDstStride,
;                                   unsigned char* pSrc, const int iSrcStride,
;                                   const int iSrcWidth, const int iSrcHeight );
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_ssse3
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifdef X86_32_PICASM
    %define i_height dword arg6
%else
    %define i_height r5
%endif
    INIT_X86_32_PIC_NOPRESERVE r5

%ifndef X86_32
    push r12
    mov r12, r4                     ; preserve iSrcWidth across the row loop
%endif

    ; Save the 16 bytes just past the destination area: the 16-byte stores in
    ; the loop below overrun the last row; the saved bytes are written back at
    ; the end.
    mov r6, r1
    imul r6, i_height
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse3:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif

    mov r6, r0                      ; save dst row base address
    ; each loop =
; source bandwidth: 48 bytes per iteration (3:1 horizontal, 3:1 vertical via 2-row average)
.xloops_onethird_sse3:
    ; 1st part horizontal loop: x48 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
    ;               xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
    ;               xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' .. A'
    ;: p' .. a'

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movdqa xmm0, [r2]               ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm0, xmm5               ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6               ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movdqa xmm2, [r2+16]            ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm2, xmm5               ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6               ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2              ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3              ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movdqa xmm2, [r2+32]            ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
    pshufb xmm2, xmm5               ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6               ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2              ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3              ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                ;1st line average -> xmm0

    ;2nd line
    movdqa xmm2, [r2+r3]            ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm2, xmm5               ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6               ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+16]         ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
    movdqa xmm4, xmm1
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm1, xmm5               ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6               ;0 0 0 0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1              ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4              ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3

    movdqa xmm1, [r2+r3+32]         ; * p' P' * o' O' * n' N' * m' M' * l' L' *
    movdqa xmm4, xmm1
    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
    pshufb xmm1, xmm5               ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6               ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4

    paddusb xmm2, xmm1              ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2
    paddusb xmm3, xmm4              ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3
    pavgb xmm2, xmm3                ;2nd line average -> xmm2

    pavgb xmm0, xmm2                ; bytes-average(1st line, 2nd line)

    ; write pDst
    movdqa [r0], xmm0               ;write result in dst

    ; next SMB
    lea r2, [r2+48]                 ;current src address
    lea r0, [r0+16]                 ;current dst address

    sub r4, 48                      ;xloops counter
    cmp r4, 0
    jg near .xloops_onethird_sse3

    sub r6, r0                      ;offset = base address - current address
    lea r2, [r2+2*r3]               ;
    lea r2, [r2+r3]                 ;
    lea r2, [r2+2*r6]               ;current line + 3 lines
    lea r2, [r2+r6]
    lea r0, [r0+r1]
    lea r0, [r0+r6]                 ;current dst line + 1 line

    dec i_height
    jg near .yloops_onethird_sse3

    movdqa [r0], xmm7               ;restore the
; bytes saved at entry (the loop's 16-byte stores overran the destination)

%ifndef X86_32
    pop r12
%endif

    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop r6
%endif
    ret
%undef i_height

;***********************************************************************
;   void DyadicBilinearOneThirdDownsampler_sse4( unsigned char* pDst, const int iDstStride,
;                                   unsigned char* pSrc, const int iSrcStride,
;                                   const int iSrcWidth, const int iSrcHeight );
;
; Same algorithm as the ssse3 version, but loads the source with movntdqa
; (streaming load; requires 16-byte-aligned source rows).
;***********************************************************************
WELS_EXTERN DyadicBilinearOneThirdDownsampler_sse4
%ifdef X86_32
    push r6
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    SIGN_EXTENSION r5, r5d
%ifdef X86_32_PICASM
    %define i_height dword arg6
%else
    %define i_height r5
%endif
    INIT_X86_32_PIC_NOPRESERVE r5

%ifndef X86_32
    push r12
    mov r12, r4                     ; preserve iSrcWidth across the row loop
%endif

    ; Save the 16 bytes just past the destination area; restored after the loops.
    mov r6, r1
    imul r6, i_height
    add r6, r0
    movdqa xmm7, [r6]

.yloops_onethird_sse4:
%ifdef X86_32
    mov r4, arg5
%else
    mov r4, r12
%endif

    mov r6, r0                      ;save base address
    ; each loop = source bandwidth: 48 bytes
.xloops_onethird_sse4:
    ; 1st part horizontal loop: x48 bytes
    ;               mem  hi<-       ->lo
    ;1st Line Src:  xmm0: F * e E * d D * c C * b B * a A
    ;               xmm2: k K * j J * i I * h H * g G * f
    ;               xmm2: * p P * o O * n N * m M * l L *
    ;
    ;2nd Line Src:  xmm2: F' * e' E' * d' D' * c' C' * b' B' * a' A'
    ;               xmm1: k' K' * j' J' * i' I' * h' H' * g' G' * f'
    ;               xmm1: * p' P' * o' O' * n' N' * m' M' * l' L' *
    ;=> target:
    ;: P O N M L K J I H G F E D C B A
    ;: p o n m l k j i h g f e d c b a
    ;: P' .. A'
    ;: p' .. a'

    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;1st line
    movntdqa xmm0, [r2]             ;F * e E * d D * c C * b B * a A
    movdqa xmm1, xmm0
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm0, xmm5               ;0 0 0 0 0 0 0 0 0 0 F E D C B A -> xmm0
    pshufb xmm1, xmm6               ;0 0 0 0 0 0 0 0 0 0 0 e d c b a -> xmm1

    movntdqa xmm2, [r2+16]          ;k K * j J * i I * h H * g G * f
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm2, xmm5               ;0 0 0 0 0 K J I H G 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6               ;0 0 0 0 0 k j i h g f 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2              ;0 0 0 0 0 K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3              ;0 0 0 0 0 k j i h g f e d c b a -> xmm1

    movntdqa xmm2, [r2+32]          ;* p P * o O * n N * m M * l L *
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_3)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_3)]
    pshufb xmm2, xmm5               ;P O N M L 0 0 0 0 0 0 0 0 0 0 0 -> xmm2
    pshufb xmm3, xmm6               ;p o n m l 0 0 0 0 0 0 0 0 0 0 0 -> xmm3

    paddusb xmm0, xmm2              ;P O N M L K J I H G F E D C B A -> xmm0
    paddusb xmm1, xmm3              ;p o n m l k j i h g f e d c b a -> xmm1
    pavgb xmm0, xmm1                ;1st line average -> xmm0

    ;2nd line
    movntdqa xmm2, [r2+r3]          ;F' * e' E' * d' D' * c' C' * b' B' * a' A'
    movdqa xmm3, xmm2
    movdqa xmm5, [pic(shufb_mask_onethird_low_1)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_1)]
    pshufb xmm2, xmm5               ;0 0 0 0 0 0 0 0 0 0 F' E' D' C' B' A' -> xmm2
    pshufb xmm3, xmm6               ;0 0 0 0 0 0 0 0 0 0 0 e' d' c' b' a' -> xmm3

    movntdqa xmm1, [r2+r3+16]       ;k' K' * j' J' * i' I' * h' H' * g' G' * f'
    movdqa xmm4, xmm1
    movdqa xmm5, [pic(shufb_mask_onethird_low_2)]
    movdqa xmm6, [pic(shufb_mask_onethird_high_2)]
    pshufb xmm1, xmm5               ;0 0 0 0 0 K' J' I' H' G' 0 0 0 0 0 0 -> xmm1
    pshufb xmm4, xmm6               ;0 0 0
0 0 k' j' i' h' g' f' 0 0 0 0 0 -> xmm4 1884 1885 paddusb xmm2, xmm1 ;0 0 0 0 0 K' J' I' H' G' F' E' D' C' B' A' -> xmm2 1886 paddusb xmm3, xmm4 ;0 0 0 0 0 k' j' i' h' g' f' e' d' c' b' a' -> xmm3 1887 1888 movntdqa xmm1, [r2+r3+32] ; * p' P' * o' O' * n' N' * m' M' * l' L' * 1889 movdqa xmm4, xmm1 1890 movdqa xmm5, [pic(shufb_mask_onethird_low_3)] 1891 movdqa xmm6, [pic(shufb_mask_onethird_high_3)] 1892 pshufb xmm1, xmm5 ;P' O' N' M' L' 0 0 0 0 0 0 0 0 0 0 0 -> xmm1 1893 pshufb xmm4, xmm6 ;p' o' n' m' l' 0 0 0 0 0 0 0 0 0 0 0 -> xmm4 1894 1895 paddusb xmm2, xmm1 ;P' O' N' M' L' K' J' I' H' G' F' E' D' C' B' A' -> xmm2 1896 paddusb xmm3, xmm4 ;p' o' n' m' l' k' j' i' h' g' f' e' d' c' b' a' -> xmm3 1897 pavgb xmm2, xmm3 ;2nd line average -> xmm2 1898 1899 pavgb xmm0, xmm2 ; bytes-average(1st line , 2nd line ) 1900 1901 ; write pDst 1902 movdqa [r0], xmm0 ;write result in dst 1903 1904 ; next SMB 1905 lea r2, [r2+48] ;current src address 1906 lea r0, [r0+16] ;current dst address 1907 1908 sub r4, 48 ;xloops counter 1909 cmp r4, 0 1910 jg near .xloops_onethird_sse4 1911 1912 sub r6, r0 ;offset = base address - current address 1913 lea r2, [r2+2*r3] ; 1914 lea r2, [r2+r3] ; 1915 lea r2, [r2+2*r6] ;current line + 3 lines 1916 lea r2, [r2+r6] 1917 lea r0, [r0+r1] 1918 lea r0, [r0+r6] ;current dst lien + 1 line 1919 1920 dec i_height 1921 jg near .yloops_onethird_sse4 1922 1923 movdqa [r0], xmm7 ;restore the tailer for the unasigned size 1924 1925%ifndef X86_32 1926 pop r12 1927%endif 1928 1929 DEINIT_X86_32_PIC 1930 POP_XMM 1931 LOAD_6_PARA_POP 1932%ifdef X86_32 1933 pop r6 1934%endif 1935 ret 1936%undef i_height 1937 1938;*********************************************************************** 1939; void DyadicBilinearQuarterDownsampler_sse( unsigned char* pDst, const int iDstStride, 1940; unsigned char* pSrc, const int iSrcStride, 1941; const int iSrcWidth, const int iSrcHeight ); 1942;*********************************************************************** 
1943WELS_EXTERN DyadicBilinearQuarterDownsampler_sse 1944%ifdef X86_32 1945 push r6 1946 %assign push_num 1 1947%else 1948 %assign push_num 0 1949%endif 1950 LOAD_6_PARA 1951 SIGN_EXTENSION r1, r1d 1952 SIGN_EXTENSION r3, r3d 1953 SIGN_EXTENSION r4, r4d 1954 SIGN_EXTENSION r5, r5d 1955 1956%ifndef X86_32 1957 push r12 1958 mov r12, r4 1959%endif 1960 sar r5, $02 ; iSrcHeight >> 2 1961 1962 mov r6, r1 ;Save the tailer for the unasigned size 1963 imul r6, r5 1964 add r6, r0 1965 movq xmm7, [r6] 1966 1967.yloops_quarter_sse: 1968%ifdef X86_32 1969 mov r4, arg5 1970%else 1971 mov r4, r12 1972%endif 1973 1974 mov r6, r0 ;save base address 1975 ; each loop = source bandwidth: 32 bytes 1976.xloops_quarter_sse: 1977 ; 1st part horizonal loop: x16 bytes 1978 ; mem hi<- ->lo 1979 ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E 1980 ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M 1981 ; 1982 ;=> target: 1983 ;: G E C A, 1984 ;: 1985 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1986 movq mm0, [r2] ; 1st pSrc line 1987 movq mm1, [r2+8] ; 1st pSrc line + 8 1988 movq mm2, [r2+r3] ; 2nd pSrc line 1989 movq mm3, [r2+r3+8] ; 2nd pSrc line + 8 1990 1991 pshufw mm0, mm0, 0d8h ; x X x X c C a A 1992 pshufw mm1, mm1, 0d8h ; x X x X g G e E 1993 pshufw mm2, mm2, 0d8h ; x X x X k K i I 1994 pshufw mm3, mm3, 0d8h ; x X x X o O m M 1995 1996 punpckldq mm0, mm1 ; g G e E c C a A 1997 punpckldq mm2, mm3 ; o O m M k K i I 1998 1999 ; to handle mm0,mm2 2000 pshufw mm4, mm0, 0d8h ;g G c C e E a A 2001 pshufw mm5, mm4, 04eh ;e E a A g G c C 2002 punpcklbw mm4, mm5 ;g e G E c a C A -> mm4 2003 pshufw mm4, mm4, 0d8h ;g e c a G E C A -> mm4 2004 2005 pshufw mm5, mm2, 0d8h ;o O k K m M i I 2006 pshufw mm6, mm5, 04eh ;m M i I o O k K 2007 punpcklbw mm5, mm6 ;o m O M k i K I 2008 pshufw mm5, mm5, 0d8h ;o m k i O M K I -> mm5 2009 2010 ; to handle mm4, mm5 2011 movq mm0, mm4 2012 punpckldq mm0, mm6 ;x x x x G E C A 2013 punpckhdq mm4, mm6 ;x x x x g e c a 2014 2015 movq mm1, mm5 2016 
punpckldq mm1, mm6 ;x x x x O M K I 2017 punpckhdq mm5, mm6 ;x x x x o m k i 2018 2019 ; avg within MB horizon width (8 x 2 lines) 2020 pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 2021 pavgb mm1, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 2022 pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once 2023 2024 ; 2nd part horizonal loop: x16 bytes 2025 movq mm1, [r2+16] ; 1st pSrc line + 16 2026 movq mm2, [r2+24] ; 1st pSrc line + 24 2027 movq mm3, [r2+r3+16] ; 2nd pSrc line + 16 2028 movq mm4, [r2+r3+24] ; 2nd pSrc line + 24 2029 2030 pshufw mm1, mm1, 0d8h 2031 pshufw mm2, mm2, 0d8h 2032 pshufw mm3, mm3, 0d8h 2033 pshufw mm4, mm4, 0d8h 2034 2035 punpckldq mm1, mm2 2036 punpckldq mm3, mm4 2037 2038 ; to handle mm1, mm3 2039 pshufw mm4, mm1, 0d8h 2040 pshufw mm5, mm4, 04eh 2041 punpcklbw mm4, mm5 2042 pshufw mm4, mm4, 0d8h 2043 2044 pshufw mm5, mm3, 0d8h 2045 pshufw mm6, mm5, 04eh 2046 punpcklbw mm5, mm6 2047 pshufw mm5, mm5, 0d8h 2048 2049 ; to handle mm4, mm5 2050 movq mm2, mm4 2051 punpckldq mm2, mm6 2052 punpckhdq mm4, mm6 2053 2054 movq mm3, mm5 2055 punpckldq mm3, mm6 2056 punpckhdq mm5, mm6 2057 2058 ; avg within MB horizon width (8 x 2 lines) 2059 pavgb mm2, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1 2060 pavgb mm3, mm5 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2 2061 pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part 2062 2063 movd [r0 ], mm0 2064 movd [r0+4], mm2 2065 2066 ; next SMB 2067 lea r2, [r2+32] 2068 lea r0, [r0+8] 2069 2070 sub r4, 32 2071 cmp r4, 0 2072 jg near .xloops_quarter_sse 2073 2074 sub r6, r0 2075 ; next line 2076 lea r2, [r2+4*r3] ; next 4 end of lines 2077 lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth] 2078 lea r0, [r0+r1] 2079 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 2080 2081 dec r5 2082 jg near .yloops_quarter_sse 2083 2084 movq [r0], xmm7 ;restored the tailer for the unasigned size 2085 2086 WELSEMMS 2087%ifndef X86_32 
2088 pop r12 2089%endif 2090 LOAD_6_PARA_POP 2091%ifdef X86_32 2092 pop r6 2093%endif 2094 ret 2095 2096;*********************************************************************** 2097; void DyadicBilinearQuarterDownsampler_ssse3( unsigned char* pDst, const int iDstStride, 2098; unsigned char* pSrc, const int iSrcStride, 2099; const int iSrcWidth, const int iSrcHeight ); 2100;*********************************************************************** 2101WELS_EXTERN DyadicBilinearQuarterDownsampler_ssse3 2102 ;push ebx 2103 ;push edx 2104 ;push esi 2105 ;push edi 2106 ;push ebp 2107 2108 ;mov edi, [esp+24] ; pDst 2109 ;mov edx, [esp+28] ; iDstStride 2110 ;mov esi, [esp+32] ; pSrc 2111 ;mov ecx, [esp+36] ; iSrcStride 2112 ;mov ebp, [esp+44] ; iSrcHeight 2113%ifdef X86_32 2114 push r6 2115 %assign push_num 1 2116%else 2117 %assign push_num 0 2118%endif 2119 LOAD_6_PARA 2120 PUSH_XMM 8 2121 SIGN_EXTENSION r1, r1d 2122 SIGN_EXTENSION r3, r3d 2123 SIGN_EXTENSION r4, r4d 2124 SIGN_EXTENSION r5, r5d 2125 2126%ifndef X86_32 2127 push r12 2128 mov r12, r4 2129%endif 2130 sar r5, $02 ; iSrcHeight >> 2 2131 2132 mov r6, r1 ;Save the tailer for the unasigned size 2133 imul r6, r5 2134 add r6, r0 2135 movq xmm7, [r6] 2136 2137 INIT_X86_32_PIC_NOPRESERVE r4 2138 movdqa xmm6, [pic(shufb_mask_quarter)] 2139 DEINIT_X86_32_PIC 2140 2141.yloops_quarter_sse3: 2142 ;mov eax, [esp+40] ; iSrcWidth 2143 ;sar eax, $02 ; iSrcWidth >> 2 2144 ;mov ebx, eax ; iDstWidth restored at ebx 2145 ;sar eax, $04 ; (iSrcWidth >> 2) / 16 ; loop count = num_of_mb 2146 ;neg ebx ; - (iSrcWidth >> 2) 2147%ifdef X86_32 2148 mov r4, arg5 2149%else 2150 mov r4, r12 2151%endif 2152 2153 mov r6, r0 2154 ; each loop = source bandwidth: 32 bytes 2155.xloops_quarter_sse3: 2156 ; 1st part horizonal loop: x32 bytes 2157 ; mem hi<- ->lo 2158 ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A 2159 ; xmm1: p P o O n N m M l L k K j J i I 2160 ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A 2161 ; xmm3: p P o O n N m M l 
L k K j J i I 2162 2163 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2164 movdqa xmm0, [r2] ; 1st_src_line 2165 movdqa xmm1, [r2+16] ; 1st_src_line + 16 2166 movdqa xmm2, [r2+r3] ; 2nd_src_line 2167 movdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16 2168 2169 pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A 2170 pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I 2171 pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A 2172 pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I 2173 2174 movdqa xmm4, xmm0 2175 movdqa xmm5, xmm2 2176 punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0 2177 punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4 2178 punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2 2179 punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5 2180 2181 pavgb xmm0, xmm4 2182 pavgb xmm2, xmm5 2183 pavgb xmm0, xmm2 ;average 2184 2185 ; write pDst 2186 movq [r0], xmm0 2187 2188 ; next SMB 2189 lea r2, [r2+32] 2190 lea r0, [r0+8] 2191 2192 sub r4, 32 2193 cmp r4, 0 2194 jg near .xloops_quarter_sse3 2195 2196 sub r6, r0 2197 ; next line 2198 lea r2, [r2+4*r3] ; next end of lines 2199 lea r2, [r2+4*r6] ; reset to base 0 [- 4 * iDstWidth] 2200 lea r0, [r0+r1] 2201 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 2202 2203 dec r5 2204 jg near .yloops_quarter_sse3 2205 2206 movq [r0], xmm7 ;restored the tailer for the unasigned size 2207 2208%ifndef X86_32 2209 pop r12 2210%endif 2211 2212 POP_XMM 2213 LOAD_6_PARA_POP 2214%ifdef X86_32 2215 pop r6 2216%endif 2217 ret 2218 2219;*********************************************************************** 2220; void DyadicBilinearQuarterDownsampler_sse4( unsigned char* pDst, const int iDstStride, 2221; unsigned char* pSrc, const int iSrcStride, 2222; const int iSrcWidth, const int iSrcHeight ); 2223;*********************************************************************** 2224WELS_EXTERN DyadicBilinearQuarterDownsampler_sse4 
2225%ifdef X86_32 2226 push r6 2227 %assign push_num 1 2228%else 2229 %assign push_num 0 2230%endif 2231 LOAD_6_PARA 2232 PUSH_XMM 8 2233 SIGN_EXTENSION r1, r1d 2234 SIGN_EXTENSION r3, r3d 2235 SIGN_EXTENSION r4, r4d 2236 SIGN_EXTENSION r5, r5d 2237 2238%ifndef X86_32 2239 push r12 2240 mov r12, r4 2241%endif 2242 sar r5, $02 ; iSrcHeight >> 2 2243 2244 mov r6, r1 ;Save the tailer for the unasigned size 2245 imul r6, r5 2246 add r6, r0 2247 movq xmm7, [r6] 2248 2249 INIT_X86_32_PIC_NOPRESERVE r4 2250 movdqa xmm6, [pic(shufb_mask_quarter)] ;mask 2251 DEINIT_X86_32_PIC 2252 2253.yloops_quarter_sse4: 2254%ifdef X86_32 2255 mov r4, arg5 2256%else 2257 mov r4, r12 2258%endif 2259 2260 mov r6, r0 2261 ; each loop = source bandwidth: 32 bytes 2262.xloops_quarter_sse4: 2263 ; 1st part horizonal loop: x16 bytes 2264 ; mem hi<- ->lo 2265 ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A 2266 ; xmm1: p P o O n N m M l L k K j J i I 2267 ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A 2268 ; xmm3: p P o O n N m M l L k K j J i I 2269 2270 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2271 movntdqa xmm0, [r2] ; 1st_src_line 2272 movntdqa xmm1, [r2+16] ; 1st_src_line + 16 2273 movntdqa xmm2, [r2+r3] ; 2nd_src_line 2274 movntdqa xmm3, [r2+r3+16] ; 2nd_src_line + 16 2275 2276 pshufb xmm0, xmm6 ;1st line: 0 0 0 0 g e c a 0 0 0 0 G E C A 2277 pshufb xmm1, xmm6 ;1st line: 0 0 0 0 o m k i 0 0 0 0 O M K I 2278 pshufb xmm2, xmm6 ;2nd line: 0 0 0 0 g e c a 0 0 0 0 G E C A 2279 pshufb xmm3, xmm6 ;2nd line: 0 0 0 0 o m k i 0 0 0 0 O M K I 2280 2281 movdqa xmm4, xmm0 2282 movdqa xmm5, xmm2 2283 punpckldq xmm0, xmm1 ;1st line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm0 2284 punpckhdq xmm4, xmm1 ;1st line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm4 2285 punpckldq xmm2, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 O M K I G E C A -> xmm2 2286 punpckhdq xmm5, xmm3 ;2nd line: 0 0 0 0 0 0 0 0 o m k i g e c a -> xmm5 2287 2288 pavgb xmm0, xmm4 2289 pavgb xmm2, xmm5 2290 pavgb xmm0, xmm2 ;average 2291 2292 ; write 
pDst 2293 movq [r0], xmm0 2294 2295 ; next SMB 2296 lea r2, [r2+32] 2297 lea r0, [r0+8] 2298 2299 sub r4, 32 2300 cmp r4, 0 2301 jg near .xloops_quarter_sse4 2302 2303 sub r6, r0 2304 lea r2, [r2+4*r3] ; next end of lines 2305 lea r2, [r2+4*r6] ; reset to base 0 [- 2 * iDstWidth] 2306 lea r0, [r0+r1] 2307 lea r0, [r0+r6] ; reset to base 0 [- iDstWidth] 2308 2309 dec r5 2310 jg near .yloops_quarter_sse4 2311 2312 movq [r0], xmm7 ;restore the tailer for the unasigned size 2313 2314%ifndef X86_32 2315 pop r12 2316%endif 2317 2318 POP_XMM 2319 LOAD_6_PARA_POP 2320%ifdef X86_32 2321 pop r6 2322%endif 2323 ret 2324 2325; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5 2326%macro SSE2_BilinearIncXposuw 5 2327 movdqa %5, %2 2328 paddw %2, %4 2329 paddusw %5, %4 2330 pcmpeqw %5, %2 2331 paddb %1, %3 2332 paddb %1, %5 ; subtract 1 if no carry 2333%endmacro 2334 2335; outl=%1 outh=%2 in=%3 2336%macro SSE2_UnpckXFracuw 3 2337 pcmpeqw %1, %1 2338 pxor %1, %3 2339 movdqa %2, %1 2340 punpcklwd %1, %3 2341 punpckhwd %2, %3 2342%endmacro 2343 2344; [in:xfrac out:xyfrac0]=%1 [out:xyfrac1]=%2 yfrac0=%3 yfrac1=%4 2345%macro SSE2_BilinearFastCalcXYFrac 4 2346 movdqa %2, %1 2347 pmulhuw %1, %3 2348 pmulhuw %2, %4 2349%endmacro 2350 2351; [in:dwordsl out:bytes] dwordsh=%2 zero=%3 2352%macro SSE2_BilinearFastPackDwordsToBytes 3 2353 psrld %1, 14 2354 psrld %2, 14 2355 packssdw %1, %2 2356 pavgw %1, %3 2357 packuswb %1, %1 2358%endmacro 2359 2360%macro SSSE3_BilinearFastDownsample2xOrLess_8px 0 2361 movdqa xmm_tmp0, xmm_xpos_int 2362 pshufb xmm_tmp0, xmm_0 2363 psubb xmm_xpos_int, xmm_tmp0 2364 SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac 2365 mov r_tmp0, i_xpos 2366 lea i_xpos, [i_xpos + 8 * i_scalex] 2367 shr r_tmp0, 16 2368 lddqu xmm_tmp4, [p_src_row0 + r_tmp0] 2369 pshufb xmm_tmp4, xmm_xpos_int 2370 movdqa xmm_tmp5, xmm_tmp4 2371 punpcklbw xmm_tmp4, xmm_0 2372 punpckhbw xmm_tmp5, xmm_0 2373 SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1 2374 
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp3, xmm_yfrac0, xmm_yfrac1
    pmaddwd xmm_tmp0, xmm_tmp4
    pmaddwd xmm_tmp1, xmm_tmp5
    lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
    pshufb xmm_tmp4, xmm_xpos_int   ; gather tap pairs from row 1
    movdqa xmm_tmp5, xmm_tmp4
    punpcklbw xmm_tmp4, xmm_0
    punpckhbw xmm_tmp5, xmm_0
    pmaddwd xmm_tmp2, xmm_tmp4
    pmaddwd xmm_tmp3, xmm_tmp5
    paddd xmm_tmp0, xmm_tmp2        ; row0 + row1 contributions
    paddd xmm_tmp1, xmm_tmp3
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro

; Produce 8 output pixels for scale factors <= 4x: two unaligned 16-byte loads
; per source row (4 pixels each); gather indices are built by interleaving the
; relative int positions with 80h so out-of-range pshufb lanes yield zero.
%macro SSSE3_BilinearFastDownsample4xOrLess_8px 0
    movdqa xmm_tmp0, xmm_xpos_int
    pshufb xmm_tmp0, xmm_shufb_0000000088888888 ; broadcast base pos of each 4-px half
    psubb xmm_xpos_int, xmm_tmp0    ; positions relative to each half's load base
    SSE2_UnpckXFracuw xmm_tmp0, xmm_tmp1, xmm_xpos_frac
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
    movdqa xmm_tmp2, xmm_xpos_int
    punpcklbw xmm_tmp2, xmm_db80h   ; gather mask for the low 4 pixels
    pshufb xmm_tmp3, xmm_tmp2
    pshufb xmm_tmp4, xmm_tmp2
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd xmm_tmp0, xmm_tmp3
    pmaddwd xmm_tmp2, xmm_tmp4
    paddd xmm_tmp0, xmm_tmp2
    lea r_tmp0, [i_xpos + 4 * i_scalex]
    lea i_xpos, [i_xpos + 8 * i_scalex]
    shr r_tmp0, 16
    lddqu xmm_tmp3, [p_src_row0 + r_tmp0]
    lddqu xmm_tmp4, [p_src_row1 + r_tmp0]
    movdqa xmm_tmp2, xmm_xpos_int
    punpckhbw xmm_tmp2, xmm_db80h   ; gather mask for the high 4 pixels
    pshufb xmm_tmp3, xmm_tmp2
    pshufb xmm_tmp4, xmm_tmp2
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd xmm_tmp1, xmm_tmp3
    pmaddwd xmm_tmp2, xmm_tmp4
    paddd xmm_tmp1, xmm_tmp2
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    SSE2_BilinearIncXposuw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_tmp0
%endmacro

; Produce 8 output pixels for arbitrary scale factors: scalar address per pixel,
; pairs gathered with movd/pinsrw, 4 pixels per half.
%macro SSE2_GeneralBilinearFastDownsample_8px 0
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movd xmm_tmp3, [p_src_row0 + r_tmp0]
    movd xmm_tmp4, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex            ; i_xpos was already advanced by 4*i_scalex
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw xmm_tmp3, xmm_0
    punpcklbw xmm_tmp4, xmm_0
    movdqa xmm_tmp0, xmm_xfrac0
    SSE2_BilinearFastCalcXYFrac xmm_tmp0, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd xmm_tmp0, xmm_tmp3
    pmaddwd xmm_tmp2, xmm_tmp4
    paddd xmm_tmp0, xmm_tmp2
    ; second group of 4 pixels
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movd xmm_tmp3, [p_src_row0 + r_tmp0]
    movd xmm_tmp4, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 1
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 2
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    pinsrw xmm_tmp3, [p_src_row0 + r_tmp0], 3
    pinsrw xmm_tmp4, [p_src_row1 + r_tmp0], 3
    punpcklbw xmm_tmp3, xmm_0
    punpcklbw xmm_tmp4, xmm_0
    movdqa xmm_tmp1, xmm_xfrac1
    SSE2_BilinearFastCalcXYFrac xmm_tmp1, xmm_tmp2, xmm_yfrac0, xmm_yfrac1
    pmaddwd xmm_tmp1, xmm_tmp3
    pmaddwd xmm_tmp2, xmm_tmp4
    paddd xmm_tmp1, xmm_tmp2
    SSE2_BilinearFastPackDwordsToBytes xmm_tmp0, xmm_tmp1, xmm_0
    ; --- tail of SSE2_GeneralBilinearFastDownsample_8px: store and advance ---
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    paddw xmm_xfrac0, xmm_xfrac_inc     ; step the per-pixel x fractions by 8*scalex
    paddw xmm_xfrac1, xmm_xfrac_inc
%endmacro

; Accurate-path x-position update: add the 15-bit fraction increment, use the
; sign bit as the carry into the integer part, then mask the fraction back to
; 15 bits.
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6
%macro SSE2_BilinearIncXposw 6
    pxor %6, %6
    paddw %2, %4
    pcmpgtw %6, %2                      ; -1 where the 15-bit add overflowed into the sign bit
    paddb %1, %3
    psubb %1, %6 ; add carry
    pand %2, %5                         ; drop the overflow bit, keep 15-bit fraction
%endmacro

; Interleave (1-frac, frac) word pairs for pmaddwd, 15-bit variant.
; outl=%1 outh=%2 in=%3 7FFFh=%4
%macro SSE2_UnpckXFracw 4
    movdqa %1, %3
    pxor %1, %4                         ; 7FFFh - frac == one-complement within 15 bits
    movdqa %2, %1
    punpcklwd %1, %3
    punpckhwd %2, %3
%endmacro

; Full-precision vertical blend: 32x32->64-bit products via pmuludq on the
; even/odd dword lanes, recombined with blendps so each result dword is
; (data0*frac0 + data1*frac1) >> 29.
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
%macro SSE41_LinearAccurateInterpolateVerticalDwords 6
    pshufd %1, %2, 10110001b            ; move odd dwords into even lanes
    pshufd %6, %3, 10110001b
    pmuludq %1, %4
    pmuludq %6, %5
    paddq %1, %6                        ; odd-lane 64-bit sums
    pmuludq %2, %4
    pmuludq %3, %5
    paddq %2, %3                        ; even-lane 64-bit sums
    psllq %1, 3                         ; align odd results: <<3 == >>29 into high dword
    psrlq %2, 29
    blendps %1, %2, 0101b               ; merge even (low) and odd (high) dwords
%endmacro

; Accurate bilinear, 8 output pixels, horizontal scale <= 2: one 16-byte load
; per row covers all 8 pixel pairs; xmm_xpos_int doubles as the pshufb gather
; mask after being rebased against its first byte.
%macro SSE41_BilinearAccurateDownsample2xOrLess_8px 0
    movdqa xmm_tmp0, xmm_xpos_int
    pshufb xmm_tmp0, xmm_0
    psubb xmm_xpos_int, xmm_tmp0        ; offsets relative to the block's first pel
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    mov r_tmp0, i_xpos
    lea i_xpos, [i_xpos + 8 * i_scalex]
    shr r_tmp0, 16
    lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
    pshufb xmm_tmp4, xmm_xpos_int
    movdqa xmm_tmp5, xmm_tmp4
    punpcklbw xmm_tmp4, xmm_0
    punpckhbw xmm_tmp5, xmm_0
    pmaddwd xmm_tmp4, xmm_tmp0          ; row0 horizontal blend
    pmaddwd xmm_tmp5, xmm_tmp1
    lddqu xmm_tmp2, [p_src_row1 + r_tmp0]
    pshufb xmm_tmp2, xmm_xpos_int
    movdqa xmm_tmp3, xmm_tmp2
    punpcklbw xmm_tmp2, xmm_0
    punpckhbw xmm_tmp3, xmm_0
    pmaddwd xmm_tmp2, xmm_tmp0          ; row1 horizontal blend
    pmaddwd xmm_tmp3, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp5, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw xmm_tmp0, xmm_tmp1
    pavgw xmm_tmp0, xmm_0               ; (+1)>>1: final rounding of the spare bit
    packuswb xmm_tmp0, xmm_tmp0
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro

; Accurate bilinear, 8 output pixels, horizontal scale in (2, 4]: two loads
; per row, low/high halves of xmm_xpos_int select the pairs for pixels 0-3
; and 4-7 respectively.
%macro SSE41_BilinearAccurateDownsample4xOrLess_8px 0
    movdqa xmm_tmp0, xmm_xpos_int
    pshufb xmm_tmp0, xmm_shufb_0000000088888888
    psubb xmm_xpos_int, xmm_tmp0
    SSE2_UnpckXFracw xmm_tmp0, xmm_tmp1, xmm_xpos_frac, xmm_7fff
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movdqa xmm_tmp3, xmm_xpos_int
    punpcklbw xmm_tmp3, xmm_db80h       ; 80h lanes zero unwanted bytes in pshufb
    lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu xmm_tmp2, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex]
    lea i_xpos, [i_xpos + 8 * i_scalex]
    shr r_tmp0, 16
    pshufb xmm_tmp4, xmm_tmp3
    pshufb xmm_tmp2, xmm_tmp3
    pmaddwd xmm_tmp4, xmm_tmp0
    pmaddwd xmm_tmp2, xmm_tmp0
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    movdqa xmm_tmp2, xmm_xpos_int
    punpckhbw xmm_tmp2, xmm_db80h
    lddqu xmm_tmp4, [p_src_row0 + r_tmp0]
    lddqu xmm_tmp3, [p_src_row1 + r_tmp0]
    pshufb xmm_tmp4, xmm_tmp2
    pshufb xmm_tmp3, xmm_tmp2
    pmaddwd xmm_tmp4, xmm_tmp1
    pmaddwd xmm_tmp3, xmm_tmp1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw xmm_tmp0, xmm_tmp1
    pavgw xmm_tmp0, xmm_0
    packuswb xmm_tmp0, xmm_tmp0
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    SSE2_BilinearIncXposw xmm_xpos_int, xmm_xpos_frac, xmm_xpos_int_inc, xmm_xpos_frac_inc, xmm_7fff, xmm_tmp0
%endmacro

; Accurate general path (scale > 4): per-pixel scalar gathers, 15-bit x
; fractions kept in xmm_xfrac0/1 and wrapped with xmm_7fff after each step.
%macro SSE41_GeneralBilinearAccurateDownsample_8px 0
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movd xmm_tmp4, [p_src_row0 + r_tmp0]
    movd xmm_tmp2, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
    pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
    pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw xmm_tmp2, [p_src_row1 + r_tmp0], 3
    punpcklbw xmm_tmp4, xmm_0
    punpcklbw xmm_tmp2, xmm_0
    pmaddwd xmm_tmp4, xmm_xfrac0
    pmaddwd xmm_tmp2, xmm_xfrac0
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp0, xmm_tmp4, xmm_tmp2, xmm_yfrac0, xmm_yfrac1, xmm_tmp3
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movd xmm_tmp4, [p_src_row0 + r_tmp0]
    movd xmm_tmp3, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 1
    pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 2
    pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    pinsrw xmm_tmp4, [p_src_row0 + r_tmp0], 3
    pinsrw xmm_tmp3, [p_src_row1 + r_tmp0], 3
    punpcklbw xmm_tmp4, xmm_0
    punpcklbw xmm_tmp3, xmm_0
    pmaddwd xmm_tmp4, xmm_xfrac1
    pmaddwd xmm_tmp3, xmm_xfrac1
    SSE41_LinearAccurateInterpolateVerticalDwords xmm_tmp1, xmm_tmp4, xmm_tmp3, xmm_yfrac0, xmm_yfrac1, xmm_tmp2
    packssdw xmm_tmp0, xmm_tmp1
    pavgw xmm_tmp0, xmm_0
    packuswb xmm_tmp0, xmm_tmp0
    movlps [p_dst], xmm_tmp0
    add p_dst, 8
    paddw xmm_xfrac0, xmm_xfrac_inc
    paddw xmm_xfrac1, xmm_xfrac_inc
    pand xmm_xfrac0, xmm_7fff           ; keep fractions in 15 bits
    pand xmm_xfrac1, xmm_7fff
%endmacro

; Outer scan loop shared by the fast and accurate scalers.  Per output row it
; computes the two source rows (i_ypos is .15 fixed point), derives the
; vertical fractions (word-sized for the fast path, dword for accurate),
; runs the 8-pixel inner macro, and patches the last pixel of the row with a
; point sample since the vector loop may overshoot dst_width-1.
; downsample_8px_macro=%1 b_fast=%2
%macro SSE2_GeneralBilinearDownsampler_loop 2
%%height:
    mov p_src_row0, i_ypos
    shr p_src_row0, 15                  ; integer source row
    imul p_src_row0, i_src_stride
    add p_src_row0, p_src
    mov p_src_row1, p_src_row0
    add p_src_row1, i_src_stride
    movd xmm_tmp1, i_yposd
%if %2
    pshuflw xmm_tmp1, xmm_tmp1, 0
    psllw xmm_tmp1, 1                   ; keep low 15 bits as a word fraction
    psrlw xmm_tmp1, 1
%else
    pslld xmm_tmp1, 17                  ; keep low 15 bits as a dword fraction
    psrld xmm_tmp1, 17
%endif
%ifdef X86_32
    pshufd xmm_tmp1, xmm_tmp1, 0
    pcmpeqw xmm_tmp0, xmm_tmp0
%if %2
    psrlw xmm_tmp0, 1                   ; 7FFFh per word
%else
    psrld xmm_tmp0, 17                  ; 7FFFh per dword
%endif
    pxor xmm_tmp0, xmm_tmp1             ; yfrac0 = 7FFFh - yfrac1
    movdqa xmm_yfrac0, xmm_tmp0
    movdqa xmm_yfrac1, xmm_tmp1
%else
    pshufd xmm_yfrac1, xmm_tmp1, 0
    pcmpeqw xmm_yfrac0, xmm_yfrac0
%if %2
    psrlw xmm_yfrac0, 1
%else
    psrld xmm_yfrac0, 17
%endif
    pxor xmm_yfrac0, xmm_yfrac1
%endif

    mov i_xpos, 1 << 15                 ; start at x = 0.5 (16.16 fixed point)
    mov i_width_cnt, i_dst_width
    sub i_width_cnt, 1                  ; last pixel handled separately below

%ifdef xmm_xpos_int
    movdqa xmm_xpos_int, xmm_xpos_int_begin
    movdqa xmm_xpos_frac, xmm_xpos_frac_begin
%else
    movdqa xmm_xfrac0, xmm_xfrac0_begin
    movdqa xmm_xfrac1, xmm_xfrac1_begin
%endif

%%width:
    %1                                  ; expand the 8-pixel downsample macro
    sub i_width_cnt, 8
    jg %%width

    ; rewind the overshoot and point-sample the final pixel of the row
    lea p_dst, [p_dst + i_width_cnt + 1]
    imul i_width_cnt, i_scalex
    add i_xpos, i_width_cnt
    shr i_xpos, 16
    movzx r_tmp0, byte [p_src_row0 + i_xpos]
    mov [p_dst - 1], r_tmp0b
%ifdef X86_32
    mov r_tmp0, i_scaleyd
    add i_yposd, r_tmp0
%else
    add i_yposd, i_scaleyd
%endif
    add p_dst, i_dst_stride_less_width
    sub i_dst_height, 1
    jg %%height
%endmacro

;**************************************************************************************************************
;void GeneralBilinearFastDownsampler_ssse3 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;                int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;                uint32_t uiScaleY);
;
;**************************************************************************************************************

; Fast (reduced-precision) bilinear downscaler.  Scale factors are 16.16
; fixed point; dispatches on uiScaleX to a <=2x, <=4x or general inner loop.
; Register/stack symbol maps differ per ABI and are torn down with %undef
; at the end so the accurate variant below can redefine them.
WELS_EXTERN GeneralBilinearFastDownsampler_ssse3
    %assign push_num 0
%ifndef X86_32
    push r12
    push r13
    push rbx
    push rbp
    %assign push_num 4
%ifdef WIN64
    push rdi
    push rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    ZERO_EXTENSION r6d
    sub r1, r2 ; dst_stride - dst_width
%ifdef X86_32
    ; 32-bit: too few registers, so parameters and several xmm constants are
    ; spilled to an aligned scratch area; [esp] holds the original esp.
    movd xmm0, arg8
    movd xmm1, esp
    and esp, -16
%ifdef X86_32_PICASM
    sub esp, 8 * 4 + 9 * 16             ; PIC build also materializes two masks
%else
    sub esp, 8 * 4 + 7 * 16
%endif
    movd [esp], xmm1
    %define p_dst r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width [esp + 2 * 4]
    %define i_dst_height dword [esp + 3 * 4]
    %define p_src [esp + 4 * 4]
    %define i_src_stride [esp + 5 * 4]
    %define i_scalex r6
    %define i_scalexd r6d
    %define i_scaleyd [esp + 6 * 4]
    %define i_xpos r2
    %define i_ypos dword [esp + 7 * 4]
    %define i_yposd dword [esp + 7 * 4]
    %define p_src_row0 r3
    %define p_src_row1 r4
    %define i_width_cnt r5
    %define r_tmp0 r1
    %define r_tmp0b r1b
    %define xmm_xpos_frac xmm1
    %define xmm_xpos_frac_inc [esp + 8 * 4]
    %define xmm_xpos_int xmm3
    %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16]
    %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16]
    %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16]
    %define xmm_tmp0 xmm7
    %define xmm_tmp1 xmm0
    %define xmm_tmp2 xmm2
    %define xmm_tmp3 xmm4
    %define xmm_tmp4 xmm5
    %define xmm_tmp5 xmm6
    %define xmm_0 [esp + 8 * 4 + 4 * 16]
    %define xmm_xpos_int_begin [esp + 8 * 4 + 5 * 16]
    %define xmm_xpos_frac_begin [esp + 8 * 4 + 6 * 16]
%ifdef X86_32_PICASM
    ; PIC: build db80h and 0000000088888888 shuffle constants in registers
    ; instead of referencing .rodata through a relocation.
    %define xmm_db80h [esp + 8 * 4 + 7 * 16]
    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 16]
    pxor xmm_tmp4, xmm_tmp4
    pcmpeqb xmm_tmp5, xmm_tmp5
    psubb xmm_tmp4, xmm_tmp5            ; all bytes = 01h
    movdqa xmm_tmp3, xmm_tmp4
    psllw xmm_tmp3, 3                   ; words = 0808h
    pslldq xmm_tmp3, 8                  ; low 8 bytes 0, high 8 bytes 08h
    movdqa xmm_shufb_0000000088888888, xmm_tmp3
    psllw xmm_tmp4, 7                   ; all bytes = 80h
    movdqa xmm_db80h, xmm_tmp4
%else
    %define xmm_db80h [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
%endif
    mov i_dst_stride_less_width, r1
    mov i_dst_width, r2
    mov i_dst_height, r3
    mov p_src, r4
    mov i_src_stride, r5
    movd i_scaleyd, xmm0
    pxor xmm_tmp0, xmm_tmp0
    movdqa xmm_0, xmm_tmp0
%else
    %define p_dst r0
    %define i_dst_stride_less_width r1
    %define i_dst_width r2
    %define i_dst_height r3
    %define p_src r4
    %define i_src_stride r5
    %define i_scalex r6
    %define i_scalexd r6d
    %define i_scaleyd dword arg8d
    %define i_xpos r12
    %define i_ypos r13
    %define i_yposd r13d
    %define p_src_row0 rbp
%ifdef WIN64
    %define p_src_row1 rsi
    %define i_width_cnt rdi
%else
    %define p_src_row1 r11
    %define i_width_cnt rax
%endif
    %define r_tmp0 rbx
    %define r_tmp0b bl
    %define xmm_0 xmm0
    %define xmm_xpos_frac xmm1
    %define xmm_xpos_frac_inc xmm8
    %define xmm_xpos_int xmm3
    %define xmm_xpos_int_inc xmm10
    %define xmm_yfrac0 xmm11
    %define xmm_yfrac1 xmm12
    %define xmm_tmp0 xmm7
    %define xmm_tmp1 xmm2
    %define xmm_tmp2 xmm9
    %define xmm_tmp3 xmm4
    %define xmm_tmp4 xmm5
    %define xmm_tmp5 xmm6
    %define xmm_xpos_int_begin xmm14
    %define xmm_xpos_frac_begin xmm15
    %define xmm_db80h [db80h_256]
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    pxor xmm_0, xmm_0
%endif

    sub i_dst_height, 1
    je .final_row                       ; height == 1: point-sample only
    jl .done                            ; height <= 0: nothing to do

    ; Build the initial per-lane x positions: lane k = (2k+1)/2 * scalex,
    ; split into packed integer parts (xmm_xpos_int, bytes) and fractional
    ; parts (xmm_xpos_frac, words), plus their per-8-pixel increments.
    mov i_ypos, 1 << 14                 ; y starts at 0.5 in .15 fixed point
    movd xmm_xpos_frac, i_scalexd
    pshufd xmm_xpos_frac, xmm_xpos_frac, 0
    movdqa xmm_tmp0, xmm_xpos_frac
    pslld xmm_tmp0, 2
    pslldq xmm_xpos_frac, 4
    paddd xmm_tmp0, xmm_xpos_frac
    movdqa xmm_tmp1, xmm_xpos_frac
    pslldq xmm_tmp1, 4
    paddd xmm_xpos_frac, xmm_tmp1
    paddd xmm_tmp0, xmm_tmp1
    pslldq xmm_tmp1, 4
    paddd xmm_xpos_frac, xmm_tmp1
    paddd xmm_tmp0, xmm_tmp1            ; lanes now hold k*scalex, k=0..7
    pcmpeqw xmm_tmp1, xmm_tmp1
    psrld xmm_tmp1, 31
    pslld xmm_tmp1, 15                  ; +0.5 (1<<15) per lane
    paddd xmm_xpos_frac, xmm_tmp1
    paddd xmm_tmp0, xmm_tmp1
    movdqa xmm_xpos_int, xmm_xpos_frac
    movdqa xmm_tmp1, xmm_tmp0
    psrld xmm_xpos_int, 16
    psrld xmm_tmp1, 16
    packssdw xmm_xpos_int, xmm_tmp1
    packuswb xmm_xpos_int, xmm_xpos_int
    movdqa xmm_tmp1, xmm_xpos_int
    pcmpeqw xmm_tmp2, xmm_tmp2
    psubb xmm_tmp1, xmm_tmp2            ; int+1: offset of the right-hand pixel
    punpcklbw xmm_xpos_int, xmm_tmp1    ; interleave pair offsets for pshufb
    pslld xmm_xpos_frac, 16
    pslld xmm_tmp0, 16
    psrad xmm_xpos_frac, 16
    psrad xmm_tmp0, 16
    packssdw xmm_xpos_frac, xmm_tmp0    ; keep only the 16-bit fractions
    movd xmm_tmp0, i_scalexd
    pslld xmm_tmp0, 3                   ; per-iteration step = 8 * scalex
    movdqa xmm_tmp1, xmm_tmp0
    punpcklwd xmm_tmp0, xmm_tmp0
    pshufd xmm_tmp0, xmm_tmp0, 0
    movdqa xmm_xpos_frac_inc, xmm_tmp0
    psrld xmm_tmp1, 16
    psubw xmm_tmp1, xmm_tmp2            ; int increment + 1 (carry handled in IncXposuw)
    pxor xmm_tmp2, xmm_tmp2
    pshufb xmm_tmp1, xmm_tmp2           ; broadcast low byte
    movdqa xmm_xpos_int_inc, xmm_tmp1
    movdqa xmm_xpos_int_begin, xmm_xpos_int
    movdqa xmm_xpos_frac_begin, xmm_xpos_frac

    ; Dispatch on the horizontal scale factor.
    cmp i_scalex, 4 << 16
    ja .scalex_above4
    cmp i_scalex, 2 << 16
    ja .scalex_above2_beloweq4
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample2xOrLess_8px, 1
    jmp .final_row
%ifdef X86_32
    ; From here on (4x/general paths) xmm_tmp5 is repurposed to hold yfrac0
    ; in a register on 32-bit builds.
    %undef xmm_yfrac0
    %xdefine xmm_yfrac0 xmm_tmp5
    %undef xmm_tmp5
%endif
.scalex_above2_beloweq4:
    SSE2_GeneralBilinearDownsampler_loop SSSE3_BilinearFastDownsample4xOrLess_8px, 1
    jmp .final_row
.scalex_above4:
    ; General path: x-position vectors are no longer tracked; their registers
    ; are retargeted as interleaved x-fraction pairs for pmaddwd.
%xdefine xmm_xfrac0 xmm_xpos_frac
%xdefine xmm_xfrac1 xmm_xpos_int
%xdefine xmm_xfrac0_begin xmm_xpos_int_begin
%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin
%xdefine xmm_xfrac_inc xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_frac
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xpos_int_inc
%undef xmm_xpos_frac_inc
    SSE2_UnpckXFracuw xmm_tmp0, xmm_xfrac1, xmm_xfrac0
    movdqa xmm_xfrac0, xmm_tmp0
    movdqa xmm_xfrac0_begin, xmm_xfrac0
    movdqa xmm_xfrac1_begin, xmm_xfrac1
    pcmpeqw xmm_tmp0, xmm_tmp0
    pmullw xmm_tmp0, xmm_xfrac_inc      ; -inc for the (1-frac) words
    punpcklwd xmm_tmp0, xmm_xfrac_inc
    movdqa xmm_xfrac_inc, xmm_tmp0
    SSE2_GeneralBilinearDownsampler_loop SSE2_GeneralBilinearFastDownsample_8px, 1

.final_row:
    ; Last output row: nearest (point) sampling along x from the final source row.
    mov p_src_row0, i_ypos
    shr p_src_row0, 15
    imul p_src_row0, i_src_stride
    add p_src_row0, p_src
    mov i_xpos, 1 << 15
    mov i_width_cnt, i_dst_width

.final_row_width:
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movzx r_tmp0, byte [p_src_row0 + r_tmp0]
    mov [p_dst], r_tmp0b
    add p_dst, 1
    add i_xpos, i_scalex
    sub i_width_cnt, 1
    jg .final_row_width

.done:
%ifdef X86_32
    mov esp, [esp]                      ; restore pre-alignment stack pointer
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop rsi
    pop rdi
%endif
    pop rbp
    pop rbx
    pop r13
    pop r12
%endif
    ret
; Tear down the symbol map so the next function can redefine it.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef xmm_0
%undef xmm_xpos_frac
%undef xmm_xpos_frac_inc
%undef xmm_xpos_int
%undef xmm_xpos_int_inc
%undef xmm_yfrac0
%undef xmm_yfrac1
%undef xmm_tmp0
%undef xmm_tmp1
%undef xmm_tmp2
%undef xmm_tmp3
%undef xmm_tmp4
%undef xmm_tmp5
%undef xmm_xpos_int_begin
%undef xmm_xpos_frac_begin
%undef xmm_xfrac0
%undef xmm_xfrac1
%undef xmm_xfrac0_begin
%undef xmm_xfrac1_begin
%undef xmm_xfrac_inc
%undef xmm_db80h
%undef xmm_shufb_0000000088888888

;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_sse41 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;                int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;                uint32_t uiScaleY);
;
;**************************************************************************************************************

; Accurate bilinear downscaler (full-precision vertical blend via pmuludq).
; Same structure as the fast variant above; note scalex is doubled up front
; because this path works with 15-bit x fractions.
WELS_EXTERN GeneralBilinearAccurateDownsampler_sse41
    %assign push_num 0
%ifndef X86_32
    push r12
    push r13
    push rbx
    push rbp
    %assign push_num 4
%ifdef WIN64
    push rdi
    push rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    ZERO_EXTENSION r6d
    sub r1, r2 ; dst_stride - dst_width
    add r6, r6 ; 2 * scalex
%ifdef X86_32
    movd xmm0, arg8
    movd xmm1, esp
    and esp, -16
%ifdef X86_32_PICASM
    sub esp, 8 * 4 + 10 * 16
%else
    sub esp, 8 * 4 + 8 * 16
%endif
    movd [esp], xmm1
    %define p_dst r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width [esp + 2 * 4]
    %define i_dst_height dword [esp + 3 * 4]
    %define p_src [esp + 4 * 4]
    %define i_src_stride [esp + 5 * 4]
    %define i_scalex r6
    %define i_scalexd r6d
    %define i_scaleyd [esp + 6 * 4]
    %define i_xpos r2
    %define i_ypos dword [esp + 7 * 4]
    %define i_yposd dword [esp + 7 * 4]
    %define p_src_row0 r3
    %define p_src_row1 r4
    %define i_width_cnt r5
    %define r_tmp0 r1
    %define r_tmp0b r1b
    %define xmm_xpos_frac xmm1
    %define xmm_xpos_frac_inc [esp + 8 * 4]
    %define xmm_xpos_int xmm3
3081 %define xmm_xpos_int_inc [esp + 8 * 4 + 1 * 16] 3082 %define xmm_yfrac0 [esp + 8 * 4 + 2 * 16] 3083 %define xmm_yfrac1 [esp + 8 * 4 + 3 * 16] 3084 %define xmm_tmp0 xmm7 3085 %define xmm_tmp1 xmm0 3086 %define xmm_tmp2 xmm2 3087 %define xmm_tmp3 xmm4 3088 %define xmm_tmp4 xmm5 3089 %define xmm_tmp5 xmm6 3090 %define xmm_0 [esp + 8 * 4 + 4 * 16] 3091 %define xmm_7fff [esp + 8 * 4 + 5 * 16] 3092 %define xmm_xpos_int_begin [esp + 8 * 4 + 6 * 16] 3093 %define xmm_xpos_frac_begin [esp + 8 * 4 + 7 * 16] 3094%ifdef X86_32_PICASM 3095 %define xmm_db80h [esp + 8 * 4 + 8 * 16] 3096 %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 16] 3097 pxor xmm_tmp4, xmm_tmp4 3098 pcmpeqb xmm_tmp5, xmm_tmp5 3099 psubb xmm_tmp4, xmm_tmp5 3100 movdqa xmm_tmp3, xmm_tmp4 3101 psllw xmm_tmp3, 3 3102 pslldq xmm_tmp3, 8 3103 movdqa xmm_shufb_0000000088888888, xmm_tmp3 3104 psllw xmm_tmp4, 7 3105 movdqa xmm_db80h, xmm_tmp4 3106%else 3107 %define xmm_db80h [db80h_256] 3108 %define xmm_shufb_0000000088888888 [shufb_0000000088888888] 3109%endif 3110 mov i_dst_stride_less_width, r1 3111 mov i_dst_width, r2 3112 mov i_dst_height, r3 3113 mov p_src, r4 3114 mov i_src_stride, r5 3115 movd i_scaleyd, xmm0 3116 pxor xmm_tmp5, xmm_tmp5 3117 movdqa xmm_0, xmm_tmp5 3118 pcmpeqw xmm_tmp5, xmm_tmp5 3119 psrlw xmm_tmp5, 1 3120 movdqa xmm_7fff, xmm_tmp5 3121%else 3122 %define p_dst r0 3123 %define i_dst_stride_less_width r1 3124 %define i_dst_width r2 3125 %define i_dst_height r3 3126 %define p_src r4 3127 %define i_src_stride r5 3128 %define i_scalex r6 3129 %define i_scalexd r6d 3130 %define i_scaleyd dword arg8d 3131 %define i_xpos r12 3132 %define i_ypos r13 3133 %define i_yposd r13d 3134 %define p_src_row0 rbp 3135%ifdef WIN64 3136 %define p_src_row1 rsi 3137 %define i_width_cnt rdi 3138%else 3139 %define p_src_row1 r11 3140 %define i_width_cnt rax 3141%endif 3142 %define r_tmp0 rbx 3143 %define r_tmp0b bl 3144 %define xmm_0 xmm0 3145 %define xmm_xpos_frac xmm1 3146 %define xmm_xpos_frac_inc xmm8 
3147 %define xmm_xpos_int xmm3 3148 %define xmm_xpos_int_inc xmm10 3149 %define xmm_yfrac0 xmm11 3150 %define xmm_yfrac1 xmm12 3151 %define xmm_tmp0 xmm7 3152 %define xmm_tmp1 xmm2 3153 %define xmm_tmp2 xmm9 3154 %define xmm_tmp3 xmm4 3155 %define xmm_tmp4 xmm5 3156 %define xmm_tmp5 xmm6 3157 %define xmm_7fff xmm13 3158 %define xmm_xpos_int_begin xmm14 3159 %define xmm_xpos_frac_begin xmm15 3160 %define xmm_db80h [db80h_256] 3161 %define xmm_shufb_0000000088888888 [shufb_0000000088888888] 3162 pxor xmm_0, xmm_0 3163 pcmpeqw xmm_7fff, xmm_7fff 3164 psrlw xmm_7fff, 1 3165%endif 3166 3167 sub i_dst_height, 1 3168 je .final_row 3169 jl .done 3170 3171 mov i_ypos, 1 << 14 3172 movd xmm_xpos_frac, i_scalexd 3173 pshufd xmm_xpos_frac, xmm_xpos_frac, 0 3174 movdqa xmm_tmp0, xmm_xpos_frac 3175 pslld xmm_tmp0, 2 3176 pslldq xmm_xpos_frac, 4 3177 paddd xmm_tmp0, xmm_xpos_frac 3178 movdqa xmm_tmp1, xmm_xpos_frac 3179 pslldq xmm_tmp1, 4 3180 paddd xmm_xpos_frac, xmm_tmp1 3181 paddd xmm_tmp0, xmm_tmp1 3182 pslldq xmm_tmp1, 4 3183 paddd xmm_xpos_frac, xmm_tmp1 3184 paddd xmm_tmp0, xmm_tmp1 3185 pcmpeqw xmm_tmp1, xmm_tmp1 3186 psrld xmm_tmp1, 31 3187 pslld xmm_tmp1, 15 3188 paddd xmm_xpos_frac, xmm_tmp1 3189 paddd xmm_tmp0, xmm_tmp1 3190 movdqa xmm_xpos_int, xmm_xpos_frac 3191 movdqa xmm_tmp1, xmm_tmp0 3192 psrld xmm_xpos_int, 16 3193 psrld xmm_tmp1, 16 3194 packssdw xmm_xpos_int, xmm_tmp1 3195 packuswb xmm_xpos_int, xmm_xpos_int 3196 movdqa xmm_tmp1, xmm_xpos_int 3197 pcmpeqw xmm_tmp2, xmm_tmp2 3198 psubb xmm_tmp1, xmm_tmp2 3199 punpcklbw xmm_xpos_int, xmm_tmp1 3200 pslld xmm_xpos_frac, 16 3201 pslld xmm_tmp0, 16 3202 psrad xmm_xpos_frac, 16 3203 psrad xmm_tmp0, 16 3204 packssdw xmm_xpos_frac, xmm_tmp0 3205 psrlw xmm_xpos_frac, 1 3206 movd xmm_tmp0, i_scalexd 3207 pslld xmm_tmp0, 3 3208 movdqa xmm_tmp1, xmm_tmp0 3209 punpcklwd xmm_tmp0, xmm_tmp0 3210 pshufd xmm_tmp0, xmm_tmp0, 0 3211 psrlw xmm_tmp0, 1 3212 movdqa xmm_xpos_frac_inc, xmm_tmp0 3213 psrld xmm_tmp1, 16 3214 pxor 
xmm_tmp2, xmm_tmp2 3215 pshufb xmm_tmp1, xmm_tmp2 3216 movdqa xmm_xpos_int_inc, xmm_tmp1 3217 movdqa xmm_xpos_int_begin, xmm_xpos_int 3218 movdqa xmm_xpos_frac_begin, xmm_xpos_frac 3219 3220 cmp i_scalex, 4 << 16 3221 ja .scalex_above4 3222 cmp i_scalex, 2 << 16 3223 ja .scalex_above2_beloweq4 3224 SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample2xOrLess_8px, 0 3225 jmp .final_row 3226%ifdef X86_32 3227 %undef xmm_yfrac0 3228 %xdefine xmm_yfrac0 xmm_tmp5 3229 %undef xmm_tmp5 3230%endif 3231.scalex_above2_beloweq4: 3232 SSE2_GeneralBilinearDownsampler_loop SSE41_BilinearAccurateDownsample4xOrLess_8px, 0 3233 jmp .final_row 3234.scalex_above4: 3235%xdefine xmm_xfrac0 xmm_xpos_frac 3236%xdefine xmm_xfrac1 xmm_xpos_int 3237%xdefine xmm_xfrac0_begin xmm_xpos_int_begin 3238%xdefine xmm_xfrac1_begin xmm_xpos_frac_begin 3239%xdefine xmm_xfrac_inc xmm_xpos_frac_inc 3240%undef xmm_xpos_int 3241%undef xmm_xpos_frac 3242%undef xmm_xpos_int_begin 3243%undef xmm_xpos_frac_begin 3244%undef xmm_xpos_int_inc 3245%undef xmm_xpos_frac_inc 3246 SSE2_UnpckXFracw xmm_tmp0, xmm_xfrac1, xmm_xfrac0, xmm_7fff 3247 movdqa xmm_xfrac0, xmm_tmp0 3248 movdqa xmm_xfrac0_begin, xmm_xfrac0 3249 movdqa xmm_xfrac1_begin, xmm_xfrac1 3250 pcmpeqw xmm_tmp0, xmm_tmp0 3251 pmullw xmm_tmp0, xmm_xfrac_inc 3252 punpcklwd xmm_tmp0, xmm_xfrac_inc 3253 movdqa xmm_xfrac_inc, xmm_tmp0 3254 SSE2_GeneralBilinearDownsampler_loop SSE41_GeneralBilinearAccurateDownsample_8px, 0 3255 3256.final_row: 3257 mov p_src_row0, i_ypos 3258 shr p_src_row0, 15 3259 imul p_src_row0, i_src_stride 3260 add p_src_row0, p_src 3261 mov i_xpos, 1 << 15 3262 mov i_width_cnt, i_dst_width 3263 3264.final_row_width: 3265 mov r_tmp0, i_xpos 3266 shr r_tmp0, 16 3267 movzx r_tmp0, byte [p_src_row0 + r_tmp0] 3268 mov [p_dst], r_tmp0b 3269 add p_dst, 1 3270 add i_xpos, i_scalex 3271 sub i_width_cnt, 1 3272 jg .final_row_width 3273 3274.done: 3275%ifdef X86_32 3276 mov esp, [esp] 3277%endif 3278 POP_XMM 3279 LOAD_7_PARA_POP 
3280%ifndef X86_32 3281%ifdef WIN64 3282 pop rsi 3283 pop rdi 3284%endif 3285 pop rbp 3286 pop rbx 3287 pop r13 3288 pop r12 3289%endif 3290 ret 3291%undef p_dst 3292%undef i_dst_stride_less_width 3293%undef i_dst_width 3294%undef i_dst_height 3295%undef p_src 3296%undef i_src_stride 3297%undef i_scalex 3298%undef i_scalexd 3299%undef i_scaleyd 3300%undef i_xpos 3301%undef i_ypos 3302%undef i_yposd 3303%undef p_src_row0 3304%undef p_src_row1 3305%undef i_width_cnt 3306%undef r_tmp0 3307%undef r_tmp0b 3308%undef xmm_0 3309%undef xmm_xpos_frac 3310%undef xmm_xpos_frac_inc 3311%undef xmm_xpos_int 3312%undef xmm_xpos_int_inc 3313%undef xmm_yfrac0 3314%undef xmm_yfrac1 3315%undef xmm_tmp0 3316%undef xmm_tmp1 3317%undef xmm_tmp2 3318%undef xmm_tmp3 3319%undef xmm_tmp4 3320%undef xmm_tmp5 3321%undef xmm_7fff 3322%undef xmm_xpos_int_begin 3323%undef xmm_xpos_frac_begin 3324%undef xmm_xfrac0 3325%undef xmm_xfrac1 3326%undef xmm_xfrac0_begin 3327%undef xmm_xfrac1_begin 3328%undef xmm_xfrac_inc 3329%undef xmm_db80h 3330%undef xmm_shufb_0000000088888888 3331 3332%ifdef HAVE_AVX2 3333; xpos_int=%1 xpos_frac=%2 inc_int+1=%3 inc_frac=%4 tmp=%5 3334%macro AVX2_BilinearIncXposuw 5 3335 vpaddusw %5, %2, %4 3336 vpaddw %2, %2, %4 3337 vpcmpeqw %5, %5, %2 3338 vpaddb %1, %1, %3 3339 vpaddb %1, %1, %5 ; subtract 1 if no carry 3340%endmacro 3341 3342; outl=%1 outh=%2 in=%3 FFFFh/7FFFh=%4 3343%macro AVX2_UnpckXFrac 4 3344 vpxor %1, %3, %4 3345 vpunpckhwd %2, %1, %3 3346 vpunpcklwd %1, %1, %3 3347%endmacro 3348 3349; out0=%1 out1=%2 xfrac=%3 yfrac0=%4 yfrac1=%5 3350%macro AVX2_BilinearFastCalcXYFrac 5 3351 vpmulhuw %2, %3, %5 3352 vpmulhuw %1, %3, %4 3353%endmacro 3354 3355; [in:dwordsl out:bytes] dwordsh=%2 zero=%3 3356%macro AVX2_BilinearFastPackDwordsToBytes 3 3357 vpsrld %1, %1, 14 3358 vpsrld %2, %2, 14 3359 vpackssdw %1, %1, %2 3360 vpavgw %1, %1, %3 3361 vpackuswb %1, %1, %1 3362%endmacro 3363 3364%macro AVX2_BilinearFastDownsample2xOrLess_16px 0 3365 vpshufb ymm_tmp0, 
ymm_xpos_int, ymm_0 3366 vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 3367 AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff 3368 mov r_tmp0, i_xpos 3369 shr r_tmp0, 16 3370 vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] 3371 vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] 3372 lea r_tmp0, [i_xpos + 4 * i_scalex2] 3373 lea i_xpos, [i_xpos + 8 * i_scalex2] 3374 shr r_tmp0, 16 3375 vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 3376 vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 3377 vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int 3378 vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int 3379 AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 3380 vpunpcklbw ymm_tmp3, ymm_tmp4, ymm_0 3381 vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp3 3382 vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0 3383 vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 3384 vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 3385 AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1 3386 vpunpckhbw ymm_tmp2, ymm_tmp4, ymm_0 3387 vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp2 3388 vpunpckhbw ymm_tmp2, ymm_tmp5, ymm_0 3389 vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp2 3390 vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 3391 AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 3392 vmovlps [p_dst], xmm_tmp0 3393 vextracti128 [p_dst + 8], ymm_tmp0, 1 3394 add p_dst, 16 3395 AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0 3396%endmacro 3397 3398%macro AVX2_BilinearFastDownsample4xOrLess_16px 0 3399 vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888 3400 vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 3401 vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 3402 AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff 3403 mov r_tmp0, i_xpos 3404 shr r_tmp0, 16 3405 vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] 3406 vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] 3407 lea r_tmp0, [i_xpos + 4 * i_scalex2] 3408 shr r_tmp0, 16 3409 vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 3410 
vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 3411 lea r_tmp0, [i_xpos + 2 * i_scalex2] 3412 lea i_xpos, [r_tmp0 + 4 * i_scalex2] 3413 shr r_tmp0, 16 3414 vpunpcklbw ymm_tmp2, ymm_xpos_int, ymm_ffff 3415 vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2 3416 vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2 3417 AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 3418 vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4 3419 vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 3420 vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 3421 vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] 3422 vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] 3423 mov r_tmp0, i_xpos 3424 lea i_xpos, [i_xpos + 2 * i_scalex2] 3425 shr r_tmp0, 16 3426 vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 3427 vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 3428 vpunpckhbw ymm_tmp2, ymm_xpos_int, ymm_ffff 3429 vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp2 3430 vpshufb ymm_tmp3, ymm_tmp3, ymm_tmp2 3431 AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_tmp1, ymm_yfrac0, ymm_yfrac1 3432 vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4 3433 vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp3 3434 vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2 3435 AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0 3436 vmovlps [p_dst], xmm_tmp0 3437 vextracti128 [p_dst + 8], ymm_tmp0, 1 3438 add p_dst, 16 3439 AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0 3440%endmacro 3441 3442%macro AVX2_BilinearFastDownsample8xOrLess_16px 0 3443 vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC 3444 vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0 3445 vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0 3446 mov r_tmp0, i_xpos 3447 shr r_tmp0, 16 3448 vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] 3449 vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] 3450 lea r_tmp0, [i_xpos + 4 * i_scalex2] 3451 add i_xpos, i_scalex2 3452 shr r_tmp0, 16 3453 vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 3454 vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 3455 mov 
r_tmp0, i_xpos 3456 shr r_tmp0, 16 3457 vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0] 3458 vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0] 3459 lea r_tmp0, [i_xpos + 4 * i_scalex2] 3460 add i_xpos, i_scalex2 3461 shr r_tmp0, 16 3462 vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1 3463 vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1 3464 vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_ffff 3465 vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3 3466 vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3 3467 vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3 3468 vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3 3469 vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b 3470 vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b 3471 AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_ffff 3472 AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp2, ymm_tmp0, ymm_yfrac0, ymm_yfrac1 3473 vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4 3474 vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5 3475 vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp2 3476 mov r_tmp0, i_xpos 3477 shr r_tmp0, 16 3478 vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0] 3479 vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0] 3480 lea r_tmp0, [i_xpos + 4 * i_scalex2] 3481 add i_xpos, i_scalex2 3482 shr r_tmp0, 16 3483 vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1 3484 vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1 3485 mov r_tmp0, i_xpos 3486 lea i_xpos, [i_xpos + 4 * i_scalex2] 3487 shr r_tmp0, 16 3488 vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0] 3489 vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0] 3490 mov r_tmp0, i_xpos 3491 add i_xpos, i_scalex2 3492 shr r_tmp0, 16 3493 vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1 3494 vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1 3495 vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int 3496 vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int 3497 vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int 3498 vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int 3499 vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b 3500 vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b 3501 vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0 3502 
    vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp3, ymm_tmp1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp5
    vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3        ; second 8 output dwords
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vmovlps [p_dst], xmm_tmp0
    vextracti128 [p_dst + 8], ymm_tmp0, 1
    add p_dst, 16
    AVX2_BilinearIncXposuw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_tmp0
%endmacro

; Generic (any scale ratio > 8x) "fast" 16-pixel downsample step. Pixels are
; fetched scalar-style: vpbroadcastd/vpinsrw gathers one 2-byte source pair
; per output pixel, since with large ratios a single 16-byte load cannot cover
; four output pixels. i_xpos is 16.16 fixed point; shr 16 gives the byte offset.
; Clobbers: r_tmp0, ymm_tmp0-5; advances i_xpos by 16*i_scalex and p_dst by 16.
%macro AVX2_GeneralBilinearFastDownsample_16px 0
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0]   ; pixel 0 pair (both rows)
    vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]   ; pixel 1
    vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]   ; pixel 2
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    ; -2 offset compensates for placing the pair in the high word of the dword.
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]   ; pixel 3
    vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; Pixels 4..7 gathered the same way into ymm_tmp2/ymm_tmp3.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
    vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; Pixels 8..11 gathered word-wise into xmm_tmp0/xmm_tmp1, then blended
    ; into the low half of ymm_tmp4/ymm_tmp5.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; Pixels 12..15 into ymm_tmp2/ymm_tmp3 the same way.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    ; Widen to words, apply x/y fractional weights, accumulate rows.
    vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0
    vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp0, ymm_tmp1, ymm_xfrac0, ymm_yfrac0, ymm_yfrac1
    vpmaddwd ymm_tmp0, ymm_tmp0, ymm_tmp4
    vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp5
    vpaddd ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0
    AVX2_BilinearFastCalcXYFrac ymm_tmp1, ymm_tmp2, ymm_xfrac1, ymm_yfrac0, ymm_yfrac1
    vpmaddwd ymm_tmp1, ymm_tmp1, ymm_tmp4
    vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp5
    vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp2
    AVX2_BilinearFastPackDwordsToBytes ymm_tmp0, ymm_tmp1, ymm_0
    vpermq ymm_tmp0, ymm_tmp0, 0010b           ; reorder lanes before the store
    vmovdqu [p_dst], xmm_tmp0
    add p_dst, 16
    vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
%endmacro

; Step per-lane x positions kept as packed words (accurate path).
; xpos_int=%1 xpos_frac=%2 inc_int=%3 inc_frac=%4 7FFFh=%5 tmp=%6,%7
%macro AVX2_BilinearIncXposw 7
    vpaddb %1, %1, %3
    vpaddw %6, %2, %4
    vpcmpgtw %7, %2, %6                        ; lanes whose 15-bit frac wrapped
    vpsubb %1, %1, %7 ; add carry
    vpand %2, %6, %5                           ; keep fraction within 15 bits
%endmacro

; Vertically interpolate two rows of dwords with 32-bit fractions using
; vpmuludq on even/odd dword pairs; result is the 64-bit products summed and
; shifted so each output dword holds (row0*frac0 + row1*frac1) >> 29.
; res>>29=%1 data0=%2 data1=%3 frac0=%4 frac1=%5 tmp=%6
%macro AVX2_LinearAccurateInterpolateVerticalDwords 6
    vpshufd %1, %2, 10110001b                  ; odd dwords into even slots
    vpshufd %6, %3, 10110001b
    vpmuludq %1, %1, %4
    vpmuludq %6, %6, %5
    vpaddq %1, %1, %6
    vpmuludq %2, %2, %4                        ; even dwords
    vpmuludq %3, %3, %5
    vpaddq %2, %2, %3
    vpsllq %1, %1, 3                           ; align odd results (>>29 == <<3 then take high dword)
    vpsrlq %2, %2, 29
    vpblendd %1, %1, %2, 01010101b             ; merge even/odd results
%endmacro

; Accurate-path 16-pixel step for ratios up to 2x: one 16-byte load covers
; 8 output pixels. Fractions are 15-bit (masked by ymm_7fff); vertical pass
; uses the 32-bit accurate interpolation macro above.
%macro AVX2_BilinearAccurateDownsample2xOrLess_16px 0
    vpshufb ymm_tmp0, ymm_xpos_int, ymm_0      ; subtract base index of lane 0
    vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex2]
    lea i_xpos, [i_xpos + 8 * i_scalex2]
    shr r_tmp0, 16
    vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int   ; gather pixel pairs
    vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpunpcklbw ymm_tmp2, ymm_tmp4, ymm_0
    vpunpcklbw ymm_tmp3, ymm_tmp5, ymm_0
    vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0      ; horizontal weighting
    vpmaddwd ymm_tmp3, ymm_tmp3, ymm_tmp0
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp2, ymm_tmp3, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw ymm_tmp0, ymm_tmp0, ymm_0           ; round: (x + 1) >> 1
    vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps [p_dst], xmm_tmp0
    vextracti128 [p_dst + 8], ymm_tmp0, 1
    add p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Accurate-path 16-pixel step for ratios up to 4x: one 16-byte load per
; 4 output pixels; ymm_db80h provides 80h fill bytes so pshufb zeroes the
; unused high byte of each gathered word.
%macro AVX2_BilinearAccurateDownsample4xOrLess_16px 0
    vbroadcasti128 ymm_tmp0, xmm_shufb_0000000088888888
    vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0   ; per-8-pixel group base index
    vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex2]
    shr r_tmp0, 16
    vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex2]
    lea i_xpos, [r_tmp0 + 4 * i_scalex2]
    shr r_tmp0, 16
    vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp2, [p_src_row1 + r_tmp0]
    mov r_tmp0, i_xpos
    lea i_xpos, [i_xpos + 2 * i_scalex2]
    shr r_tmp0, 16
    vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row1 + r_tmp0], 1
    vpunpckhbw ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb ymm_tmp2, ymm_tmp2, ymm_tmp3
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd ymm_tmp2, ymm_tmp2, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp2, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw ymm_tmp0, ymm_tmp0, ymm_0           ; round: (x + 1) >> 1
    vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps [p_dst], xmm_tmp0
    vextracti128 [p_dst + 8], ymm_tmp0, 1
    add p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Accurate-path 16-pixel step for ratios up to 8x: same gather layout as the
; fast 8x variant but with 15-bit fractions and 32-bit vertical interpolation.
%macro AVX2_BilinearAccurateDownsample8xOrLess_16px 0
    vbroadcasti128 ymm_tmp0, xmm_shufb_000044448888CCCC
    vpshufb ymm_tmp0, ymm_xpos_int, ymm_tmp0
    vpsubb ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex2]
    add i_xpos, i_scalex2
    shr r_tmp0, 16
    vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovdqu xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp1, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex2]
    add i_xpos, i_scalex2
    shr r_tmp0, 16
    vinserti128 ymm_tmp0, ymm_tmp0, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp1, ymm_tmp1, [p_src_row1 + r_tmp0], 1
    vpunpcklbw ymm_tmp3, ymm_xpos_int, ymm_db80h
    vpshufb ymm_tmp4, ymm_tmp4, ymm_tmp3
    vpshufb ymm_tmp5, ymm_tmp5, ymm_tmp3
    vpshufb ymm_tmp0, ymm_tmp0, ymm_tmp3
    vpshufb ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 11001100b
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 11001100b
    AVX2_UnpckXFrac ymm_tmp0, ymm_tmp1, ymm_xpos_frac, ymm_7fff
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    ; Second gather for the high-half pixels.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovdqu xmm_tmp4, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp5, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 4 * i_scalex2]
    add i_xpos, i_scalex2
    shr r_tmp0, 16
    vinserti128 ymm_tmp4, ymm_tmp4, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp5, ymm_tmp5, [p_src_row1 + r_tmp0], 1
    mov r_tmp0, i_xpos
    lea i_xpos, [i_xpos + 4 * i_scalex2]
    shr r_tmp0, 16
    vmovdqu xmm_tmp2, [p_src_row0 + r_tmp0]
    vmovdqu xmm_tmp3, [p_src_row1 + r_tmp0]
    mov r_tmp0, i_xpos
    add i_xpos, i_scalex2
    shr r_tmp0, 16
    vinserti128 ymm_tmp2, ymm_tmp2, [p_src_row0 + r_tmp0], 1
    vinserti128 ymm_tmp3, ymm_tmp3, [p_src_row1 + r_tmp0], 1
    vpshufb ymm_tmp4, ymm_tmp4, ymm_xpos_int
    vpshufb ymm_tmp5, ymm_tmp5, ymm_xpos_int
    vpshufb ymm_tmp2, ymm_tmp2, ymm_xpos_int
    vpshufb ymm_tmp3, ymm_tmp3, ymm_xpos_int
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp2, 10001000b
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp3, 10001000b
    vpunpckhbw ymm_tmp4, ymm_tmp4, ymm_0
    vpunpckhbw ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_tmp1
    vpmaddwd ymm_tmp5, ymm_tmp5, ymm_tmp1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp3
    vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw ymm_tmp0, ymm_tmp0, ymm_0           ; round: (x + 1) >> 1
    vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovlps [p_dst], xmm_tmp0
    vextracti128 [p_dst + 8], ymm_tmp0, 1
    add p_dst, 16
    AVX2_BilinearIncXposw ymm_xpos_int, ymm_xpos_frac, ymm_xpos_int_inc, ymm_xpos_frac_inc, ymm_7fff, ymm_tmp0, ymm_tmp1
%endmacro

; Generic (ratio > 8x) accurate 16-pixel step: scalar-style gathers as in the
; fast generic variant, but with the 32-bit accurate vertical interpolation.
%macro AVX2_GeneralBilinearAccurateDownsample_16px 0
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp4, [p_src_row0 + r_tmp0]   ; pixel 0 pair
    vpbroadcastd ymm_tmp5, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]   ; pixel 1
    vpunpcklwd ymm_tmp4, ymm_tmp4, ymm_tmp0
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd ymm_tmp5, ymm_tmp5, ymm_tmp0
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]   ; pixel 2
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00100010b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp0, 00100010b
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]   ; pixel 3 (pair in high word)
    vpblendw ymm_tmp4, ymm_tmp4, ymm_tmp0, 1000b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw ymm_tmp5, ymm_tmp5, ymm_tmp0, 1000b
    ; Pixels 4..7 into ymm_tmp2/ymm_tmp3.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp2, [p_src_row0 + r_tmp0]
    vpbroadcastd ymm_tmp3, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + 1 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
    vpunpcklwd ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpunpcklwd ymm_tmp3, ymm_tmp3, ymm_tmp0
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0]
; --- AVX2_GeneralBilinearAccurateDownsample_16px (continued): pixels 6..15 ---
    vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00100010b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0]
    vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp0, 00100010b
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    ; -2 offset compensates for placing the pair in the high word of the dword.
    vpbroadcastd ymm_tmp0, [p_src_row0 + r_tmp0 - 2]
    vpblendw ymm_tmp2, ymm_tmp2, ymm_tmp0, 1000b
    vpbroadcastd ymm_tmp0, [p_src_row1 + r_tmp0 - 2]
    vpblendw ymm_tmp3, ymm_tmp3, ymm_tmp0, 1000b
    ; Pixels 8..11 gathered word-wise, then blended into ymm_tmp4/ymm_tmp5.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd ymm_tmp4, ymm_tmp4, ymm_tmp0, 00001111b
    vpblendd ymm_tmp5, ymm_tmp5, ymm_tmp1, 00001111b
    ; Pixels 12..15 into ymm_tmp2/ymm_tmp3.
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    vmovd xmm_tmp0, [p_src_row0 + r_tmp0]
    vmovd xmm_tmp1, [p_src_row1 + r_tmp0]
    lea r_tmp0, [i_xpos + i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 1
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 1
    lea r_tmp0, [i_xpos + 2 * i_scalex]
    lea i_xpos, [i_xpos + 4 * i_scalex]
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 2
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 2
    mov r_tmp0, i_xpos
    sub r_tmp0, i_scalex
    shr r_tmp0, 16
    vpinsrw xmm_tmp0, [p_src_row0 + r_tmp0], 3
    vpinsrw xmm_tmp1, [p_src_row1 + r_tmp0], 3
    vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp0, 00001111b
    vpblendd ymm_tmp3, ymm_tmp3, ymm_tmp1, 00001111b
    vpunpcklbw ymm_tmp4, ymm_tmp4, ymm_0       ; widen gathered bytes to words
; --- AVX2_GeneralBilinearAccurateDownsample_16px (continued): weighting + store ---
    vpunpcklbw ymm_tmp5, ymm_tmp5, ymm_0
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac0    ; horizontal weighting, pixels 0..7
    vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac0
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp0, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp1
    vpunpcklbw ymm_tmp4, ymm_tmp2, ymm_0
    vpunpcklbw ymm_tmp5, ymm_tmp3, ymm_0
    vpmaddwd ymm_tmp4, ymm_tmp4, ymm_xfrac1    ; horizontal weighting, pixels 8..15
    vpmaddwd ymm_tmp5, ymm_tmp5, ymm_xfrac1
    AVX2_LinearAccurateInterpolateVerticalDwords ymm_tmp1, ymm_tmp4, ymm_tmp5, ymm_yfrac0, ymm_yfrac1, ymm_tmp2
    vpackssdw ymm_tmp0, ymm_tmp0, ymm_tmp1
    vpavgw ymm_tmp0, ymm_tmp0, ymm_0           ; round: (x + 1) >> 1
    vpackuswb ymm_tmp0, ymm_tmp0, ymm_tmp0
    vextracti128 [p_dst], ymm_tmp0, 1          ; lane order differs from fast path
    vmovlps [p_dst + 8], xmm_tmp0
    add p_dst, 16
    vpaddw ymm_xfrac0, ymm_xfrac0, ymm_xfrac_inc
    vpaddw ymm_xfrac1, ymm_xfrac1, ymm_xfrac_inc
    vpand ymm_xfrac0, ymm_xfrac0, ymm_7fff     ; keep fractions within 15 bits
    vpand ymm_xfrac1, ymm_xfrac1, ymm_7fff
%endmacro

; Outer row/column loop shared by all scale-ratio variants.
; Per row: derive the two source row pointers from i_ypos (15 fractional bits),
; broadcast the vertical fractions (yfrac1 = frac, yfrac0 = ~frac via xor with
; all-ones — widths differ between fast (16-bit) and accurate (32-bit) paths),
; run the 16-pixel macro across the row, then compute the final pixel of the
; row separately (nearest sample, no interpolation) before stepping i_ypos.
; downsample_16px_macro=%1 b_fast=%2
%macro AVX2_GeneralBilinearDownsampler_loop 2
%%height:
    mov p_src_row0, i_ypos
    shr p_src_row0, 15                         ; integer source row
    imul p_src_row0, i_src_stride
    add p_src_row0, p_src
    mov p_src_row1, p_src_row0
    add p_src_row1, i_src_stride               ; row below
%ifdef X86_32
%if %2
    ; Fast path: 15-bit y fraction replicated as words.
    vpbroadcastw ymm_tmp1, i_ypos
    vpsllw ymm_tmp1, ymm_tmp1, 1
    vpsrlw ymm_tmp1, ymm_tmp1, 1
    vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrlw ymm_tmp0, ymm_tmp0, 1
%else
    ; Accurate path: 15-bit y fraction as dwords.
    vpbroadcastd ymm_tmp1, i_ypos
    vpslld ymm_tmp1, ymm_tmp1, 17
    vpsrld ymm_tmp1, ymm_tmp1, 17
    vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpsrld ymm_tmp0, ymm_tmp0, 17
%endif
    vpxor ymm_tmp0, ymm_tmp0, ymm_tmp1         ; yfrac0 = max_frac ^ yfrac1
    vmovdqa ymm_yfrac0, ymm_tmp0
    vmovdqa ymm_yfrac1, ymm_tmp1
%else
    vmovd xmm_tmp0, i_yposd
    vpbroadcastw ymm_yfrac1, xmm_tmp0
%if %2
    vpsllw ymm_yfrac1, ymm_yfrac1, 1
    vpsrlw ymm_yfrac1, ymm_yfrac1, 1
    vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrlw ymm_yfrac0, ymm_yfrac0, 1
%else
    vpslld ymm_yfrac1, ymm_yfrac1, 17
    vpsrld ymm_yfrac1, ymm_yfrac1, 17
    vpcmpeqw ymm_yfrac0, ymm_yfrac0, ymm_yfrac0
    vpsrld ymm_yfrac0, ymm_yfrac0, 17
%endif
    vpxor ymm_yfrac0, ymm_yfrac0, ymm_yfrac1
%endif

    mov i_xpos, 1 << 15                        ; start at half a source pixel (16.16)
    mov i_width_cnt, i_dst_width
    sub i_width_cnt, 1                         ; last pixel handled separately below

%ifdef ymm_xpos_int
    ; Shuffle-based variants: restore per-lane x positions for this row.
    vmovdqa ymm_xpos_int, ymm_xpos_int_begin
    vmovdqa ymm_xpos_frac, ymm_xpos_frac_begin
%else
    ; Generic variants: restore per-lane x fraction pairs.
    vmovdqa ymm_xfrac0, ymm_xfrac0_begin
    vmovdqa ymm_xfrac1, ymm_xfrac1_begin
%endif

%%width:
    %1                                         ; emit 16 output pixels
    sub i_width_cnt, 16
    jg %%width

    ; i_width_cnt is now <= 0: rewind p_dst past the overshoot and write the
    ; final pixel of the row as a plain (non-interpolated) source sample.
    lea p_dst, [p_dst + i_width_cnt + 1]
%ifdef i_scalex2
    mov r_tmp0, i_scalex2
    shr r_tmp0, 1                              ; i_scalex2 holds 2*scalex
    imul i_width_cnt, r_tmp0
%else
    imul i_width_cnt, i_scalex
%endif
    add i_xpos, i_width_cnt                    ; step x back by the overshoot
    shr i_xpos, 16
    movzx r_tmp0, byte [p_src_row0 + i_xpos]
    mov [p_dst - 1], r_tmp0b
%ifdef X86_32
    mov r_tmp0, i_scaleyd
    add i_yposd, r_tmp0
%else
    add i_yposd, i_scaleyd
%endif
    add p_dst, i_dst_stride_less_width
    sub i_dst_height, 1
    jg %%height
%endmacro

;**************************************************************************************************************
;void GeneralBilinearFastDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;                                          int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;                                          uint32_t uiScaleY);
;
;**************************************************************************************************************

WELS_EXTERN GeneralBilinearFastDownsampler_avx2
    %assign push_num 0
%ifndef X86_32
    ; Preserve callee-saved registers used below (rbp/rbx/r12/r13; rdi/rsi on Win64).
    push r12
    push r13
    push rbx
    push rbp
    %assign push_num 4
%ifdef WIN64
    push rdi
    push rsi
    %assign push_num push_num + 2
%endif
%endif
    LOAD_7_PARA
    PUSH_XMM 16
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
; --- GeneralBilinearFastDownsampler_avx2 (continued): argument normalization
;     and symbolic register/stack-slot assignment for both ABIs ---
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    ZERO_EXTENSION r6d
    sub r1, r2 ; dst_stride - dst_width
%ifdef X86_32
    ; 32-bit: too few registers, so most state lives in an aligned stack frame.
    ; Original esp is saved in the frame so .done can restore it.
    vmovd xmm0, arg8
    vmovd xmm1, esp
    and esp, -32                               ; 32-byte align for vmovdqa slots
%ifdef X86_32_PICASM
    sub esp, 8 * 4 + 9 * 32                    ; extra 32B: synthesized shufb constants
%else
    sub esp, 8 * 4 + 8 * 32
%endif
    vmovd [esp], xmm1                          ; [esp] = saved original esp
    %define p_dst r0
    %define i_dst_stride_less_width [esp + 1 * 4]
    %define i_dst_width [esp + 2 * 4]
    %define i_dst_height dword [esp + 3 * 4]
    %define p_src [esp + 4 * 4]
    %define i_src_stride [esp + 5 * 4]
    %define i_scalex r6
    %define i_scalexd r6d
    %define i_scaleyd [esp + 6 * 4]
    %define i_xpos r2
    %define i_ypos [esp + 7 * 4]
    %define i_yposd dword [esp + 7 * 4]
    %define p_src_row0 r3
    %define p_src_row1 r4
    %define i_width_cnt r5
    %define r_tmp0 r1
    %define r_tmp0b r1b
    %define ymm_xpos_frac ymm1
    %define ymm_xpos_frac_inc [esp + 8 * 4]
    %define ymm_xpos_int ymm3
    %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32]
    %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32]
    %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32]
    %define xmm_tmp0 xmm7
    %define ymm_tmp0 ymm7
    %define xmm_tmp1 xmm0
    %define ymm_tmp1 ymm0
    %define xmm_tmp2 xmm2
    %define ymm_tmp2 ymm2
    %define xmm_tmp3 xmm4
    %define ymm_tmp3 ymm4
    %define xmm_tmp4 xmm5
    %define ymm_tmp4 ymm5
    %define xmm_tmp5 xmm6
    %define ymm_tmp5 ymm6
    %define ymm_0 [esp + 8 * 4 + 4 * 32]
    %define ymm_ffff [esp + 8 * 4 + 5 * 32]
    %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32]
    %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32]
%ifdef X86_32_PICASM
    ; PIC build: cannot address .rodata directly, so build the two pshufb
    ; index constants (0000000088888888h / 000044448888CCCCh) in registers.
    %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 8 * 32]
    %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 8 * 32 + 16]
    vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
    vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5
    vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5        ; all bytes = 1
    vpsllw ymm_tmp3, ymm_tmp4, 3               ; low byte of each word = 8
    vpslldq ymm_tmp3, ymm_tmp3, 8              ; 0 x8, 8 x8
    vmovdqa xmm_shufb_0000000088888888, xmm_tmp3
    vpsllq ymm_tmp5, ymm_tmp4, 34              ; 0,0,0,0,4,4,4,4 per qword
    vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3
    vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5
%else
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
%endif
    mov i_dst_stride_less_width, r1
    mov i_dst_width, r2
    mov i_dst_height, r3
    mov p_src, r4
    mov i_src_stride, r5
    vmovd i_scaleyd, xmm0
    vpxor xmm0, xmm0, xmm0
    vmovdqa ymm_0, ymm0
    vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
    vmovdqa ymm_ffff, ymm_tmp0
%else
    ; 64-bit: everything fits in registers (ymm0-15 plus GPRs).
    %define p_dst r0
    %define i_dst_stride_less_width r1
    %define i_dst_width r2
    %define i_dst_height r3
    %define p_src r4
    %define i_src_stride r5
    %define i_scalex r6
    %define i_scalexd r6d
    %define i_scaleyd dword arg8d
    %define i_xpos r12
    %define i_ypos r13
    %define i_yposd r13d
    %define p_src_row0 rbp
%ifdef WIN64
    %define p_src_row1 rsi
    %define i_width_cnt rdi
%else
    %define p_src_row1 r11
    %define i_width_cnt rax
%endif
    %define r_tmp0 rbx
    %define r_tmp0b bl
    %define ymm_0 ymm0
    %define ymm_xpos_frac ymm1
    %define ymm_xpos_frac_inc ymm2
    %define ymm_xpos_int ymm3
    %define ymm_xpos_int_inc ymm4
    %define ymm_yfrac0 ymm5
    %define ymm_yfrac1 ymm6
    %define xmm_tmp0 xmm7
    %define ymm_tmp0 ymm7
    %define xmm_tmp1 xmm8
    %define ymm_tmp1 ymm8
    %define xmm_tmp2 xmm9
    %define ymm_tmp2 ymm9
    %define xmm_tmp3 xmm10
    %define ymm_tmp3 ymm10
    %define xmm_tmp4 xmm11
    %define ymm_tmp4 ymm11
    %define xmm_tmp5 xmm12
    %define ymm_tmp5 ymm12
    %define ymm_ffff ymm13
    %define ymm_xpos_int_begin ymm14
    %define ymm_xpos_frac_begin ymm15
    %define xmm_shufb_0000000088888888 [shufb_0000000088888888]
    %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC]
    vpxor ymm_0, ymm_0, ymm_0
; --- GeneralBilinearFastDownsampler_avx2 (continued): x-position vector setup ---
    vpcmpeqw ymm_ffff, ymm_ffff, ymm_ffff      ; all-ones constant
%endif

    sub i_dst_height, 1
    je .final_row                              ; single row: no vertical interpolation
    jl .done

    mov i_yposd, 1 << 14                       ; start y at half a source step (.15 fixed point)
    ; Build per-lane x positions 0..15 scaled by i_scalex (16.16 fixed point):
    ; ymm_tmp1/ymm_tmp2 end up holding k*scalex + 0.5 for k = 0..7 / 8..15.
    vmovd xmm_tmp0, i_scalexd
    vpbroadcastd ymm_tmp0, xmm_tmp0
    vpslld ymm_tmp1, ymm_tmp0, 2               ; 4*scalex
    vpslld ymm_tmp2, ymm_tmp0, 3               ; 8*scalex
    vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2        ; 12*scalex
    vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4
    vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b
    vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b
    vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0        ; 2*scalex
    vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b
    vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b
    vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0        ; per-lane 0..3 * scalex pattern
    vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0
    vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0
    vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsrld ymm_tmp3, ymm_tmp3, 31
    vpslld ymm_tmp3, ymm_tmp3, 15              ; +0.5 (1 << 15) rounding offset
    vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3
    vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3
    ; Split into integer bytes (ymm_xpos_int, packed as adjacent n, n+1 pairs
    ; for pshufb gathers) and 16-bit fractions (ymm_xpos_frac).
    vpsrld ymm_xpos_int, ymm_tmp1, 16
    vpsrld ymm_tmp0, ymm_tmp2, 16
    vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    vpermq ymm_xpos_int, ymm_xpos_int, 11011000b
    vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int
    vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3
    vpsubb ymm_tmp0, ymm_xpos_int, ymm_tmp3    ; xpos_int + 1 (right neighbor)
    vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0
    vpslld ymm_tmp1, ymm_tmp1, 16
    vpsrld ymm_tmp1, ymm_tmp1, 16              ; isolate low 16 fraction bits
    vpslld ymm_tmp2, ymm_tmp2, 16
    vpsrld ymm_tmp2, ymm_tmp2, 16
    vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2
    vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b
    ; Per-iteration increments: 16*scalex split into fraction and integer parts.
    vmovd xmm_tmp0, i_scalexd
    vpslld xmm_tmp0, xmm_tmp0, 4               ; 16*scalex
    vpbroadcastw ymm_tmp1, xmm_tmp0
    vmovdqa ymm_xpos_frac_inc, ymm_tmp1
    vpsrld xmm_tmp0, xmm_tmp0, 16
    vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3        ; +1: carry handled by Inc macro
    vpbroadcastb ymm_tmp0, xmm_tmp0
    vmovdqa ymm_xpos_int_inc, ymm_tmp0
    vmovdqa ymm_xpos_int_begin, ymm_xpos_int   ; saved for per-row restore
    vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac

    ; Dispatch on the horizontal ratio: tighter ratios allow wider gathers.
    cmp i_scalex, 4 << 16
    ja .scalex_above4
; --- GeneralBilinearFastDownsampler_avx2 (continued): ratio dispatch, final
;     row (nearest-sample copy), epilogue, and symbol cleanup ---
    cmp i_scalex, 2 << 16
    ja .scalex_above2_beloweq4
    ; Each branch temporarily renames i_scalex to i_scalex2 (= 2*scalex) so the
    ; loop macro can address source pairs with one lea scale; restored after.
    add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample2xOrLess_16px, 1
    shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp .final_row
.scalex_above2_beloweq4:
    add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample4xOrLess_16px, 1
    shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp .final_row
.scalex_above4:
    cmp i_scalex, 8 << 16
    ja .scalex_above8
    add i_scalex, i_scalex
%xdefine i_scalex2 i_scalex
%undef i_scalex
    AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearFastDownsample8xOrLess_16px, 1
    shr i_scalex2, 1
%xdefine i_scalex i_scalex2
%undef i_scalex2
    jmp .final_row
.scalex_above8:
    ; Generic path (> 8x): repurpose the xpos registers as packed x-fraction
    ; pairs (xfrac0/xfrac1) used by the generic 16-pixel macro.
%xdefine ymm_xfrac0 ymm_xpos_frac
%xdefine ymm_xfrac1 ymm_xpos_int
%xdefine ymm_xfrac0_begin ymm_xpos_int_begin
%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin
%xdefine ymm_xfrac_inc ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_frac
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xpos_int_inc
%undef ymm_xpos_frac_inc
    AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_ffff
    vpermq ymm_xfrac0, ymm_tmp0, 01001110b
    vpermq ymm_xfrac1, ymm_xfrac1, 01001110b
    vmovdqa ymm_xfrac0_begin, ymm_xfrac0
    vmovdqa ymm_xfrac1_begin, ymm_xfrac1
    ; Build the paired increment (-inc, +inc) so one vpaddw updates both
    ; complementary fractions of each pair.
    vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0
    vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc  ; = -inc
    vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc
    vmovdqa ymm_xfrac_inc, ymm_tmp0
    AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearFastDownsample_16px, 1

.final_row:
    ; Last output row: copy nearest source samples (no row below to blend with).
    mov p_src_row0, i_ypos
    shr p_src_row0, 15
    imul p_src_row0, i_src_stride
    add p_src_row0, p_src
    mov i_xpos, 1 << 15
    mov i_width_cnt, i_dst_width

.final_row_width:
    mov r_tmp0, i_xpos
    shr r_tmp0, 16
    movzx r_tmp0, byte [p_src_row0 + r_tmp0]
    mov [p_dst], r_tmp0b
    add p_dst, 1
    add i_xpos, i_scalex
    sub i_width_cnt, 1
    jg .final_row_width

.done:
    vzeroupper                                 ; avoid AVX->SSE transition penalties in caller
%ifdef X86_32
    mov esp, [esp]                             ; restore pre-alignment esp
%endif
    POP_XMM
    LOAD_7_PARA_POP
%ifndef X86_32
%ifdef WIN64
    pop rsi
    pop rdi
%endif
    pop rbp
    pop rbx
    pop r13
    pop r12
%endif
    ret
; Drop all symbolic names so the next function can redefine them.
%undef p_dst
%undef i_dst_stride_less_width
%undef i_dst_width
%undef i_dst_height
%undef p_src
%undef i_src_stride
%undef i_scalex
%undef i_scalexd
%undef i_scaleyd
%undef i_xpos
%undef i_ypos
%undef i_yposd
%undef p_src_row0
%undef p_src_row1
%undef i_width_cnt
%undef r_tmp0
%undef r_tmp0b
%undef ymm_xpos_frac
%undef ymm_xpos_frac_inc
%undef ymm_xpos_int
%undef ymm_xpos_int_inc
%undef ymm_yfrac0
%undef ymm_yfrac1
%undef xmm_tmp0
%undef ymm_tmp0
%undef xmm_tmp1
%undef ymm_tmp1
%undef xmm_tmp2
%undef ymm_tmp2
%undef xmm_tmp3
%undef ymm_tmp3
%undef xmm_tmp4
%undef ymm_tmp4
%undef xmm_tmp5
%undef ymm_tmp5
%undef ymm_ffff
%undef ymm_0
%undef ymm_xpos_int_begin
%undef ymm_xpos_frac_begin
%undef ymm_xfrac0
%undef ymm_xfrac1
%undef ymm_xfrac0_begin
%undef ymm_xfrac1_begin
%undef ymm_xfrac_inc
%undef xmm_shufb_0000000088888888
%undef xmm_shufb_000044448888CCCC

;**************************************************************************************************************
;void GeneralBilinearAccurateDownsampler_avx2 (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth,
;                                              int32_t iDstHeight, uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX,
;                                              uint32_t uiScaleY);
;
4338;************************************************************************************************************** 4339 4340WELS_EXTERN GeneralBilinearAccurateDownsampler_avx2 4341 %assign push_num 0 4342%ifndef X86_32 4343 push r12 4344 push r13 4345 push rbx 4346 push rbp 4347 %assign push_num 4 4348%ifdef WIN64 4349 push rdi 4350 push rsi 4351 %assign push_num push_num + 2 4352%endif 4353%endif 4354 LOAD_7_PARA 4355 PUSH_XMM 16 4356 SIGN_EXTENSION r1, r1d 4357 SIGN_EXTENSION r2, r2d 4358 SIGN_EXTENSION r3, r3d 4359 SIGN_EXTENSION r5, r5d 4360 ZERO_EXTENSION r6d 4361 sub r1, r2 ; dst_stride - dst_width 4362 add r6, r6 ; 2 * scalex 4363%ifdef X86_32 4364 vmovd xmm0, arg8 4365 vmovd xmm1, esp 4366 and esp, -32 4367%ifdef X86_32_PICASM 4368 sub esp, 8 * 4 + 10 * 32 4369%else 4370 sub esp, 8 * 4 + 8 * 32 4371%endif 4372 vmovd [esp], xmm1 4373 %define p_dst r0 4374 %define i_dst_stride_less_width [esp + 1 * 4] 4375 %define i_dst_width [esp + 2 * 4] 4376 %define i_dst_height dword [esp + 3 * 4] 4377 %define p_src [esp + 4 * 4] 4378 %define i_src_stride [esp + 5 * 4] 4379 %define i_scalex r6 4380 %define i_scalexd r6d 4381 %define i_scaleyd [esp + 6 * 4] 4382 %define i_xpos r2 4383 %define i_ypos [esp + 7 * 4] 4384 %define i_yposd dword [esp + 7 * 4] 4385 %define p_src_row0 r3 4386 %define p_src_row1 r4 4387 %define i_width_cnt r5 4388 %define r_tmp0 r1 4389 %define r_tmp0b r1b 4390 %define ymm_xpos_frac ymm1 4391 %define ymm_xpos_frac_inc [esp + 8 * 4] 4392 %define ymm_xpos_int ymm3 4393 %define ymm_xpos_int_inc [esp + 8 * 4 + 1 * 32] 4394 %define ymm_yfrac0 [esp + 8 * 4 + 2 * 32] 4395 %define ymm_yfrac1 [esp + 8 * 4 + 3 * 32] 4396 %define xmm_tmp0 xmm7 4397 %define ymm_tmp0 ymm7 4398 %define xmm_tmp1 xmm0 4399 %define ymm_tmp1 ymm0 4400 %define xmm_tmp2 xmm2 4401 %define ymm_tmp2 ymm2 4402 %define xmm_tmp3 xmm4 4403 %define ymm_tmp3 ymm4 4404 %define xmm_tmp4 xmm5 4405 %define ymm_tmp4 ymm5 4406 %define xmm_tmp5 xmm6 4407 %define ymm_tmp5 ymm6 4408 %define ymm_0 [esp 
+ 8 * 4 + 4 * 32] 4409 %define ymm_7fff [esp + 8 * 4 + 5 * 32] 4410 %define ymm_xpos_int_begin [esp + 8 * 4 + 6 * 32] 4411 %define ymm_xpos_frac_begin [esp + 8 * 4 + 7 * 32] 4412%ifdef X86_32_PICASM 4413 %define ymm_db80h [esp + 8 * 4 + 8 * 32] 4414 %define xmm_shufb_0000000088888888 [esp + 8 * 4 + 9 * 32] 4415 %define xmm_shufb_000044448888CCCC [esp + 8 * 4 + 9 * 32 + 16] 4416 vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 4417 vpcmpeqb ymm_tmp5, ymm_tmp5, ymm_tmp5 4418 vpsubb ymm_tmp4, ymm_tmp4, ymm_tmp5 4419 vpsllw ymm_tmp3, ymm_tmp4, 3 4420 vpslldq ymm_tmp3, ymm_tmp3, 8 4421 vmovdqa xmm_shufb_0000000088888888, xmm_tmp3 4422 vpsllq ymm_tmp5, ymm_tmp4, 34 4423 vpaddb ymm_tmp5, ymm_tmp5, ymm_tmp3 4424 vmovdqa xmm_shufb_000044448888CCCC, xmm_tmp5 4425 vpsllw ymm_tmp4, ymm_tmp4, 7 4426 vmovdqa ymm_db80h, ymm_tmp4 4427%else 4428 %define ymm_db80h [db80h_256] 4429 %define xmm_shufb_0000000088888888 [shufb_0000000088888888] 4430 %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] 4431%endif 4432 mov i_dst_stride_less_width, r1 4433 mov i_dst_width, r2 4434 mov i_dst_height, r3 4435 mov p_src, r4 4436 mov i_src_stride, r5 4437 vmovd i_scaleyd, xmm0 4438 vpxor xmm0, xmm0, xmm0 4439 vmovdqa ymm_0, ymm0 4440 vpcmpeqw ymm0, ymm0, ymm0 4441 vpsrlw ymm0, ymm0, 1 4442 vmovdqa ymm_7fff, ymm0 4443%else 4444 %define p_dst r0 4445 %define i_dst_stride_less_width r1 4446 %define i_dst_width r2 4447 %define i_dst_height r3 4448 %define p_src r4 4449 %define i_src_stride r5 4450 %define i_scalex r6 4451 %define i_scalexd r6d 4452 %define i_scaleyd dword arg8d 4453 %define i_xpos r12 4454 %define i_ypos r13 4455 %define i_yposd r13d 4456 %define p_src_row0 rbp 4457%ifdef WIN64 4458 %define p_src_row1 rsi 4459 %define i_width_cnt rdi 4460%else 4461 %define p_src_row1 r11 4462 %define i_width_cnt rax 4463%endif 4464 %define r_tmp0 rbx 4465 %define r_tmp0b bl 4466 %define ymm_0 ymm0 4467 %define ymm_xpos_frac ymm1 4468 %define ymm_xpos_int ymm3 4469 %define ymm_xpos_frac_inc ymm2 4470 
%define ymm_xpos_int_inc ymm4 4471 %define ymm_yfrac0 ymm5 4472 %define ymm_yfrac1 ymm6 4473 %define xmm_tmp0 xmm7 4474 %define ymm_tmp0 ymm7 4475 %define xmm_tmp1 xmm8 4476 %define ymm_tmp1 ymm8 4477 %define xmm_tmp2 xmm9 4478 %define ymm_tmp2 ymm9 4479 %define xmm_tmp3 xmm10 4480 %define ymm_tmp3 ymm10 4481 %define xmm_tmp4 xmm11 4482 %define ymm_tmp4 ymm11 4483 %define xmm_tmp5 xmm12 4484 %define ymm_tmp5 ymm12 4485 %define ymm_7fff ymm13 4486 %define ymm_xpos_int_begin ymm14 4487 %define ymm_xpos_frac_begin ymm15 4488 %define ymm_db80h [db80h_256] 4489 %define xmm_shufb_0000000088888888 [shufb_0000000088888888] 4490 %define xmm_shufb_000044448888CCCC [shufb_000044448888CCCC] 4491 vpxor ymm_0, ymm_0, ymm_0 4492 vpcmpeqw ymm_7fff, ymm_7fff, ymm_7fff 4493 vpsrlw ymm_7fff, ymm_7fff, 1 4494%endif 4495 4496 sub i_dst_height, 1 4497 je .final_row 4498 jl .done 4499 4500 mov i_yposd, 1 << 14 4501 vmovd xmm_tmp0, i_scalexd 4502 vpbroadcastd ymm_tmp0, xmm_tmp0 4503 vpslld ymm_tmp1, ymm_tmp0, 2 4504 vpslld ymm_tmp2, ymm_tmp0, 3 4505 vpaddd ymm_tmp3, ymm_tmp1, ymm_tmp2 4506 vpxor ymm_tmp4, ymm_tmp4, ymm_tmp4 4507 vpblendd ymm_tmp1, ymm_tmp4, ymm_tmp1, 11110000b 4508 vpblendd ymm_tmp2, ymm_tmp2, ymm_tmp3, 11110000b 4509 vpaddd ymm_tmp3, ymm_tmp0, ymm_tmp0 4510 vpblendd ymm_tmp3, ymm_tmp4, ymm_tmp3, 11001100b 4511 vpblendd ymm_tmp0, ymm_tmp4, ymm_tmp0, 10101010b 4512 vpaddd ymm_tmp0, ymm_tmp3, ymm_tmp0 4513 vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp0 4514 vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp0 4515 vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 4516 vpsrld ymm_tmp3, ymm_tmp3, 31 4517 vpslld ymm_tmp3, ymm_tmp3, 15 4518 vpaddd ymm_tmp1, ymm_tmp1, ymm_tmp3 4519 vpaddd ymm_tmp2, ymm_tmp2, ymm_tmp3 4520 vpsrld ymm_xpos_int, ymm_tmp1, 16 4521 vpsrld ymm_tmp0, ymm_tmp2, 16 4522 vpackssdw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 4523 vpermq ymm_xpos_int, ymm_xpos_int, 11011000b 4524 vpackuswb ymm_xpos_int, ymm_xpos_int, ymm_xpos_int 4525 vpcmpeqw ymm_tmp3, ymm_tmp3, ymm_tmp3 4526 vpsubb ymm_tmp0, 
ymm_xpos_int, ymm_tmp3 4527 vpunpcklbw ymm_xpos_int, ymm_xpos_int, ymm_tmp0 4528 vpslld ymm_tmp1, ymm_tmp1, 16 4529 vpsrld ymm_tmp1, ymm_tmp1, 16 4530 vpslld ymm_tmp2, ymm_tmp2, 16 4531 vpsrld ymm_tmp2, ymm_tmp2, 16 4532 vpackusdw ymm_xpos_frac, ymm_tmp1, ymm_tmp2 4533 vpermq ymm_xpos_frac, ymm_xpos_frac, 11011000b 4534 vpsrlw ymm_xpos_frac, ymm_xpos_frac, 1 4535 vmovd xmm_tmp0, i_scalexd 4536 vpslld xmm_tmp0, xmm_tmp0, 4 4537 vpbroadcastw ymm_tmp1, xmm_tmp0 4538 vpsrlw ymm_tmp1, ymm_tmp1, 1 4539 vmovdqa ymm_xpos_frac_inc, ymm_tmp1 4540 vpsrld xmm_tmp0, xmm_tmp0, 16 4541 vpsubw ymm_tmp0, ymm_tmp0, ymm_tmp3 4542 vpbroadcastb ymm_tmp0, xmm_tmp0 4543 vmovdqa ymm_xpos_int_inc, ymm_tmp0 4544 vmovdqa ymm_xpos_int_begin, ymm_xpos_int 4545 vmovdqa ymm_xpos_frac_begin, ymm_xpos_frac 4546 4547 cmp i_scalex, 4 << 16 4548 ja .scalex_above4 4549 cmp i_scalex, 2 << 16 4550 ja .scalex_above2_beloweq4 4551 add i_scalex, i_scalex 4552%xdefine i_scalex2 i_scalex 4553%undef i_scalex 4554 AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample2xOrLess_16px, 0 4555 shr i_scalex2, 1 4556%xdefine i_scalex i_scalex2 4557%undef i_scalex2 4558 jmp .final_row 4559.scalex_above2_beloweq4: 4560 add i_scalex, i_scalex 4561%xdefine i_scalex2 i_scalex 4562%undef i_scalex 4563 AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample4xOrLess_16px, 0 4564 shr i_scalex2, 1 4565%xdefine i_scalex i_scalex2 4566%undef i_scalex2 4567 jmp .final_row 4568.scalex_above4: 4569 cmp i_scalex, 8 << 16 4570 ja .scalex_above8 4571 add i_scalex, i_scalex 4572%xdefine i_scalex2 i_scalex 4573%undef i_scalex 4574 AVX2_GeneralBilinearDownsampler_loop AVX2_BilinearAccurateDownsample8xOrLess_16px, 0 4575 shr i_scalex2, 1 4576%xdefine i_scalex i_scalex2 4577%undef i_scalex2 4578 jmp .final_row 4579.scalex_above8: 4580%xdefine ymm_xfrac0 ymm_xpos_frac 4581%xdefine ymm_xfrac1 ymm_xpos_int 4582%xdefine ymm_xfrac0_begin ymm_xpos_int_begin 4583%xdefine ymm_xfrac1_begin ymm_xpos_frac_begin 4584%xdefine 
ymm_xfrac_inc ymm_xpos_frac_inc 4585%undef ymm_xpos_int 4586%undef ymm_xpos_frac 4587%undef ymm_xpos_int_begin 4588%undef ymm_xpos_frac_begin 4589%undef ymm_xpos_int_inc 4590%undef ymm_xpos_frac_inc 4591 AVX2_UnpckXFrac ymm_tmp0, ymm_xfrac1, ymm_xfrac0, ymm_7fff 4592 vpermq ymm_xfrac0, ymm_tmp0, 01001110b 4593 vpermq ymm_xfrac1, ymm_xfrac1, 01001110b 4594 vmovdqa ymm_xfrac0_begin, ymm_xfrac0 4595 vmovdqa ymm_xfrac1_begin, ymm_xfrac1 4596 vpcmpeqw ymm_tmp0, ymm_tmp0, ymm_tmp0 4597 vpmullw ymm_tmp0, ymm_tmp0, ymm_xfrac_inc 4598 vpunpcklwd ymm_tmp0, ymm_tmp0, ymm_xfrac_inc 4599 vmovdqa ymm_xfrac_inc, ymm_tmp0 4600 AVX2_GeneralBilinearDownsampler_loop AVX2_GeneralBilinearAccurateDownsample_16px, 0 4601 4602.final_row: 4603 mov p_src_row0, i_ypos 4604 shr p_src_row0, 15 4605 imul p_src_row0, i_src_stride 4606 add p_src_row0, p_src 4607 mov i_xpos, 1 << 15 4608 mov i_width_cnt, i_dst_width 4609 4610.final_row_width: 4611 mov r_tmp0, i_xpos 4612 shr r_tmp0, 16 4613 movzx r_tmp0, byte [p_src_row0 + r_tmp0] 4614 mov [p_dst], r_tmp0b 4615 add p_dst, 1 4616 add i_xpos, i_scalex 4617 sub i_width_cnt, 1 4618 jg .final_row_width 4619 4620.done: 4621 vzeroupper 4622%ifdef X86_32 4623 mov esp, [esp] 4624%endif 4625 POP_XMM 4626 LOAD_7_PARA_POP 4627%ifndef X86_32 4628%ifdef WIN64 4629 pop rsi 4630 pop rdi 4631%endif 4632 pop rbp 4633 pop rbx 4634 pop r13 4635 pop r12 4636%endif 4637 ret 4638%undef p_dst 4639%undef i_dst_stride_less_width 4640%undef i_dst_width 4641%undef i_dst_height 4642%undef p_src 4643%undef i_src_stride 4644%undef i_scalex 4645%undef i_scalexd 4646%undef i_scaleyd 4647%undef i_xpos 4648%undef i_ypos 4649%undef i_yposd 4650%undef p_src_row0 4651%undef p_src_row1 4652%undef i_width_cnt 4653%undef r_tmp0 4654%undef r_tmp0b 4655%undef ymm_xpos_frac 4656%undef ymm_xpos_frac_inc 4657%undef ymm_xpos_int 4658%undef ymm_xpos_int_inc 4659%undef ymm_yfrac0 4660%undef ymm_yfrac1 4661%undef xmm_tmp0 4662%undef ymm_tmp0 4663%undef xmm_tmp1 4664%undef ymm_tmp1 4665%undef 
xmm_tmp2 4666%undef ymm_tmp2 4667%undef xmm_tmp3 4668%undef ymm_tmp3 4669%undef xmm_tmp4 4670%undef ymm_tmp4 4671%undef xmm_tmp5 4672%undef ymm_tmp5 4673%undef ymm_0 4674%undef ymm_7fff 4675%undef ymm_xpos_int_begin 4676%undef ymm_xpos_frac_begin 4677%undef ymm_xfrac0 4678%undef ymm_xfrac1 4679%undef ymm_xfrac0_begin 4680%undef ymm_xfrac1_begin 4681%undef ymm_xfrac_inc 4682%undef ymm_db80h 4683%undef xmm_shufb_0000000088888888 4684%undef xmm_shufb_000044448888CCCC 4685 4686%endif 4687