;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  quant.asm
;*
;*  Abstract
;*      MMX/SSE2/AVX2 quantization and dequantization of transform blocks
;*
;*  History
;*      7/6/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"


SECTION .text
;************************************************
;NEW_QUANT
;************************************************

; Quantize eight int16 coefficients at %5 in place.
; %1 = scratch/data, %2 = sign mask, %3 = ff (rounding offsets), %4 = mf (multipliers).
%macro SSE2_Quant8 5
    MOVDQ   %1, %5
    pxor    %2, %2
    pcmpgtw %2, %1              ; %2 = 0xFFFF where coefficient < 0
    pxor    %1, %2
    psubw   %1, %2              ; %1 = abs(coefficient)
    paddusw %1, %3              ; add rounding offset with unsigned saturation
    pmulhuw %1, %4              ; ((abs + ff) * mf) >> 16
    pxor    %1, %2
    psubw   %1, %2              ; restore sign
    MOVDQ   %5, %1
%endmacro

; Same as SSE2_Quant8, but also folds the quantized magnitudes (taken
; before the sign is restored) into the running maximum in %6.
%macro SSE2_QuantMax8 6
    MOVDQ   %1, %5
    pxor    %2, %2
    pcmpgtw %2, %1
    pxor    %1, %2
    psubw   %1, %2
    paddusw %1, %3
    pmulhuw %1, %4
    pmaxsw  %6, %1
    pxor    %1, %2
    psubw   %1, %2
    MOVDQ   %5, %1
%endmacro

; Legacy x86_32 stack-argument offsets; parameters are now loaded via
; the LOAD_*_PARA macros from asm_inc.asm.
%define pDct    esp + 4
%define ff      esp + 8
%define mf      esp + 12
%define max     esp + 16
;***********************************************************************
; void WelsQuant4x4_sse2(int16_t* pDct, int16_t* ff, int16_t* mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    movdqa  xmm2, [r1]
    movdqa  xmm3, [r2]

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]

    ret

;***********************************************************************
; void WelsQuant4x4Dc_sse2(int16_t* pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    SSE2_Copy8Times xmm3, r2d   ; broadcast scalar mf
    SSE2_Copy8Times xmm2, r1d   ; broadcast scalar ff

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]

    ret
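;***********************************************************************
; For reference, each lane of SSE2_Quant8 computes the following (a
; scalar sketch for illustration only, not part of the build; the
; helper name is hypothetical):
;
;   static int16_t QuantOne (int16_t dct, uint16_t ff, uint16_t mf) {
;     uint32_t a = dct < 0 ? -dct : dct;      // pxor/psubw with sign mask
;     a += ff; if (a > 0xFFFF) a = 0xFFFF;    // paddusw saturates at 16 bits
;     a = (a * mf) >> 16;                     // pmulhuw keeps the high word
;     return dct < 0 ? -(int16_t) a : (int16_t) a;
;   }
;***********************************************************************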
;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t* pDct, int16_t* ff, int16_t* mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    MOVDQ   xmm2, [r1]
    MOVDQ   xmm3, [r2]

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]

    ret

;***********************************************************************
; void WelsQuantFour4x4Max_sse2(int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    MOVDQ   xmm2, [r1]
    MOVDQ   xmm3, [r2]

    ; one magnitude accumulator per 4x4 block (each block spans 0x20 bytes)
    pxor    xmm4, xmm4
    pxor    xmm5, xmm5
    pxor    xmm6, xmm6
    pxor    xmm7, xmm7
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0       ], xmm4
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7

    ; transpose and fold so the four per-block maxima land in the low
    ; four words of xmm0, then store them to *max
    SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
    pmaxsw  xmm0, xmm4
    pmaxsw  xmm0, xmm5
    pmaxsw  xmm0, xmm7
    movdqa  xmm1, xmm0
    punpckhqdq xmm0, xmm1
    pmaxsw  xmm0, xmm1

    movq    [r3], xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret

; Broadcast a 16-bit value to all four words of an MMX register.
%macro MMX_Copy4Times 2
    movd        %1, %2
    punpcklwd   %1, %1
    punpckldq   %1, %1
%endmacro

SECTION .text

; Quantize the four int16 coefficients held in %1 (same math as
; SSE2_Quant8, minus the load/store): %2 = sign mask, %3 = ff, %4 = mf.
%macro MMX_Quant4 4
    pxor    %2, %2
    pcmpgtw %2, %1
    pxor    %1, %2
    psubw   %1, %2
    paddusw %1, %3
    pmulhuw %1, %4
    pxor    %1, %2
    psubw   %1, %2
%endmacro

;***********************************************************************
; int32_t WelsHadamardQuant2x2_mmx(int16_t* rs, const int16_t ff, int16_t mf, int16_t* pDct, int16_t* block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    movd        mm0, [r0]
    movd        mm1, [r0 + 0x20]
    punpcklwd   mm0, mm1
    movd        mm3, [r0 + 0x40]
    movd        mm1, [r0 + 0x60]
    punpcklwd   mm3, mm1

    ; hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq        mm5, mm3
    paddw       mm3, mm0
    psubw       mm0, mm5
    punpcklwd   mm3, mm0
    movq        mm1, mm3
    psrlq       mm1, 32
    movq        mm5, mm1
    paddw       mm1, mm3
    psubw       mm3, mm5
    punpcklwd   mm1, mm3

    ; quant_2x2_dc
    MMX_Copy4Times mm3, r2d
    MMX_Copy4Times mm2, r1d
    MMX_Quant4  mm1, mm0, mm2, mm3

    ; store dct_2x2
    movq        [r3], mm1
    movq        [r4], mm1

    ; pNonZeroCount of dct_2x2
    pcmpeqb     mm2, mm2        ; mm2 = FF
    pxor        mm3, mm3
    packsswb    mm1, mm3
    pcmpeqb     mm1, mm3        ; set FF if equal, 0 if not equal
    psubsb      mm1, mm2        ; set 0 if equal, 1 if not equal
    psadbw      mm1, mm3        ; horizontal byte sum = number of nonzero coefficients

    ; zero the four DC positions in the source blocks
    mov         r1w, 0
    mov         [r0], r1w
    mov         [r0 + 0x20], r1w
    mov         [r0 + 0x40], r1w
    mov         [r0 + 0x60], r1w

    movd        retrd, mm1

    WELSEMMS
    LOAD_5_PARA_POP
    ret
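;***********************************************************************
; The 2x2 Hadamard butterfly above (and repeated in the Skip variant
; below) combines the four DC coefficients dc0..dc3 loaded from [r0],
; [r0 + 0x20], [r0 + 0x40] and [r0 + 0x60]. Written out as a scalar
; sketch (illustration only, not part of the build), it produces:
;
;   out[0] = dc0 + dc1 + dc2 + dc3;
;   out[1] = dc0 - dc1 + dc2 - dc3;
;   out[2] = dc0 + dc1 - dc2 - dc3;
;   out[3] = dc0 - dc1 - dc2 + dc3;
;
; The four results are quantized with the same ff/mf formula as the
; 4x4 paths, and psadbw then counts how many of them are nonzero.
;***********************************************************************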
;***********************************************************************
; int32_t WelsHadamardQuant2x2Skip_mmx(int16_t* pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    movd        mm0, [r0]
    movd        mm1, [r0 + 0x20]
    punpcklwd   mm0, mm1
    movd        mm3, [r0 + 0x40]
    movd        mm1, [r0 + 0x60]
    punpcklwd   mm3, mm1

    ; hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq        mm5, mm3
    paddw       mm3, mm0
    psubw       mm0, mm5
    punpcklwd   mm3, mm0
    movq        mm1, mm3
    psrlq       mm1, 32
    movq        mm5, mm1
    paddw       mm1, mm3
    psubw       mm3, mm5
    punpcklwd   mm1, mm3

    ; quant_2x2_dc
    MMX_Copy4Times mm3, r2d
    MMX_Copy4Times mm2, r1d
    MMX_Quant4  mm1, mm0, mm2, mm3

    ; pNonZeroCount of dct_2x2
    pcmpeqb     mm2, mm2        ; mm2 = FF
    pxor        mm3, mm3
    packsswb    mm1, mm3
    pcmpeqb     mm1, mm3        ; set FF if equal, 0 if not equal
    psubsb      mm1, mm2        ; set 0 if equal, 1 if not equal
    psadbw      mm1, mm3        ; horizontal byte sum = number of nonzero coefficients
    movd        retrd, mm1

    WELSEMMS
    ret


; Multiply eight int16 coefficients at %1 by the dequant multipliers in
; %3, keeping the low 16 bits of each product; %2 is scratch.
%macro SSE2_DeQuant8 3
    MOVDQ   %2, %1
    pmullw  %2, %3
    MOVDQ   %1, %2
%endmacro


;***********************************************************************
; void WelsDequant4x4_sse2(int16_t* pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa  xmm1, [r1]
    SSE2_DeQuant8 [r0       ], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1

    ret

;***********************************************************************
; void WelsDequantFour4x4_sse2(int16_t* pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequantFour4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa  xmm1, [r1]
    SSE2_DeQuant8 [r0       ], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x20], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x30], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x40], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x50], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x60], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x70], xmm0, xmm1

    ret

;***********************************************************************
; void WelsDequantIHadamard4x4_sse2(int16_t* rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
    %assign push_num 0
    LOAD_2_PARA
%ifndef X86_32
    movzx   r1, r1w
%endif

    ; WelsDequantLumaDc4x4
    SSE2_Copy8Times xmm1, r1d
    ;psrlw xmm1, 2      ; for the (>>2) in ihdm
    MOVDQ   xmm0, [r0]
    MOVDQ   xmm2, [r0 + 0x10]
    pmullw  xmm0, xmm1
    pmullw  xmm2, xmm1

    ; ihdm_4x4
    movdqa  xmm1, xmm0
    psrldq  xmm1, 8
    movdqa  xmm3, xmm2
    psrldq  xmm3, 8

    SSE2_SumSub xmm0, xmm3, xmm5    ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
    SSE2_SumSub xmm1, xmm2, xmm5    ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
    SSE2_SumSub xmm3, xmm2, xmm5    ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
    SSE2_SumSub xmm0, xmm1, xmm5    ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1

    SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
    SSE2_SumSub xmm2, xmm4, xmm5
    SSE2_SumSub xmm1, xmm0, xmm5
    SSE2_SumSub xmm4, xmm0, xmm5
    SSE2_SumSub xmm2, xmm1, xmm5
    SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3

    punpcklqdq  xmm0, xmm1
    MOVDQ   [r0], xmm0

    punpcklqdq  xmm2, xmm3
    MOVDQ   [r0 + 16], xmm2
    ret

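;***********************************************************************
; The AVX2 versions below compute the same quantization as SSE2_Quant8,
; but derive the magnitude with vpabsw and restore the sign with
; vpsignw instead of the pcmpgtw/pxor/psubw sequence. vpsignw zeroes
; any lane whose sign operand is zero, so the input is first OR'ed with
; 0x7FFF: every lane becomes nonzero while its sign bit is preserved,
; and lanes that were originally zero are treated as positive.
;***********************************************************************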
%ifdef HAVE_AVX2
; data=%1, abs_out=%2, ff=%3, mf=%4, 7FFFh=%5
; %1 receives the signed quantized coefficients; %2 is left holding
; their magnitudes, which the Max variant below reuses.
%macro AVX2_Quant 5
    vpabsw   %2, %1
    vpor     %1, %1, %5         ; ensure non-zero before vpsignw
    vpaddusw %2, %2, %3
    vpmulhuw %2, %2, %4
    vpsignw  %1, %2, %1
%endmacro


;***********************************************************************
; void WelsQuant4x4_avx2(int16_t* pDct, int16_t* ff, int16_t* mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_avx2
    %assign push_num 0
    LOAD_3_PARA
    PUSH_XMM 5
    vbroadcasti128 ymm0, [r1]
    vbroadcasti128 ymm1, [r2]
    WELS_DW32767_VEX ymm2
    vmovdqu ymm3, [r0]
    AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
    vmovdqu [r0], ymm3
    vzeroupper
    POP_XMM
    ret


;***********************************************************************
; void WelsQuant4x4Dc_avx2(int16_t* pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_avx2
    %assign push_num 0
    LOAD_1_PARA
    PUSH_XMM 5
    ; broadcast scalar ff/mf, whether they arrive in a register or in memory
%ifidni r1, arg2
    vmovd xmm0, arg2d
    vpbroadcastw ymm0, xmm0
%else
    vpbroadcastw ymm0, arg2
%endif
%ifidni r2, arg3
    vmovd xmm1, arg3d
    vpbroadcastw ymm1, xmm1
%else
    vpbroadcastw ymm1, arg3
%endif
    WELS_DW32767_VEX ymm2
    vmovdqu ymm3, [r0]
    AVX2_Quant ymm3, ymm4, ymm0, ymm1, ymm2
    vmovdqu [r0], ymm3
    vzeroupper
    POP_XMM
    ret


;***********************************************************************
; void WelsQuantFour4x4_avx2(int16_t* pDct, int16_t* ff, int16_t* mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_avx2
    %assign push_num 0
    LOAD_3_PARA
    PUSH_XMM 6
    vbroadcasti128 ymm0, [r1]
    vbroadcasti128 ymm1, [r2]
    WELS_DW32767_VEX ymm4
    vmovdqu ymm3, [r0 + 0x00]
    vmovdqu ymm5, [r0 + 0x20]
    AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
    vmovdqu [r0 + 0x00], ymm3
    AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
    vmovdqu [r0 + 0x20], ymm5
    vmovdqu ymm3, [r0 + 0x40]
    vmovdqu ymm5, [r0 + 0x60]
    AVX2_Quant ymm3, ymm2, ymm0, ymm1, ymm4
    vmovdqu [r0 + 0x40], ymm3
    AVX2_Quant ymm5, ymm2, ymm0, ymm1, ymm4
    vmovdqu [r0 + 0x60], ymm5
    vzeroupper
    POP_XMM
    ret


;***********************************************************************
; void WelsQuantFour4x4Max_avx2(int16_t* pDct, int16_t* ff, int16_t* mf, int16_t* max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_avx2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 7
    vbroadcasti128 ymm0, [r1]
    vbroadcasti128 ymm1, [r2]
    WELS_DW32767_VEX ymm6
    vmovdqu ymm4, [r0 + 0x00]
    vmovdqu ymm5, [r0 + 0x20]
    AVX2_Quant ymm4, ymm2, ymm0, ymm1, ymm6
    vmovdqu [r0 + 0x00], ymm4
    AVX2_Quant ymm5, ymm3, ymm0, ymm1, ymm6
    vmovdqu [r0 + 0x20], ymm5
    ; pair up the 128-bit halves of blocks 0 and 1, then fold, so ymm2
    ; lane 0 holds block 0's per-word maxima and lane 1 holds block 1's
    vperm2i128 ymm4, ymm2, ymm3, 00100000b
    vperm2i128 ymm3, ymm2, ymm3, 00110001b
    vpmaxsw ymm2, ymm4, ymm3
    vmovdqu ymm4, [r0 + 0x40]
    vmovdqu ymm5, [r0 + 0x60]
    AVX2_Quant ymm4, ymm3, ymm0, ymm1, ymm6
    vmovdqu [r0 + 0x40], ymm4
    AVX2_Quant ymm5, ymm4, ymm0, ymm1, ymm6
    vmovdqu [r0 + 0x60], ymm5
    vperm2i128 ymm5, ymm3, ymm4, 00100000b
    vperm2i128 ymm4, ymm3, ymm4, 00110001b
    vpmaxsw ymm3, ymm5, ymm4
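    ; ymm2/ymm3 now hold per-block magnitudes, two blocks per register,
    ; assumed to stay within [0, 0x7FFF] (vpmaxsw is a signed max).
    ; vphminposuw can only locate a minimum unsigned word, so XOR-ing
    ; with 0x7FFF reverses the order within that range and turns the
    ; minimum search into a maximum search; the same XOR afterwards
    ; recovers the true values.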
    vpxor   ymm2, ymm2, ymm6    ; flip bits so as to enable use of vphminposuw to find max value.
    vpxor   ymm3, ymm3, ymm6
    vextracti128 xmm4, ymm2, 1
    vextracti128 xmm5, ymm3, 1
    vphminposuw xmm2, xmm2
    vphminposuw xmm3, xmm3
    vphminposuw xmm4, xmm4
    vphminposuw xmm5, xmm5
    vpunpcklwd  xmm2, xmm2, xmm4
    vpunpcklwd  xmm3, xmm3, xmm5
    vpunpckldq  xmm2, xmm2, xmm3
    vpxor   xmm2, xmm2, xmm6    ; restore non-flipped values.
    vmovq   [r3], xmm2          ; store the four per-block max values.
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    ret
%endif