;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
30;* 31;* 32;* deblock.asm 33;* 34;* Abstract 35;* edge loop 36;* 37;* History 38;* 08/07/2009 Created 39;* 40;* 41;*************************************************************************/ 42%include "asm_inc.asm" 43 44;******************************************************************************* 45; Macros and other preprocessor constants 46;******************************************************************************* 47 48%ifdef X86_32_PICASM 49SECTION .text align=16 50%else 51SECTION .rodata align=16 52%endif 53 54ALIGN 16 55FOUR_16B_SSE2: dw 4, 4, 4, 4, 4, 4, 4, 4 56 57ALIGN 16 58WELS_DB1_16: 59 times 16 db 1 60WELS_DB127_16: 61 times 16 db 127 62WELS_DB96_16: 63 times 16 db 96 64WELS_SHUFB0000111122223333: 65 times 4 db 0 66 times 4 db 1 67 times 4 db 2 68 times 4 db 3 69 70 71SECTION .text 72 73; Unsigned byte absolute difference. 74; a=%1 b=%2 clobber=%3 75; Subtract once in each direction with saturation and return the maximum. 76%macro SSE2_AbsDiffUB 3 77 movdqa %3, %2 78 psubusb %3, %1 79 psubusb %1, %2 80 por %1, %3 81%endmacro 82 83; Unsigned byte compare less than. 84; lhs=%1 rhs^0x7f=%2 0x7f=%3 85; No unsigned byte lt/gt compare instruction available; xor by 0x7f and use a 86; signed compare. Some other options do exist. This one allows modifying the lhs 87; without mov and uses a bitwise op which can be executed on most ports on 88; common architectures. 89%macro SSE2_CmpltUB 3 90 pxor %1, %3 91 pcmpgtb %1, %2 92%endmacro 93 94; Unsigned byte compare greater than or equal. 95%macro SSE2_CmpgeUB 2 96 pminub %1, %2 97 pcmpeqb %1, %2 98%endmacro 99 100; Clip unsigned bytes to ref +/- diff. 
101; data=%1 ref=%2 maxdiff_from_ref=%3 clobber=%4 102%macro SSE2_ClipUB 4 103 movdqa %4, %2 104 psubusb %4, %3 105 paddusb %3, %2 106 pmaxub %1, %4 107 pminub %1, %3 108%endmacro 109 110; (a + b + 1 - c) >> 1 111; a=%1 b=%2 c=%3 [out:a^b&c]=%4 112%macro SSE2_AvgbFloor1 4 113 movdqa %4, %1 114 pxor %4, %2 115 pavgb %1, %2 116 pand %4, %3 117 psubb %1, %4 118%endmacro 119 120; (a + b + carry) >> 1 121; a=%1 b=%2 carry-1=%3 122%macro SSE2_AvgbFloor2 3 123 pxor %1, %3 124 pxor %2, %3 125 pavgb %1, %2 126 pxor %1, %3 127%endmacro 128 129; a = (a & m) | (b & ~m) 130; a=%1 b=%2 m=%3 131%macro SSE2_Blend 3 132 pand %1, %3 133 pandn %3, %2 134 por %1, %3 135%endmacro 136 137; Compute 138; p0 = clip(p0 + clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255) 139; q0 = clip(q0 - clip((q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1, -iTc, iTc), 0, 255) 140; 16-wide parallel in packed byte representation in xmm registers. 141; 142; p1=%1 p0=%2 q0=%3 q1=%4 iTc=%5 FFh=%6 xmmclobber=%7,%8 143%macro SSE2_DeblockP0Q0_Lt4 8 144 ; (q0 - p0 + ((p1 - q1) >> 2) + 1) >> 1 clipped to [-96, 159] and biased to [0, 255]. 145 ; A limited range is sufficient because the value is clipped to [-iTc, iTc] later. 146 ; Bias so that unsigned saturation can be used. 147 ; Get ((p1 - q1) >> 2) + 192 via a pxor and two pavgbs. 148 ; q0 - p0 is split into a non-negative and non-positive part. The latter is 149 ; subtracted from the biased value. 150 movdqa %7, %2 151 psubusb %7, %3 ; clip(p0 - q0, 0, 255) 152 ; ((p1 - q1) >> 2) + 0xc0 153 pxor %4, %6 ; q1 ^ 0xff aka -q1 - 1 & 0xff 154 pavgb %1, %4 ; (((p1 - q1 + 0x100) >> 1) 155 pavgb %1, %6 ; + 0x100) >> 1 156 psubusb %1, %7 ; -= clip(p0 - q0, 0, 255) saturate. 157 movdqa %8, %3 158 psubusb %8, %2 ; (clip(q0 - p0, 0, 255) 159 pavgb %8, %1 ; + clip(((p1 - q1 + 0x300) >> 2) - clip(p0 - q0, 0, 255), 0, 255) + 1) >> 1 160 161 ; Unbias and split into a non-negative and a non-positive part. 162 ; Clip each part to iTc via minub. 
163 ; Add/subtract each part to/from p0/q0 and clip. 164 movdqa %6, [pic(WELS_DB96_16)] 165 psubusb %6, %8 166 psubusb %8, [pic(WELS_DB96_16)] 167 pminub %6, %5 168 pminub %8, %5 169 psubusb %2, %6 170 paddusb %2, %8 ; p0 171 paddusb %3, %6 172 psubusb %3, %8 ; q0 173%endmacro 174 175 176;******************************************************************************* 177; void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 178; int32_t iBeta, int8_t * pTC) 179;******************************************************************************* 180 181WELS_EXTERN DeblockLumaLt4V_ssse3 182 %assign push_num 0 183 INIT_X86_32_PIC r5 184 LOAD_5_PARA 185 PUSH_XMM 8 186 SIGN_EXTENSION r1, r1d 187 movd xmm1, arg3d 188 movd xmm2, arg4d 189 pxor xmm3, xmm3 190 pxor xmm1, [pic(WELS_DB127_16)] 191 pxor xmm2, [pic(WELS_DB127_16)] 192 pshufb xmm1, xmm3 ; iAlpha ^ 0x7f 193 pshufb xmm2, xmm3 ; iBeta ^ 0x7f 194 mov r2, r1 ; iStride 195 neg r1 ; -iStride 196 lea r3, [r0 + r1] ; pPix - iStride 197 198 ; Compute masks to enable/disable deblocking. 
199 MOVDQ xmm6, [r3 + 0 * r1] ; p0 200 MOVDQ xmm7, [r3 + 1 * r1] ; p1 201 MOVDQ xmm0, [r0 + 0 * r2] ; q0 202 movdqa xmm4, xmm6 203 SSE2_AbsDiffUB xmm6, xmm0, xmm3 ; |p0 - q0| 204 SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)] ; bDeltaP0Q0 = |p0 - q0| < iAlpha 205 MOVDQ xmm1, [r0 + 1 * r2] ; q1 206 SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p1 - p0| 207 SSE2_AbsDiffUB xmm0, xmm1, xmm3 ; |q1 - q0| 208 pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|) 209 SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta 210 pand xmm6, xmm7 ; bDeltaP0Q0P1P0Q1Q0 = bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0 211 MOVDQ xmm7, [r3 + 2 * r1] ; p2 212 movdqa xmm0, xmm7 213 SSE2_AbsDiffUB xmm7, xmm4, xmm3 ; |p2 - p0| 214 SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP2P0 = |p2 - p0| < iBeta 215 MOVDQ xmm5, [r0 + 2 * r2] ; q2 216 MOVDQ xmm3, [r0 + 0 * r2] ; q0 217 movdqa xmm1, xmm5 218 SSE2_AbsDiffUB xmm5, xmm3, xmm4 ; |q2 - q0| 219 SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)] ; bDeltaQ2Q0 = |q2 - q0| < iBeta 220 221 pavgb xmm3, [r3 + 0 * r1] 222 pcmpeqw xmm2, xmm2 ; FFh 223 pxor xmm3, xmm2 224 ; (p2 + ((p0 + q0 + 1) >> 1)) >> 1 225 pxor xmm0, xmm2 226 pavgb xmm0, xmm3 227 pxor xmm0, xmm2 228 ; (q2 + ((p0 + q0 + 1) >> 1)) >> 1 229 pxor xmm1, xmm2 230 pavgb xmm1, xmm3 231 pxor xmm1, xmm2 232 233 movd xmm3, [r4] 234 pshufb xmm3, [pic(WELS_SHUFB0000111122223333)] ; iTc 235 movdqa xmm4, xmm3 ; iTc0 = iTc 236 pcmpgtb xmm3, xmm2 ; iTc > -1 ? 0xff : 0x00 237 pand xmm6, xmm3 ; bDeltaP0Q0P1P0Q1Q0 &= iTc > -1 238 movdqa xmm3, xmm4 239 psubb xmm3, xmm7 ; iTc -= bDeltaP2P0 ? -1 : 0 240 psubb xmm3, xmm5 ; iTc -= bDeltaQ2Q0 ? -1 : 0 241 pand xmm3, xmm6 ; iTc &= bDeltaP0Q0P1P0Q1Q0 ? 0xff : 0 242 pand xmm7, xmm6 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0 243 pand xmm5, xmm6 ; bDeltaQ2Q0 &= bDeltaP0Q0P1P0Q1Q0 244 pand xmm7, xmm4 ; iTc0 & (bDeltaP2P0 ? 0xff : 0) 245 pand xmm5, xmm4 ; iTc0 & (bDeltaQ2Q0 ? 
0xff : 0) 246 247 MOVDQ xmm4, [r3 + 1 * r1] 248 SSE2_ClipUB xmm0, xmm4, xmm7, xmm6 ; clip p1. 249 MOVDQ xmm6, [r0 + 1 * r2] 250 MOVDQ [r3 + 1 * r1], xmm0 ; store p1. 251 SSE2_ClipUB xmm1, xmm6, xmm5, xmm7 ; clip q1. 252 MOVDQ [r0 + 1 * r2], xmm1 ; store q1. 253 254 MOVDQ xmm1, [r3 + 0 * r1] ; p0 255 MOVDQ xmm0, [r0 + 0 * r2] ; q0 256 SSE2_DeblockP0Q0_Lt4 xmm4, xmm1, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7 257 MOVDQ [r3 + 0 * r1], xmm1 ; store p0. 258 MOVDQ [r0 + 0 * r2], xmm0 ; store q0. 259 260 POP_XMM 261 LOAD_5_PARA_POP 262 DEINIT_X86_32_PIC 263 ret 264 265 266; Deblock 3x16 luma pixels for the eq4 case. 267; 268; Compose 8-bit averages from pavgbs. Ie. (p1 + p0 + p2 + q0 + 2) >> 2 can be 269; written as (((p1 + p0) >> 1) + ((p2 + q0 + (p1 ^ p0 & 1)) >> 1) + 1) >> 1, 270; which maps to 3 pavgbs. 271; 272; pPix=%1 iStride=%2 [in:q0,out:p0]=%3 [in:q1,out:p1]=%4 bDeltaP0Q0P1P0Q1Q0=%5 bDeltaP2P0=%6 clobber=%7,%8,%9,%10 preserve_p0p1=%11 db1=%12 273%macro SSE2_DeblockLumaEq4_3x16P 12 274 movdqa %7, %3 275 movdqa %8, %6 276 MOVDQ %10, [%1 + 1 * %2] ; p1 277 SSE2_Blend %7, %10, %8 ; t0 = bDeltaP2P0 ? q0 : p1 278 movdqa %8, %6 279 MOVDQ %9, [%1 + 2 * %2] ; p2 280 SSE2_Blend %9, %4, %8 ; t1 = bDeltaP2P0 ? p2 : q1 281 SSE2_AvgbFloor1 %4, %9, %12, %8 ; t1 = (t1 + q1) >> 1 282 SSE2_AvgbFloor1 %10, [%1], %12, %8 ; (p0 + p1) >> 1, p0 ^ p1 283 pxor %8, %12 284 SSE2_AvgbFloor1 %7, %4, %8, %9 ; (t0 + t1 + (p0 ^ p1 & 1)) >> 1 285 MOVDQ %9, [%1 + 2 * %2] ; p2 286 SSE2_AvgbFloor1 %3, %9, %8, %4 ; (p2 + q0 + (p0 ^ p1 & 1)) >> 1 287 pavgb %7, %10 ; p0' = (p0 + p1 + t0 + t1 + 2) >> 2 288 movdqa %8, %10 289 pxor %8, %3 ; (p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 290 pand %8, %12 ; & 1 291 pavgb %10, %3 ; p1' = (p0 + p1 + p2 + q0 + 2) >> 2 292 pand %6, %5 ; bDeltaP2P0 &= bDeltaP0Q0P1P0Q1Q0 293%if %11 294 MOVDQ %3, [%1 + 0 * %2] ; p0 295 movdqa %4, %5 296 SSE2_Blend %7, %3, %4 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? 
p0' : p0 297%else 298 SSE2_Blend %7, [%1 + 0 * %2], %5 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0 299%endif 300 MOVDQ [%1 + 0 * %2], %7 ; store p0 301 add %1, %2 302 movdqa %7, %10 303 psubb %10, %8 ; (p0 + p1 + p2 + q0) >> 2 304 psubb %8, %12 305 MOVDQ %4, [%1 + (3 - 1) * %2] ; p3 306 SSE2_AvgbFloor2 %4, %9, %8 ; (p2 + p3 + ((p0 + p1) >> 1 ^ (p2 + q0 + (p0 ^ p1 & 1)) >> 1 & 1)) >> 1 307 pavgb %10, %4 ; p2' = (((p0 + p1 + p2 + q0) >> 1) + p2 + p3 + 2) >> 2 308 movdqa %8, %6 309 SSE2_Blend %10, [%1 + (2 - 1) * %2], %8 ; p2out = bDeltaP2P0 ? p2' : p2 310 MOVDQ [%1 + (2 - 1) * %2], %10 ; store p2 311%if %11 312 MOVDQ %4, [%1 + (1 - 1) * %2] ; p1 313 SSE2_Blend %7, %4, %6 ; p1out = bDeltaP2P0 ? p1' : p1 314%else 315 SSE2_Blend %7, [%1 + (1 - 1) * %2], %6 ; p1out = bDeltaP2P0 ? p1' : p1 316%endif 317 MOVDQ [%1 + (1 - 1) * %2], %7 ; store p1 318%endmacro 319 320 321;******************************************************************************* 322; void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 323; int32_t iBeta) 324;******************************************************************************* 325 326WELS_EXTERN DeblockLumaEq4V_ssse3 327 %assign push_num 0 328 INIT_X86_32_PIC r4 329 LOAD_4_PARA 330 PUSH_XMM 10 331 SIGN_EXTENSION r1, r1d 332 movd xmm1, arg3d 333 movd xmm2, arg4d 334 shr r2, 2 335 add r2, 1 336 movd xmm3, r2d 337 pxor xmm4, xmm4 338 pxor xmm1, [pic(WELS_DB127_16)] 339 pxor xmm2, [pic(WELS_DB127_16)] 340 pshufb xmm1, xmm4 ; iAlpha ^ 0x7f 341 pshufb xmm2, xmm4 ; iBeta ^ 0x7f 342 pshufb xmm3, xmm4 ; (iAlpha >> 2) + 1 343 mov r2, r1 ; iStride 344 neg r1 ; -iStride 345 lea r3, [r0 + r1] ; pPix - iStride 346 347 ; Compute masks to enable/disable filtering. 
348 MOVDQ xmm7, [r3 + 1 * r1] ; p1 349 MOVDQ xmm6, [r3 + 0 * r1] ; p0 350 MOVDQ xmm0, [r0 + 0 * r2] ; q0 351 movdqa xmm4, xmm6 352 SSE2_AbsDiffUB xmm6, xmm0, xmm5 ; |p0 - q0| 353 SSE2_CmpgeUB xmm3, xmm6 ; |p0 - q0| < (iAlpha >> 2) + 2 354 SSE2_CmpltUB xmm6, xmm1, [pic(WELS_DB127_16)] ; bDeltaP0Q0 = |p0 - q0| < iAlpha 355 MOVDQ xmm1, [r0 + 1 * r2] ; q1 356 SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p1 - p0| 357 SSE2_AbsDiffUB xmm0, xmm1, xmm5 ; |q1 - q0| 358 pmaxub xmm7, xmm0 ; max(|p1 - p0|, |q1 - q0|) 359 SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP1P0 & bDeltaQ1Q0 = max(|p1 - p0|, |q1 - q0|) < iBeta 360 pand xmm6, xmm7 ; & bDeltaP0Q0 361 362 MOVDQ xmm7, [r3 + 2 * r1] ; p2 363 SSE2_AbsDiffUB xmm7, xmm4, xmm5 ; |p2 - p0| 364 SSE2_CmpltUB xmm7, xmm2, [pic(WELS_DB127_16)] ; bDeltaP2P0 = |p2 - p0| < iBeta 365 pand xmm7, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2 366 367 MOVDQ xmm0, [r0 + 0 * r2] ; q0 368 MOVDQ xmm5, [r0 + 2 * r2] ; q2 369 SSE2_AbsDiffUB xmm5, xmm0, xmm4 ; |q2 - q0| 370 SSE2_CmpltUB xmm5, xmm2, [pic(WELS_DB127_16)] ; bDeltaQ2Q0 = |q2 - q0| < iBeta 371 pand xmm5, xmm3 ; &= |p0 - q0| < (iAlpha >> 2) + 2 372 373%ifdef X86_32 374 ; Push xmm5 to free up one register. Align stack so as to ensure that failed 375 ; store forwarding penalty cannot occur (up to ~50 cycles for 128-bit on IVB). 
376 mov r2, esp 377 sub esp, 16 378 and esp, -16 379 movdqa [esp], xmm5 380 SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm5, xmm4, 1, [pic(WELS_DB1_16)] 381 movdqa xmm5, [esp] 382 mov esp, r2 383 neg r1 384 SSE2_DeblockLumaEq4_3x16P r0, r1, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, [pic(WELS_DB1_16)] 385%else 386 movdqa xmm9, [WELS_DB1_16] 387 SSE2_DeblockLumaEq4_3x16P r3, r1, xmm0, xmm1, xmm6, xmm7, xmm2, xmm3, xmm8, xmm4, 1, xmm9 388 SSE2_DeblockLumaEq4_3x16P r0, r2, xmm0, xmm1, xmm6, xmm5, xmm2, xmm3, xmm7, xmm4, 0, xmm9 389%endif 390 391 POP_XMM 392 LOAD_4_PARA_POP 393 DEINIT_X86_32_PIC 394 ret 395 396 397; [out:p1,p0,q0,q1]=%1,%2,%3,%4 pPixCb=%5 pPixCr=%6 iStride=%7 3*iStride-1=%8 xmmclobber=%9,%10,%11 398%macro SSE2_LoadCbCr_4x16H 11 399 movd %1, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 0 400 movd %2, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 2 401 punpcklbw %1, %2 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 0,2 402 movd %2, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 4 403 movd %9, [%5 + 2 * %8] ; [p1,p0,q0,q1] cb line 6 404 punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 4,6 405 punpcklwd %1, %2 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 0,2,4,6 406 movd %2, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 0 407 movd %9, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 2 408 punpcklbw %2, %9 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 0,2 409 movd %9, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 4 410 movd %10, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 6 411 punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 4,6 412 punpcklwd %2, %9 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 0,2,4,6 413 add %5, %7 ; pPixCb += iStride 414 add %6, %7 ; pPixCr += iStride 415 movd %9, [%5 + 0 * %7 - 2] ; [p1,p0,q0,q1] cb line 1 416 movd %10, [%5 + 2 * %7 - 2] ; [p1,p0,q0,q1] cb line 3 417 punpcklbw %9, %10 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 1,3 418 movd %10, [%5 + 4 * %7 - 2] ; [p1,p0,q0,q1] cb line 5 419 movd %3, [%5 + 2 * %8] ; 
[p1,p0,q0,q1] cb line 7 420 punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cb line 5,7 421 punpcklwd %9, %10 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cb line 1,3,5,7 422 movd %10, [%6 + 0 * %7 - 2] ; [p1,p0,q0,q1] cr line 1 423 movd %3, [%6 + 2 * %7 - 2] ; [p1,p0,q0,q1] cr line 3 424 punpcklbw %10, %3 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 1,3 425 movd %3, [%6 + 4 * %7 - 2] ; [p1,p0,q0,q1] cr line 5 426 movd %4, [%6 + 2 * %8] ; [p1,p0,q0,q1] cr line 7 427 punpcklbw %3, %4 ; [p1,p1,p0,p0,q0,q0,q1,q1] cr line 5,7 428 punpcklwd %10, %3 ; [p1,p1,p1,p1,p0,p0,p0,p0,q0,q0,q0,q0,q1,q1,q1,q1] cr line 1,3,5,7 429 movdqa %3, %1 430 punpckldq %1, %2 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6 431 punpckhdq %3, %2 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6 432 movdqa %11, %9 433 punpckldq %9, %10 ; [p1,p1,p1,p1,p1,p1,p1,p1,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 1,3,5,7 434 punpckhdq %11, %10 ; [q0,q0,q0,q0,q0,q0,q0,q0,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 1,3,5,7 435 movdqa %2, %1 436 punpcklqdq %1, %9 ; [p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1,p1] cb/cr line 0,2,4,6,1,3,5,7 437 punpckhqdq %2, %9 ; [p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0,p0] cb/cr line 0,2,4,6,1,3,5,7 438 movdqa %4, %3 439 punpcklqdq %3, %11 ; [q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0,q0] cb/cr line 0,2,4,6,1,3,5,7 440 punpckhqdq %4, %11 ; [q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1,q1] cb/cr line 0,2,4,6,1,3,5,7 441%endmacro 442 443; pPixCb+iStride=%1 pPixCr+iStride=%2 iStride=%3 3*iStride-1=%4 p0=%5 q0=%6 rclobber=%7 dwclobber={%8,%9} xmmclobber=%10 444%macro SSE2_StoreCbCr_4x16H 10 445 movdqa %10, %5 446 punpcklbw %10, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 447 punpckhbw %5, %6 ; [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 448 mov %7, r7 ; preserve stack pointer 449 and r7, -16 ; align stack pointer 450 sub r7, 32 ; allocate stack space 451 movdqa [r7 ], %10 ; store 
[p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 0,2,4,6 on the stack 452 movdqa [r7 + 16], %5 ; store [p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0,p0,q0] cb/cr line 1,3,5,7 on the stack 453 mov %8, [r7 + 16] ; [p0,q0,p0,q0] cb line 1,3 454 mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 1 455 shr %8, 16 ; [p0,q0] cb line 3 456 mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 3 457 mov %8, [r7 + 20] ; [p0,q0,p0,q0] cb line 5,7 458 mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 5 459 shr %8, 16 ; [p0,q0] cb line 7 460 mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 7 461 mov %8, [r7 + 24] ; [p0,q0,p0,q0] cr line 1,3 462 mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 1 463 shr %8, 16 ; [p0,q0] cr line 3 464 mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 3 465 mov %8, [r7 + 28] ; [p0,q0,p0,q0] cr line 5,7 466 mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 5 467 shr %8, 16 ; [p0,q0] cr line 7 468 mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 7 469 sub %1, %3 ; pPixCb -= iStride 470 sub %2, %3 ; pPixCr -= iStride 471 mov %8, [r7 ] ; [p0,q0,p0,q0] cb line 0,2 472 mov [%1 + 0 * %3 - 1], %9 ; store [p0,q0] cb line 0 473 shr %8, 16 ; [p0,q0] cb line 2 474 mov [%1 + 2 * %3 - 1], %9 ; store [p0,q0] cb line 2 475 mov %8, [r7 + 4] ; [p0,q0,p0,q0] cb line 4,6 476 mov [%1 + 4 * %3 - 1], %9 ; store [p0,q0] cb line 4 477 shr %8, 16 ; [p0,q0] cb line 6 478 mov [%1 + 2 * %4 + 1], %9 ; store [p0,q0] cb line 6 479 mov %8, [r7 + 8] ; [p0,q0,p0,q0] cr line 0,2 480 mov [%2 + 0 * %3 - 1], %9 ; store [p0,q0] cr line 0 481 shr %8, 16 ; [p0,q0] cr line 2 482 mov [%2 + 2 * %3 - 1], %9 ; store [p0,q0] cr line 2 483 mov %8, [r7 + 12] ; [p0,q0,p0,q0] cr line 4,6 484 mov [%2 + 4 * %3 - 1], %9 ; store [p0,q0] cr line 4 485 shr %8, 16 ; [p0,q0] cr line 6 486 mov [%2 + 2 * %4 + 1], %9 ; store [p0,q0] cr line 6 487 mov r7, %7 ; restore stack pointer 488%endmacro 489 490; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 pTC=%7 xmmclobber=%8,%9,%10 interleaveTC=%11 491%macro 
SSSE3_DeblockChromaLt4 11 492 movdqa %8, %3 493 SSE2_AbsDiffUB %8, %2, %9 ; |p0 - q0| 494 SSE2_CmpgeUB %8, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha 495 movdqa %9, %4 496 SSE2_AbsDiffUB %9, %3, %5 ; |q1 - q0| 497 movdqa %10, %1 498 SSE2_AbsDiffUB %10, %2, %5 ; |p1 - p0| 499 pmaxub %9, %10 ; max(|q1 - q0|, |p1 - p0|) 500 pxor %10, %10 501 movd %5, %6 502 pshufb %5, %10 ; iBeta 503 SSE2_CmpgeUB %9, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta 504 por %8, %9 ; | !bDeltaP0Q0 505 movd %5, [%7] 506%if %11 507 punpckldq %5, %5 508 punpcklbw %5, %5 ; iTc 509%else 510 pshufd %5, %5, 0 ; iTc 511%endif 512 pcmpeqw %10, %10 ; FFh 513 movdqa %9, %5 514 pcmpgtb %9, %10 ; iTc > -1 ? FFh : 00h 515 pandn %8, %5 ; iTc & bDeltaP0Q0 & bDeltaP1P0 & bDeltaQ1Q0 516 pand %8, %9 ; &= (iTc > -1 ? FFh : 00h) 517 SSE2_DeblockP0Q0_Lt4 %1, %2, %3, %4, %8, %10, %5, %9 518%endmacro 519 520; p1=%1 p0=%2 q0=%3 q1=%4 iAlpha=%5 iBeta=%6 xmmclobber=%7,%8,%9 521%macro SSSE3_DeblockChromaEq4 9 522 movdqa %7, %3 523 SSE2_AbsDiffUB %7, %2, %8 ; |p0 - q0| 524 SSE2_CmpgeUB %7, %5 ; !bDeltaP0Q0 = |p0 - q0| >= iAlpha 525 movdqa %8, %4 526 SSE2_AbsDiffUB %8, %3, %5 ; |q1 - q0| 527 movdqa %9, %1 528 SSE2_AbsDiffUB %9, %2, %5 ; |p1 - p0| 529 pmaxub %8, %9 ; max(|q1 - q0|, |p1 - p0|) 530 pxor %9, %9 531 movd %5, %6 532 pshufb %5, %9 ; iBeta 533 SSE2_CmpgeUB %8, %5 ; !bDeltaQ1Q0 | !bDeltaP1P0 = max(|q1 - q0|, |p1 - p0|) >= iBeta 534 por %7, %8 ; !bDeltaP0Q0P1P0Q1Q0 = !bDeltaP0Q0 | !bDeltaQ1Q0 | !bDeltaP1P0 535 WELS_DB1 %5 536 movdqa %8, %2 537 SSE2_AvgbFloor1 %8, %4, %5, %9 ; (p0 + q1) >> 1 538 pavgb %8, %1 ; p0' = (p1 + ((p0 + q1) >> 1) + 1) >> 1 539 movdqa %9, %7 540 SSE2_Blend %2, %8, %7 ; p0out = bDeltaP0Q0P1P0Q1Q0 ? p0' : p0 541 SSE2_AvgbFloor1 %1, %3, %5, %7 ; (q0 + p1) >> 1 542 pavgb %1, %4 ; q0' = (q1 + ((q0 + p1) >> 1) + 1) >> 1 543 SSE2_Blend %3, %1, %9 ; q0out = bDeltaP0Q0P1P0Q1Q0 ? 
q0' : q0 544%endmacro 545 546 547;****************************************************************************** 548; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 549; int32_t iAlpha, int32_t iBeta, int8_t * pTC); 550;******************************************************************************* 551 552WELS_EXTERN DeblockChromaLt4V_ssse3 553 %assign push_num 0 554 INIT_X86_32_PIC r4 555 LOAD_4_PARA 556 PUSH_XMM 8 557 SIGN_EXTENSION r2, r2d 558 movd xmm7, arg4d 559 pxor xmm0, xmm0 560 pshufb xmm7, xmm0 ; iAlpha 561 mov r3, r2 562 neg r3 ; -iStride 563 564 movq xmm0, [r0 + 0 * r2] ; q0 cb 565 movhps xmm0, [r1 + 0 * r2] ; q0 cr 566 movq xmm2, [r0 + 1 * r3] ; p0 cb 567 movhps xmm2, [r1 + 1 * r3] ; p0 cr 568 movq xmm1, [r0 + 1 * r2] ; q1 cb 569 movhps xmm1, [r1 + 1 * r2] ; q1 cr 570 movq xmm3, [r0 + 2 * r3] ; p1 cb 571 movhps xmm3, [r1 + 2 * r3] ; p1 cr 572 573%ifidni arg6, r5 574 SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, arg6, xmm4, xmm5, xmm6, 1 575%else 576 mov r2, arg6 577 SSSE3_DeblockChromaLt4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, r2, xmm4, xmm5, xmm6, 1 578%endif 579 580 movlps [r0 + 1 * r3], xmm2 ; store p0 cb 581 movhps [r1 + 1 * r3], xmm2 ; store p0 cr 582 movlps [r0 ], xmm0 ; store q0 cb 583 movhps [r1 ], xmm0 ; store q0 cr 584 585 POP_XMM 586 LOAD_4_PARA_POP 587 DEINIT_X86_32_PIC 588 ret 589 590 591;******************************************************************************** 592; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 593; int32_t iAlpha, int32_t iBeta) 594;******************************************************************************** 595 596WELS_EXTERN DeblockChromaEq4V_ssse3 597 %assign push_num 0 598 LOAD_4_PARA 599 PUSH_XMM 8 600 SIGN_EXTENSION r2, r2d 601 movd xmm7, arg4d 602 pxor xmm0, xmm0 603 pshufb xmm7, xmm0 ; iAlpha 604 mov r3, r2 605 neg r3 ; -iStride 606 607 movq xmm0, [r0 + 0 * r2] ; q0 cb 608 movhps xmm0, [r1 + 0 * r2] ; q0 cr 609 movq xmm2, 
[r0 + 1 * r3] ; p0 cb 610 movhps xmm2, [r1 + 1 * r3] ; p0 cr 611 movq xmm1, [r0 + 1 * r2] ; q1 cb 612 movhps xmm1, [r1 + 1 * r2] ; q1 cr 613 movq xmm3, [r0 + 2 * r3] ; p1 cb 614 movhps xmm3, [r1 + 2 * r3] ; p1 cr 615 616 SSSE3_DeblockChromaEq4 xmm3, xmm2, xmm0, xmm1, xmm7, arg5d, xmm4, xmm5, xmm6 617 618 movlps [r0 + 1 * r3], xmm2 ; store p0 cb 619 movhps [r1 + 1 * r3], xmm2 ; store p0 cr 620 movlps [r0 + 0 * r2], xmm0 ; store q0 cb 621 movhps [r1 + 0 * r2], xmm0 ; store q0 cr 622 623 POP_XMM 624 LOAD_4_PARA_POP 625 ret 626 627 628;******************************************************************************* 629; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 630; int32_t iAlpha, int32_t iBeta, int8_t * pTC); 631;******************************************************************************* 632 633WELS_EXTERN DeblockChromaLt4H_ssse3 634 %assign push_num 0 635 LOAD_6_PARA 636 PUSH_XMM 8 637 SIGN_EXTENSION r2, r2d 638 movd xmm7, arg4d 639 pxor xmm0, xmm0 640 pshufb xmm7, xmm0 ; iAlpha 641 lea r3, [3 * r2 - 1] ; 3 * iStride - 1 642 643 SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, r1, r2, r3, xmm2, xmm3, xmm6 644 INIT_X86_32_PIC r1 645 SSSE3_DeblockChromaLt4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, r5, xmm2, xmm3, xmm6, 0 646 DEINIT_X86_32_PIC 647 SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 648 649 POP_XMM 650 LOAD_6_PARA_POP 651 ret 652 653 654;*************************************************************************** 655; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 656; int32_t iAlpha, int32_t iBeta) 657;*************************************************************************** 658 659WELS_EXTERN DeblockChromaEq4H_ssse3 660 %assign push_num 0 661 LOAD_4_PARA 662 PUSH_XMM 8 663 SIGN_EXTENSION r2, r2d 664 movd xmm7, arg4d 665 pxor xmm0, xmm0 666 pshufb xmm7, xmm0 ; iAlpha 667 lea r3, [3 * r2 - 1] ; 3 * iStride - 1 668 669 SSE2_LoadCbCr_4x16H xmm0, xmm1, xmm4, xmm5, r0, 
r1, r2, r3, xmm2, xmm3, xmm6 670 SSSE3_DeblockChromaEq4 xmm0, xmm1, xmm4, xmm5, xmm7, arg5d, xmm2, xmm3, xmm6 671%ifdef X86_32 672 push r4 673 push r5 674 SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 675 pop r5 676 pop r4 677%else 678 SSE2_StoreCbCr_4x16H r0, r1, r2, r3, xmm1, xmm4, r5, r4d, r4w, xmm0 679%endif 680 681 POP_XMM 682 LOAD_4_PARA_POP 683 ret 684 685 686;******************************************************************************** 687; 688; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst); 689; 690;******************************************************************************** 691 692WELS_EXTERN DeblockLumaTransposeH2V_sse2 693 push r3 694 push r4 695 push r5 696 697%assign push_num 3 698 LOAD_3_PARA 699 PUSH_XMM 8 700 701 SIGN_EXTENSION r1, r1d 702 703 mov r5, r7 704 mov r3, r7 705 and r3, 0Fh 706 sub r7, r3 707 sub r7, 10h 708 709 lea r3, [r0 + r1 * 8] 710 lea r4, [r1 * 3] 711 712 movq xmm0, [r0] 713 movq xmm7, [r3] 714 punpcklqdq xmm0, xmm7 715 movq xmm1, [r0 + r1] 716 movq xmm7, [r3 + r1] 717 punpcklqdq xmm1, xmm7 718 movq xmm2, [r0 + r1*2] 719 movq xmm7, [r3 + r1*2] 720 punpcklqdq xmm2, xmm7 721 movq xmm3, [r0 + r4] 722 movq xmm7, [r3 + r4] 723 punpcklqdq xmm3, xmm7 724 725 lea r0, [r0 + r1 * 4] 726 lea r3, [r3 + r1 * 4] 727 movq xmm4, [r0] 728 movq xmm7, [r3] 729 punpcklqdq xmm4, xmm7 730 movq xmm5, [r0 + r1] 731 movq xmm7, [r3 + r1] 732 punpcklqdq xmm5, xmm7 733 movq xmm6, [r0 + r1*2] 734 movq xmm7, [r3 + r1*2] 735 punpcklqdq xmm6, xmm7 736 737 movdqa [r7], xmm0 738 movq xmm7, [r0 + r4] 739 movq xmm0, [r3 + r4] 740 punpcklqdq xmm7, xmm0 741 movdqa xmm0, [r7] 742 743 SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7] 744 ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 745 746 movdqa [r2], xmm4 747 movdqa [r2 + 10h], xmm2 748 movdqa [r2 + 20h], xmm3 749 movdqa [r2 + 30h], xmm7 750 movdqa [r2 + 40h], xmm5 751 movdqa [r2 + 50h], xmm1 752 movdqa [r2 + 60h], xmm6 753 movdqa [r2 + 
70h], xmm0 754 755 mov r7, r5 756 POP_XMM 757 pop r5 758 pop r4 759 pop r3 760 ret 761 762 763;******************************************************************************************* 764; 765; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc); 766; 767;******************************************************************************************* 768 769WELS_EXTERN DeblockLumaTransposeV2H_sse2 770 push r3 771 push r4 772 773%assign push_num 2 774 LOAD_3_PARA 775 PUSH_XMM 8 776 777 SIGN_EXTENSION r1, r1d 778 779 mov r4, r7 780 mov r3, r7 781 and r3, 0Fh 782 sub r7, r3 783 sub r7, 10h 784 785 movdqa xmm0, [r2] 786 movdqa xmm1, [r2 + 10h] 787 movdqa xmm2, [r2 + 20h] 788 movdqa xmm3, [r2 + 30h] 789 movdqa xmm4, [r2 + 40h] 790 movdqa xmm5, [r2 + 50h] 791 movdqa xmm6, [r2 + 60h] 792 movdqa xmm7, [r2 + 70h] 793 794 SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7] 795 ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 796 797 lea r2, [r1 * 3] 798 799 movq [r0], xmm4 800 movq [r0 + r1], xmm2 801 movq [r0 + r1*2], xmm3 802 movq [r0 + r2], xmm7 803 804 lea r0, [r0 + r1*4] 805 movq [r0], xmm5 806 movq [r0 + r1], xmm1 807 movq [r0 + r1*2], xmm6 808 movq [r0 + r2], xmm0 809 810 psrldq xmm4, 8 811 psrldq xmm2, 8 812 psrldq xmm3, 8 813 psrldq xmm7, 8 814 psrldq xmm5, 8 815 psrldq xmm1, 8 816 psrldq xmm6, 8 817 psrldq xmm0, 8 818 819 lea r0, [r0 + r1*4] 820 movq [r0], xmm4 821 movq [r0 + r1], xmm2 822 movq [r0 + r1*2], xmm3 823 movq [r0 + r2], xmm7 824 825 lea r0, [r0 + r1*4] 826 movq [r0], xmm5 827 movq [r0 + r1], xmm1 828 movq [r0 + r1*2], xmm6 829 movq [r0 + r2], xmm0 830 831 832 mov r7, r4 833 POP_XMM 834 pop r4 835 pop r3 836 ret 837 838WELS_EXTERN WelsNonZeroCount_sse2 839 %assign push_num 0 840 LOAD_1_PARA 841 movdqu xmm0, [r0] 842 movq xmm1, [r0+16] 843 WELS_DB1 xmm2 844 pminub xmm0, xmm2 845 pminub xmm1, xmm2 846 movdqu [r0], xmm0 847 movq [r0+16], xmm1 848 ret 849