;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  vaa.asm
;*
;*  Abstract
;*      SSE2 implementations of the pVaa routines
;*
;*  History
;*      04/14/2010  Created
;*      06/07/2010  Added AnalysisVaaInfoIntra_sse2 (and the ssse3 variant)
;*      06/10/2010  Tuned rc_sad_frame_sse2 for roughly a 40% improvement
;*      08/11/2010  Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"


;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

; horizontal sum of 8 words: benchmarking shows this outperforms the
; phaddw (SSSE3) alternative
%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
    ; @sum_8x2 begin
    pshufd  %2, %1, 04Eh ; 01001110 B
    paddw   %1, %2
    pshuflw %2, %1, 04Eh ; 01001110 B
    paddw   %1, %2
    pshuflw %2, %1, 0B1h ; 10110001 B
    paddw   %1, %2
    ; end of @sum_8x2: the low word of %1 now holds the sum of all eight words
%endmacro ; END of SUM_WORD_8x2_SSE2


; reduce a 16x4 strip at r0 (row strides r1, r2 = 2*stride, r3 = 3*stride)
; to the four 4x4-block averages, packed as words in the low quadword of dst
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
    movdqa    %1, [r0]    ; line 0
    movdqa    %2, [r0+r1] ; line 1
    movdqa    %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa    %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw     %1, %4
    paddw     %2, %3
    movdqa    %3, [r0+r2] ; line 2
    movdqa    %4, [r0+r3] ; line 3
    movdqa    %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa    %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw     %3, %6
    paddw     %4, %5
    paddw     %1, %3 ; block 0, 1
    paddw     %2, %4 ; block 2, 3
    pshufd    %3, %1, 0B1h
    pshufd    %4, %2, 0B1h
    paddw     %1, %3
    paddw     %2, %4
    movdqa    %3, %1
    movdqa    %4, %2
    pshuflw   %5, %1, 0B1h
    pshufhw   %6, %3, 0B1h
    paddw     %1, %5
    paddw     %3, %6
    pshuflw   %5, %2, 0B1h
    pshufhw   %6, %4, 0B1h
    paddw     %2, %5
    paddw     %4, %6
    punpcklwd %1, %2
    punpckhwd %3, %4
    punpcklwd %1, %3
    psraw     %1, $04 ; divide each 16-pixel block sum by 16
%endmacro

%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
    movdqa    %1, [r0]    ; line 0
    movdqa    %2, [r0+r1] ; line 1
    movdqa    %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa    %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw     %1, %4
    paddw     %2, %3
    movdqa    %3, [r0+r2] ; line 2
    movdqa    %4, [r0+r3] ; line 3
    movdqa    %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa    %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw     %3, %6
    paddw     %4, %5
    paddw     %1, %3 ; block 0, 1
    paddw     %2, %4 ; block 2, 3
    phaddw    %1, %2   ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
    phaddw    %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
    psraw     %1, $04 ; divide each 16-pixel block sum by 16
%endmacro
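; Note: both VAA_AVG_BLOCK_* macros above reduce one 16x4 strip (rows r0,
; r0+r1, r0+r2, r0+r3) to four word-sized 4x4-block averages packed in the
; low quadword of the destination register. For reference, a scalar C sketch
; of what one expansion computes (illustrative only; the helper name is
; hypothetical, not part of this source):
;
;   // avg[j] = mean of the j-th 4x4 block inside a 16x4 strip at pSrc
;   static void VaaAvgBlockRef (const uint8_t *pSrc, int32_t iStride,
;                               uint16_t avg[4]) {
;     for (int j = 0; j < 4; j++) {
;       int32_t iSum = 0;
;       for (int y = 0; y < 4; y++)
;         for (int x = 0; x < 4; x++)
;           iSum += pSrc[y * iStride + 4 * j + x];
;       avg[j] = (uint16_t) (iSum >> 4); // 16 pixels per block
;     }
;   }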

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2

    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num push_num+4
%endif

    ; reserve a 16-byte-aligned, 32-byte scratch area on the stack
    mov r5, r7
    and r5, 0fh
    sub r7, r5
    sub r7, 32

    mov r2, r1
    sal r2, $01 ; r2 = 2*iLineSize
    mov r3, r2
    add r3, r1  ; r3 = 3*iLineSize

    mov r4, r2
    sal r4, $01 ; r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; average the sixteen 4x4 blocks, one 16x4 strip per macro expansion
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+8], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+24], xmm0

    movdqa xmm0, [r7]    ; block averages 0~7
    movdqa xmm1, [r7+16] ; block averages 8~15
    movdqa xmm2, xmm0
    paddw  xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3 ; low word of xmm0 = sum of all 16 averages

    ; accumulate the squared averages into xmm1 as dwords
    pmullw    xmm1, xmm1
    pmullw    xmm2, xmm2
    movdqa    xmm3, xmm1
    movdqa    xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm1, xmm3
    pshufd    xmm2, xmm1, 01Bh
    paddd     xmm1, xmm2
    pshufd    xmm2, xmm1, 0B1h
    paddd     xmm1, xmm2

    movd r2d, xmm0
    and  r2, 0ffffh ; only the low word is meaningful; mask off the rest
    mov  r3, r2
    imul r2, r3     ; sum^2
    sar  r2, $04    ; sum^2 / 16
    movd retrd, xmm1
    sub  retrd, r2d ; sum_sqr - sum^2/16

    ; release the scratch area
    add r7, 32
    add r7, r5

%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret
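; For reference, a scalar C sketch of the function above (illustrative only;
; the helper name is hypothetical). It returns 16x the variance of the
; sixteen 4x4-block averages of the 16x16 macroblock:
;
;   int32_t AnalysisVaaInfoIntraRef (uint8_t *pDataY, int32_t iLineSize) {
;     uint16_t uiAvg[16];
;     for (int i = 0; i < 4; i++)        // four 16x4 strips
;       for (int j = 0; j < 4; j++) {    // four 4x4 blocks per strip
;         int32_t iSum = 0;
;         for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;             iSum += pDataY[(4 * i + y) * iLineSize + 4 * j + x];
;         uiAvg[4 * i + j] = (uint16_t) (iSum >> 4);
;       }
;     int32_t iSum = 0, iSqrSum = 0;
;     for (int k = 0; k < 16; k++) {
;       iSum    += uiAvg[k];
;       iSqrSum += uiAvg[k] * uiAvg[k];
;     }
;     return iSqrSum - ((iSum * iSum) >> 4);
;   }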
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3

    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num push_num+4
%endif

    ; reserve a 16-byte-aligned, 32-byte scratch area on the stack
    mov r5, r7
    and r5, 0fh
    sub r7, r5
    sub r7, 32

    mov r2, r1
    sal r2, $01 ; r2 = 2*iLineSize
    mov r3, r2
    add r3, r1  ; r3 = 3*iLineSize

    mov r4, r2
    sal r4, $01 ; r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; average the sixteen 4x4 blocks, one 16x4 strip per macro expansion;
    ; destination registers alternate to shorten dependency chains
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+8], xmm1

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+24], xmm1

    movdqa xmm0, [r7]    ; block averages 0~7
    movdqa xmm1, [r7+16] ; block averages 8~15
    movdqa xmm2, xmm0
    paddw  xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3 ; faster than the phaddw-based alternative

    ; accumulate the squared averages into xmm1 as dwords
    pmullw    xmm1, xmm1
    pmullw    xmm2, xmm2
    movdqa    xmm3, xmm1
    movdqa    xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd     xmm1, xmm2
    paddd     xmm3, xmm4
    paddd     xmm1, xmm3
    pshufd    xmm2, xmm1, 01Bh
    paddd     xmm1, xmm2
    pshufd    xmm2, xmm1, 0B1h
    paddd     xmm1, xmm2

    movd r2d, xmm0
    and  r2, 0ffffh ; only the low word is meaningful; mask off the rest
    mov  r3, r2
    imul r2, r3     ; sum^2
    sar  r2, $04    ; sum^2 / 16
    movd retrd, xmm1
    sub  retrd, r2d ; sum_sqr - sum^2/16

    ; release the scratch area
    add r7, 32
    add r7, r5
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret

;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd  xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd  xmm1, xmm2
    psrad  xmm1, 02h ; iAverageSad
    movdqa xmm2, xmm1
    psrad  xmm2, 06h
    movdqa xmm3, xmm0 ; iSadBlock
    psrad  xmm3, 06h
    psubd  xmm3, xmm2
    pmulld xmm3, xmm3 ; pmulld requires SSE4.1
    pshufd xmm4, xmm3, 01Bh
    paddd  xmm4, xmm3
    pshufd xmm3, xmm4, 0B1h
    paddd  xmm3, xmm4
    movd   r0d, xmm3
    cmp    r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
    jb near .threshold_exit
    pshufd   xmm0, xmm0, 01Bh
    pcmpgtd  xmm0, xmm1 ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret

;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd  xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd  xmm1, xmm2
    psrad  xmm1, 02h ; iAverageSad
    movdqa xmm2, xmm1
    psrad  xmm2, 06h
    movdqa xmm3, xmm0 ; iSadBlock
    psrad  xmm3, 06h
    psubd  xmm3, xmm2

    ; emulate pmulld (SSE4.1) with pmuludq; since we are squaring, the low
    ; 32 bits of the unsigned product equal those of the signed product
    movdqa     xmm2, xmm3
    pmuludq    xmm2, xmm3
    pshufd     xmm4, xmm3, 0B1h
    pmuludq    xmm4, xmm4
    movdqa     xmm5, xmm2
    punpckldq  xmm5, xmm4
    punpckhdq  xmm2, xmm4
    punpcklqdq xmm5, xmm2

    pshufd xmm4, xmm5, 01Bh
    paddd  xmm4, xmm5
    pshufd xmm5, xmm4, 0B1h
    paddd  xmm5, xmm4

    movd r0d, xmm5
    cmp  r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
    jb near .threshold_exit
    pshufd   xmm0, xmm0, 01Bh
    pcmpgtd  xmm0, xmm1 ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret
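; For reference, a scalar C sketch covering both MdInterAnalysisVaaInfo
; variants above (illustrative only; the helper name is hypothetical):
;
;   uint8_t MdInterAnalysisVaaInfoRef (int32_t *pSad8x8) {
;     int32_t iAverageSad =
;         (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
;     int32_t iVariance = 0;
;     for (int k = 0; k < 4; k++) {
;       int32_t iDiff = (pSad8x8[k] >> 6) - (iAverageSad >> 6);
;       iVariance += iDiff * iDiff;
;     }
;     if (iVariance < 20)  // INTER_VARIANCE_SAD_THRESHOLD
;       return 15;         // uniform SADs: flag all four 8x8 blocks
;     uint8_t uiMask = 0;
;     for (int k = 0; k < 4; k++)
;       if (pSad8x8[k] > iAverageSad)
;         uiMask |= (uint8_t) (1 << (3 - k)); // pshufd 01Bh reverses the order
;     return uiMask;
;   }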