1;*! 2;* \copy 3;* Copyright (c) 2009-2013, Cisco Systems 4;* All rights reserved. 5;* 6;* Redistribution and use in source and binary forms, with or without 7;* modification, are permitted provided that the following conditions 8;* are met: 9;* 10;* * Redistributions of source code must retain the above copyright 11;* notice, this list of conditions and the following disclaimer. 12;* 13;* * Redistributions in binary form must reproduce the above copyright 14;* notice, this list of conditions and the following disclaimer in 15;* the documentation and/or other materials provided with the 16;* distribution. 17;* 18;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29;* POSSIBILITY OF SUCH DAMAGE. 
;*
;*
;*  score.asm
;*
;*  Abstract
;*      scan/score/count of sse2
;*
;*  History
;*      8/21/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

;***********************************************************************
; Macros
;***********************************************************************

;***********************************************************************
; Local Data (Read Only)
;
; When X86_32_PICASM is defined the constants are emitted into .text,
; otherwise into .rodata (presumably so that the pic() accesses in the
; code below can reach them relative to the code address -- confirm
; against asm_inc.asm).
;***********************************************************************
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

; Eight words, each equal to 1.
align 16
sse2_1:                 times 8 dw 1

; Sixteen bytes, each equal to 1.
align 16
sse2_b1:                times 16 db 1

; 16-entry byte table read by WelsCalculateSingleCtr4x4_sse2.
; It directly follows sse2_b1 (no alignment directive of its own).
i_ds_table:             db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

; Word ramps; not referenced by the functions in this part of the file.
align 16
sse2_plane_inc_minus:   dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
sse2_plane_inc:         dw 1, 2, 3, 4, 5, 6, 7, 8
align 16
sse2_plane_dec:         dw 8, 7, 6, 5, 4, 3, 2, 1

; pshufb byte-selection masks used by WelsScan4x4DcAc_ssse3
; (maska for the low eight words, maskb for the high eight words).
align 16
pb_scanacdc_maska:      db 0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13
align 16
pb_scanacdc_maskb:      db 2, 3, 8, 9, 10, 11, 4, 5, 0, 1, 6, 7, 12, 13, 14, 15

; Byte masks; not referenced by the functions in this part of the file.
; NOTE(review): the 128 entries would zero the destination byte if these
; are used as pshufb masks -- confirm against the user of these tables.
align 16
pb_scandc_maska:        db 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13, 0, 1
align 16
pb_scandc_maskb:        db 8, 9, 10, 11, 4, 5, 0, 1, 6, 7, 12, 13, 14, 15, 128, 128

; 256-entry table: entry i holds the number of set bits in the byte i.
; One row per value of the high nibble (row k covers i = 16*k .. 16*k+15).
; Read by WelsGetNoneZeroCount_sse2.
align 16
nozero_count_table:
    db 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
    db 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5
    db 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7
    db 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7
    db 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6
    db 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7
    db 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7
    db 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8

; 256-entry byte table indexed by WelsCalculateSingleCtr4x4_sse2 with
; bits 8..15 of its non-zero coefficient mask.
align 16
high_mask_table:
    db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
    db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
    db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
    db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
    db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
    db  5, 8, 5, 7, 8,11, 6, 8, 8,11
    db  9,11,12,15, 0, 1, 1, 4, 1, 3
    db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
    db  7,10, 8,10,11,14, 3, 4, 4, 7
    db  5, 7, 8,11, 5, 7, 7,10, 8,10
    db 11,14, 6, 7, 8,11, 8,10,11,14
    db  9,11,11,14,12,14,15,18, 0, 0
    db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
    db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
    db  7,10, 5, 7, 7,10, 8,10,11,14
    db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
    db  6, 9, 7, 9,10,13, 5, 6, 7,10
    db  7, 9,10,13, 8,10,10,13,11,13
    db 14,17, 3, 4, 4, 7, 4, 6, 7,10
    db  5, 7, 7,10, 8,10,11,14, 5, 6
    db  7,10, 7, 9,10,13, 8,10,10,13
    db 11,13,14,17, 6, 7, 7,10, 8,10
    db 11,14, 8,10,10,13,11,13,14,17
    db  9,10,11,14,11,13,14,17,12,14
    db 14,17,15,17,18,21

; 256-entry byte table indexed by WelsCalculateSingleCtr4x4_sse2 with
; bits 0..7 of its non-zero coefficient mask.
align 16
low_mask_table:
    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
    db  4, 7, 7,11, 4, 8, 7,11, 8,11
    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
    db  3, 7, 6,10, 7,10,10,14, 4, 7
    db  7,11, 7,10,10,14, 7,11,10,14
    db 11,14,14,18, 0, 4, 3, 7, 3, 6
    db  6,10, 3, 7, 6,10, 7,10,10,14
    db  3, 6, 6,10, 6, 9, 9,13, 6,10
    db  9,13,10,13,13,17, 4, 7, 6,10
    db  7,10,10,14, 6,10, 9,13,10,13
    db 13,17, 7,10,10,14,10,13,13,17
    db 10,14,13,17,14,17,17,21, 0, 3
    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
    db  9,13, 6,10, 9,13,10,13,13,17
    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
    db  8,12, 9,12,12,16, 6, 9, 9,13
    db  9,12,12,16, 9,13,12,16,13,16
    db 16,20, 3, 7, 6,10, 6, 9, 9,13
    db  6,10, 9,13,10,13,13,17, 6, 9
    db  9,13, 9,12,12,16, 9,13,12,16
    db 13,16,16,20, 7,10, 9,13,10,13
    db 13,17, 9,13,12,16,13,16,16,20
    db 10,13,13,17,13,16,16,20,13,17
    db 16,20,17,20,20,24
SECTION .text

;***********************************************************************
; void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
;
; Copies the 16 words at pDct to level[] in a permuted order. Writing the
; input word indices as 0..f, the output sequence is
;     0 1 4 8 5 2 3 6 9 c d a 7 b e f
; Both buffers are accessed with movdqa (16-byte aligned accesses).
; r0/r1 are expected to hold level/pDct after LOAD_2_PARA (macro from
; asm_inc.asm -- TODO confirm). Lane diagrams below list words from the
; highest lane to the lowest.
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_sse2
%ifndef X86_32
    %assign push_num 0
%else
    push        r3                      ; r3 is scratch below; restored before ret
    %assign push_num 1
%endif
    LOAD_2_PARA
    movdqa      xmm0, [r1]              ; 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; f e d c b a 9 8

    ; Exchange words between lanes with GPR round trips:
    ; 7 -> xmm1[2], 5 -> xmm0[7], 8 -> xmm0[5], a -> xmm1[0].
    pextrw      r3d, xmm1, 2            ; r3d = a
    pextrw      r2d, xmm0, 7            ; r2d = 7
    pextrw      r1d, xmm0, 5            ; r1d = 5 (source pointer is no longer needed)
    pinsrw      xmm0, r1d, 7            ; 5 6 5 4 3 2 1 0
    pinsrw      xmm1, r2d, 2            ; f e d c b 7 9 8
    pextrw      r2d, xmm1, 0            ; r2d = 8
    pinsrw      xmm1, r3d, 0            ; f e d c b 7 9 a
    pinsrw      xmm0, r2d, 5            ; 5 6 8 4 3 2 1 0

    ; Final ordering: swap the middle dwords, then rotate one half.
    pshufd      xmm2, xmm0, 0xd8        ; 5 6 3 2 8 4 1 0
    pshufhw     xmm0, xmm2, 0x93        ; 6 3 2 5 8 4 1 0
    pshufd      xmm3, xmm1, 0xd8        ; f e b 7 d c 9 a
    pshuflw     xmm1, xmm3, 0x39        ; f e b 7 a d c 9

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
%ifdef X86_32
    pop         r3
%endif
    ret

;***********************************************************************
; void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;
; Produces the same output ordering as WelsScan4x4DcAc_sse2
; (0 1 4 8 5 2 3 6 9 c d a 7 b e f), using one cross-register word swap
; followed by a pshufb per half with the pb_scanacdc_mask{a,b} tables.
; Both buffers are accessed with movdqa (16-byte aligned accesses).
;***********************************************************************
WELS_EXTERN WelsScan4x4DcAc_ssse3
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_2_PARA
    movdqa      xmm0, [r1]                      ; 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]                   ; f e d c b a 9 8
    pextrw      r2d, xmm0, 7                    ; r2d = 7
    pextrw      r1d, xmm1, 0                    ; r1d = 8 (source pointer is no longer needed)
    pinsrw      xmm0, r1d, 7                    ; 8 6 5 4 3 2 1 0
    pshufb      xmm0, [pic(pb_scanacdc_maska)]  ; 6 3 2 5 8 4 1 0
    pinsrw      xmm1, r2d, 0                    ; f e d c b a 9 7
    pshufb      xmm1, [pic(pb_scanacdc_maskb)]  ; f e b 7 a d c 9

    movdqa      [r0], xmm0
    movdqa      [r0+16], xmm1
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
; void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;
; Like the DcAc scan but without input word 0: the output sequence is
;     1 4 8 5 2 3 6 9 c d a 7 b e f
; followed by a zero in the last output word.
; Both buffers are accessed with movdqa (16-byte aligned accesses).
; Lane diagrams below list words from the highest lane to the lowest;
; '-' marks a zero lane.
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
    %assign push_num 0
    LOAD_2_PARA
    movdqa      xmm0, [r1]              ; 7 6 5 4 3 2 1 0
    movdqa      xmm1, [r1+16]           ; f e d c b a 9 8

    ; Interleave qwords, then dwords, of the two input halves.
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm1              ; b a 9 8 3 2 1 0
    punpckhqdq  xmm2, xmm1              ; f e d c 7 6 5 4
    movdqa      xmm3, xmm0
    punpckldq   xmm0, xmm2              ; 7 6 3 2 5 4 1 0
    punpckhdq   xmm3, xmm2              ; f e b a d c 9 8

    ; Exchange individual words between the two registers.
    pextrw      r2d, xmm0, 7            ; r2d = 7
    pextrw      r1d, xmm0, 3            ; r1d = 5 (source pointer is no longer needed)
    pinsrw      xmm0, r1d, 7            ; 5 6 3 2 5 4 1 0
    pextrw      r1d, xmm3, 4            ; r1d = a
    pinsrw      xmm3, r2d, 4            ; f e b 7 d c 9 8
    pextrw      r2d, xmm3, 0            ; r2d = 8
    pinsrw      xmm0, r2d, 3            ; 5 6 3 2 8 4 1 0
    pinsrw      xmm3, r1d, 0            ; f e b 7 d c 9 a

    pshufhw     xmm1, xmm0, 0x93        ; 6 3 2 5 8 4 1 0
    pshuflw     xmm2, xmm3, 0x39        ; f e b 7 a d c 9

    ; Shift the 16-word sequence down by one word: word 0 falls off the
    ; bottom, word 9 crosses from the high half into the low half, and a
    ; zero enters at the top.
    psrldq      xmm1, 2                 ; - 6 3 2 5 8 4 1
    movdqa      xmm3, xmm2
    pslldq      xmm3, 14                ; 9 - - - - - - -
    psrldq      xmm2, 2                 ; - f e b 7 a d c
    por         xmm1, xmm3              ; 9 6 3 2 5 8 4 1

    movdqa      [r0], xmm1
    movdqa      [r0+16], xmm2
    ret


;***********************************************************************
; int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;
; Builds a 16-bit mask with bit i set when word i of pDct is non-zero,
; then returns the byte sum of three table look-ups:
;     i_ds_table[hi - lo - 1]
;   + low_mask_table[mask & 0xff]
;   + high_mask_table[(mask >> 8) & 0xff]
; where
;   lo = index of the highest set bit among bits 7..1, or 0 if none of
;        those bits is set (bit 0 itself is never tested), and
;   hi = index of the lowest set bit among bits 8..15, or 16 if none.
; pDct is read with movdqa (16-byte aligned accesses).
; The sum is accumulated in r0; on non-X86_32 builds it is copied to
; retrd, on X86_32 it is left in r0 (presumably the return register
; there -- confirm against asm_inc.asm).
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
%ifndef X86_32
    %assign push_num 0
%else
    push        r3                      ; r3 holds the mask below; restored before ret
    %assign push_num 1
%endif
    INIT_X86_32_PIC r4
    LOAD_1_PARA
    pxor        xmm3, xmm3
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]

    packsswb    xmm0, xmm1              ; 16 signed-saturated bytes; non-zero words stay non-zero
    pcmpeqb     xmm0, xmm3              ; 0xff where the coefficient is zero
    xor         r3, r3
    pmovmskb    r3d, xmm0
    xor         r3, 0xffff              ; invert: bit i set <=> coefficient i is non-zero

    xor         r0, r0                  ; r0 = running sum (pointer is no longer needed)
    mov         r1, 8                   ; upward cursor over bits 8..15
    mov         r2, 7                   ; downward cursor over bits 7..1
.scan_low_down:
    bt          r3, r2
    jc          .scan_high_up
    dec         r2
    jnz         .scan_low_down          ; falls through with r2 = 0 when bits 7..1 are clear
.scan_high_up:
    bt          r3, r1
    jc          .gap_known
    inc         r1
    cmp         r1, 16
    jb          .scan_high_up           ; falls through with r1 = 16 when bits 8..15 are clear
.gap_known:
    sub         r1, r2
    sub         r1, 1                   ; r1 = hi - lo - 1, in 0..15
    lea         r2, [pic(i_ds_table)]
    add         r0b, [r2+r1]

    mov         r1, r3
    shr         r1, 8
    and         r1, 0xff                ; r1 = high byte of the mask
    and         r3, 0xff                ; r3 = low byte of the mask
    lea         r2, [pic(low_mask_table)]
    add         r0b, [r2+r3]
    lea         r2, [pic(high_mask_table)]
    add         r0b, [r2+r1]
    DEINIT_X86_32_PIC
%ifndef X86_32
    mov         retrd, r0d
%else
    pop         r3
%endif
    ret

;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;
; Returns how many of the 16 words at level are non-zero.
; level is read with movdqa (16-byte aligned accesses).
; A 16-bit "non-zero" mask is built and its two bytes are looked up in
; nozero_count_table; the two table values are added in retrq.
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
    %assign push_num 0
    INIT_X86_32_PIC r3
    LOAD_1_PARA
    pxor        xmm2, xmm2
    movdqa      xmm0, [r0]
    movdqa      xmm1, [r0+16]
    pcmpeqw     xmm0, xmm2              ; 0xffff where the word is zero
    pcmpeqw     xmm1, xmm2
    packsswb    xmm1, xmm0              ; narrow the 16 word results to 16 bytes
    xor         r1, r1
    pmovmskb    r1d, xmm1
    xor         r1d, 0xffff             ; invert: one set bit per non-zero word

    lea         r0, [pic(nozero_count_table)]   ; input pointer is no longer needed
    mov         r2, r1
    shr         r2, 8                   ; high byte; bits above 15 are already zero, so no extra mask
    and         r1, 0xff                ; low byte
    movzx       r2, byte [r0+r2]
    movzx       r1, byte [r0+r1]
    mov         retrq, r2               ; written only after both look-ups through r0
    add         retrq, r1
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse42(int16_t* level);
;
; Same result as WelsGetNoneZeroCount_sse2, with popcnt applied to the
; 16-bit non-zero mask instead of table look-ups.
; level is read with movdqa / an aligned memory operand.
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse42
    %assign push_num 0
    LOAD_1_PARA
    pxor        xmm1, xmm1
    movdqa      xmm0, [r0]
    packsswb    xmm0, [r0+16]           ; 16 signed-saturated bytes; non-zero words stay non-zero
    pcmpeqb     xmm0, xmm1              ; 0xff where the coefficient is zero
    pmovmskb    retrd, xmm0
    xor         retrd, 0xffff           ; invert: one set bit per non-zero word
    popcnt      retrd, retrd
    ret