;*!
;* \copy
;*     Copyright (c)  2010-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  memzero.asm
;*
;*  Abstract
;*      cavlc
;*
;*  History
;*      09/08/2010 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"

; Constant tables. With X86_32_PICASM they are emitted into .text instead of
; .rodata (presumably so that the pic() helper from asm_inc.asm can address
; them relative to the code -- confirm against asm_inc.asm).
%ifdef X86_32_PICASM
SECTION .text align=16
%else
SECTION .rodata align=16
%endif

align 16

; pshufb control vector that reverses the order of the 16 bytes of a register
; (destination byte i takes source byte 15 - i).
wels_shufb_rev:
    db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

; 4-bit table giving number of preceding zeros for each set bit as well as the
; eventual next bit. For the case where all 4 bits are set, this requires 5
; zeros. The 5th zero can either be read from beyond the final table entry or
; implied via zero-initializing the location being read into.
; Layout: 16 entries of 4 bytes, indexed by the 4-bit mask
; (read as [p_run_lut + 4 * mask] by CavlcParamCal_sse42).
wels_cavlc_param_cal_run_lut:
    db 4, 0, 0, 0
    db 0, 3, 0, 0
    db 1, 2, 0, 0
    db 0, 0, 2, 0
    db 2, 1, 0, 0
    db 0, 1, 1, 0
    db 1, 0, 1, 0
    db 0, 0, 0, 1
    db 3, 0, 0, 0
    db 0, 2, 0, 0
    db 1, 1, 0, 0
    db 0, 0, 1, 0
    db 2, 0, 0, 0
    db 0, 1, 0, 0
    db 1, 0, 0, 0
    db 0, 0, 0, 0
;   db 0
; 4-bit table giving pshufb vectors for compacting 4-word vectors by removing
; the words that match zero bits and concatenating in reverse order.
; Layout: 16 entries of 8 bytes, indexed by the 4-bit mask
; (read as [p_shufb_lut + 8 * mask] by CavlcParamCal_sse42).
wels_cavlc_param_cal_shufb_lut:
    db 0, 0, 0, 0, 0, 0, 0, 0
    db 6, 7, 0, 0, 0, 0, 0, 0
    db 4, 5, 0, 0, 0, 0, 0, 0
    db 6, 7, 4, 5, 0, 0, 0, 0
    db 2, 3, 0, 0, 0, 0, 0, 0
    db 6, 7, 2, 3, 0, 0, 0, 0
    db 4, 5, 2, 3, 0, 0, 0, 0
    db 6, 7, 4, 5, 2, 3, 0, 0
    db 0, 1, 0, 0, 0, 0, 0, 0
    db 6, 7, 0, 1, 0, 0, 0, 0
    db 4, 5, 0, 1, 0, 0, 0, 0
    db 6, 7, 4, 5, 0, 1, 0, 0
    db 2, 3, 0, 1, 0, 0, 0, 0
    db 6, 7, 2, 3, 0, 1, 0, 0
    db 4, 5, 2, 3, 0, 1, 0, 0
    db 6, 7, 4, 5, 2, 3, 0, 1


%ifdef X86_32

; Eight bytes of 8. CavlcParamCal_sse2 adds this to the positions found in the
; upper eight coefficients to turn them into indices 8..15.
align 16
sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8

; Byte mask that is 0 at byte 7 and -1 everywhere else. CavlcParamCal_sse2 ANDs
; it with a byte_1pos_table entry to clear the count stored in byte 7.
ALIGN 16
sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1

; 256 entries of 8 bytes, indexed by an 8-bit mask (entry number in the
; trailing comment). Bytes 0..6 list the positions of the set bits of the index
; from the highest bit downwards, padded with zeros; byte 7 holds the number of
; set bits. For index 255 the count (8) therefore occupies the slot that would
; hold the eighth position (bit 0).
align 16
byte_1pos_table:
    db 0,0,0,0,0,0,0,0, ;0
    db 0,0,0,0,0,0,0,1, ;1
    db 1,0,0,0,0,0,0,1, ;2
    db 1,0,0,0,0,0,0,2, ;3
    db 2,0,0,0,0,0,0,1, ;4
    db 2,0,0,0,0,0,0,2, ;5
    db 2,1,0,0,0,0,0,2, ;6
    db 2,1,0,0,0,0,0,3, ;7
    db 3,0,0,0,0,0,0,1, ;8
    db 3,0,0,0,0,0,0,2, ;9
    db 3,1,0,0,0,0,0,2, ;10
    db 3,1,0,0,0,0,0,3, ;11
    db 3,2,0,0,0,0,0,2, ;12
    db 3,2,0,0,0,0,0,3, ;13
    db 3,2,1,0,0,0,0,3, ;14
    db 3,2,1,0,0,0,0,4, ;15
    db 4,0,0,0,0,0,0,1, ;16
    db 4,0,0,0,0,0,0,2, ;17
    db 4,1,0,0,0,0,0,2, ;18
    db 4,1,0,0,0,0,0,3, ;19
    db 4,2,0,0,0,0,0,2, ;20
    db 4,2,0,0,0,0,0,3, ;21
    db 4,2,1,0,0,0,0,3, ;22
    db 4,2,1,0,0,0,0,4, ;23
    db 4,3,0,0,0,0,0,2, ;24
    db 4,3,0,0,0,0,0,3, ;25
    db 4,3,1,0,0,0,0,3, ;26
    db 4,3,1,0,0,0,0,4, ;27
    db 4,3,2,0,0,0,0,3, ;28
    db 4,3,2,0,0,0,0,4, ;29
    db 4,3,2,1,0,0,0,4, ;30
    db 4,3,2,1,0,0,0,5, ;31
    db 5,0,0,0,0,0,0,1, ;32
    db 5,0,0,0,0,0,0,2, ;33
    db 5,1,0,0,0,0,0,2, ;34
    db 5,1,0,0,0,0,0,3, ;35
    db 5,2,0,0,0,0,0,2, ;36
    db 5,2,0,0,0,0,0,3, ;37
    db 5,2,1,0,0,0,0,3, ;38
    db 5,2,1,0,0,0,0,4, ;39
    db 5,3,0,0,0,0,0,2, ;40
    db 5,3,0,0,0,0,0,3, ;41
    db 5,3,1,0,0,0,0,3, ;42
    db 5,3,1,0,0,0,0,4, ;43
    db 5,3,2,0,0,0,0,3, ;44
    db 5,3,2,0,0,0,0,4, ;45
    db 5,3,2,1,0,0,0,4, ;46
    db 5,3,2,1,0,0,0,5, ;47
    db 5,4,0,0,0,0,0,2, ;48
    db 5,4,0,0,0,0,0,3, ;49
    db 5,4,1,0,0,0,0,3, ;50
    db 5,4,1,0,0,0,0,4, ;51
    db 5,4,2,0,0,0,0,3, ;52
    db 5,4,2,0,0,0,0,4, ;53
    db 5,4,2,1,0,0,0,4, ;54
    db 5,4,2,1,0,0,0,5, ;55
    db 5,4,3,0,0,0,0,3, ;56
    db 5,4,3,0,0,0,0,4, ;57
    db 5,4,3,1,0,0,0,4, ;58
    db 5,4,3,1,0,0,0,5, ;59
    db 5,4,3,2,0,0,0,4, ;60
    db 5,4,3,2,0,0,0,5, ;61
    db 5,4,3,2,1,0,0,5, ;62
    db 5,4,3,2,1,0,0,6, ;63
    db 6,0,0,0,0,0,0,1, ;64
    db 6,0,0,0,0,0,0,2, ;65
    db 6,1,0,0,0,0,0,2, ;66
    db 6,1,0,0,0,0,0,3, ;67
    db 6,2,0,0,0,0,0,2, ;68
    db 6,2,0,0,0,0,0,3, ;69
    db 6,2,1,0,0,0,0,3, ;70
    db 6,2,1,0,0,0,0,4, ;71
    db 6,3,0,0,0,0,0,2, ;72
    db 6,3,0,0,0,0,0,3, ;73
    db 6,3,1,0,0,0,0,3, ;74
    db 6,3,1,0,0,0,0,4, ;75
    db 6,3,2,0,0,0,0,3, ;76
    db 6,3,2,0,0,0,0,4, ;77
    db 6,3,2,1,0,0,0,4, ;78
    db 6,3,2,1,0,0,0,5, ;79
    db 6,4,0,0,0,0,0,2, ;80
    db 6,4,0,0,0,0,0,3, ;81
    db 6,4,1,0,0,0,0,3, ;82
    db 6,4,1,0,0,0,0,4, ;83
    db 6,4,2,0,0,0,0,3, ;84
    db 6,4,2,0,0,0,0,4, ;85
    db 6,4,2,1,0,0,0,4, ;86
    db 6,4,2,1,0,0,0,5, ;87
    db 6,4,3,0,0,0,0,3, ;88
    db 6,4,3,0,0,0,0,4, ;89
    db 6,4,3,1,0,0,0,4, ;90
    db 6,4,3,1,0,0,0,5, ;91
    db 6,4,3,2,0,0,0,4, ;92
    db 6,4,3,2,0,0,0,5, ;93
    db 6,4,3,2,1,0,0,5, ;94
    db 6,4,3,2,1,0,0,6, ;95
    db 6,5,0,0,0,0,0,2, ;96
    db 6,5,0,0,0,0,0,3, ;97
    db 6,5,1,0,0,0,0,3, ;98
    db 6,5,1,0,0,0,0,4, ;99
    db 6,5,2,0,0,0,0,3, ;100
    db 6,5,2,0,0,0,0,4, ;101
    db 6,5,2,1,0,0,0,4, ;102
    db 6,5,2,1,0,0,0,5, ;103
    db 6,5,3,0,0,0,0,3, ;104
    db 6,5,3,0,0,0,0,4, ;105
    db 6,5,3,1,0,0,0,4, ;106
    db 6,5,3,1,0,0,0,5, ;107
    db 6,5,3,2,0,0,0,4, ;108
    db 6,5,3,2,0,0,0,5, ;109
    db 6,5,3,2,1,0,0,5, ;110
    db 6,5,3,2,1,0,0,6, ;111
    db 6,5,4,0,0,0,0,3, ;112
    db 6,5,4,0,0,0,0,4, ;113
    db 6,5,4,1,0,0,0,4, ;114
    db 6,5,4,1,0,0,0,5, ;115
    db 6,5,4,2,0,0,0,4, ;116
    db 6,5,4,2,0,0,0,5, ;117
    db 6,5,4,2,1,0,0,5, ;118
    db 6,5,4,2,1,0,0,6, ;119
    db 6,5,4,3,0,0,0,4, ;120
    db 6,5,4,3,0,0,0,5, ;121
    db 6,5,4,3,1,0,0,5, ;122
    db 6,5,4,3,1,0,0,6, ;123
    db 6,5,4,3,2,0,0,5, ;124
    db 6,5,4,3,2,0,0,6, ;125
    db 6,5,4,3,2,1,0,6, ;126
    db 6,5,4,3,2,1,0,7, ;127
    db 7,0,0,0,0,0,0,1, ;128
    db 7,0,0,0,0,0,0,2, ;129
    db 7,1,0,0,0,0,0,2, ;130
    db 7,1,0,0,0,0,0,3, ;131
    db 7,2,0,0,0,0,0,2, ;132
    db 7,2,0,0,0,0,0,3, ;133
    db 7,2,1,0,0,0,0,3, ;134
    db 7,2,1,0,0,0,0,4, ;135
    db 7,3,0,0,0,0,0,2, ;136
    db 7,3,0,0,0,0,0,3, ;137
    db 7,3,1,0,0,0,0,3, ;138
    db 7,3,1,0,0,0,0,4, ;139
    db 7,3,2,0,0,0,0,3, ;140
    db 7,3,2,0,0,0,0,4, ;141
    db 7,3,2,1,0,0,0,4, ;142
    db 7,3,2,1,0,0,0,5, ;143
    db 7,4,0,0,0,0,0,2, ;144
    db 7,4,0,0,0,0,0,3, ;145
    db 7,4,1,0,0,0,0,3, ;146
    db 7,4,1,0,0,0,0,4, ;147
    db 7,4,2,0,0,0,0,3, ;148
    db 7,4,2,0,0,0,0,4, ;149
    db 7,4,2,1,0,0,0,4, ;150
    db 7,4,2,1,0,0,0,5, ;151
    db 7,4,3,0,0,0,0,3, ;152
    db 7,4,3,0,0,0,0,4, ;153
    db 7,4,3,1,0,0,0,4, ;154
    db 7,4,3,1,0,0,0,5, ;155
    db 7,4,3,2,0,0,0,4, ;156
    db 7,4,3,2,0,0,0,5, ;157
    db 7,4,3,2,1,0,0,5, ;158
    db 7,4,3,2,1,0,0,6, ;159
    db 7,5,0,0,0,0,0,2, ;160
    db 7,5,0,0,0,0,0,3, ;161
    db 7,5,1,0,0,0,0,3, ;162
    db 7,5,1,0,0,0,0,4, ;163
    db 7,5,2,0,0,0,0,3, ;164
    db 7,5,2,0,0,0,0,4, ;165
    db 7,5,2,1,0,0,0,4, ;166
    db 7,5,2,1,0,0,0,5, ;167
    db 7,5,3,0,0,0,0,3, ;168
    db 7,5,3,0,0,0,0,4, ;169
    db 7,5,3,1,0,0,0,4, ;170
    db 7,5,3,1,0,0,0,5, ;171
    db 7,5,3,2,0,0,0,4, ;172
    db 7,5,3,2,0,0,0,5, ;173
    db 7,5,3,2,1,0,0,5, ;174
    db 7,5,3,2,1,0,0,6, ;175
    db 7,5,4,0,0,0,0,3, ;176
    db 7,5,4,0,0,0,0,4, ;177
    db 7,5,4,1,0,0,0,4, ;178
    db 7,5,4,1,0,0,0,5, ;179
    db 7,5,4,2,0,0,0,4, ;180
    db 7,5,4,2,0,0,0,5, ;181
    db 7,5,4,2,1,0,0,5, ;182
    db 7,5,4,2,1,0,0,6, ;183
    db 7,5,4,3,0,0,0,4, ;184
    db 7,5,4,3,0,0,0,5, ;185
    db 7,5,4,3,1,0,0,5, ;186
    db 7,5,4,3,1,0,0,6, ;187
    db 7,5,4,3,2,0,0,5, ;188
    db 7,5,4,3,2,0,0,6, ;189
    db 7,5,4,3,2,1,0,6, ;190
    db 7,5,4,3,2,1,0,7, ;191
    db 7,6,0,0,0,0,0,2, ;192
    db 7,6,0,0,0,0,0,3, ;193
    db 7,6,1,0,0,0,0,3, ;194
    db 7,6,1,0,0,0,0,4, ;195
    db 7,6,2,0,0,0,0,3, ;196
    db 7,6,2,0,0,0,0,4, ;197
    db 7,6,2,1,0,0,0,4, ;198
    db 7,6,2,1,0,0,0,5, ;199
    db 7,6,3,0,0,0,0,3, ;200
    db 7,6,3,0,0,0,0,4, ;201
    db 7,6,3,1,0,0,0,4, ;202
    db 7,6,3,1,0,0,0,5, ;203
    db 7,6,3,2,0,0,0,4, ;204
    db 7,6,3,2,0,0,0,5, ;205
    db 7,6,3,2,1,0,0,5, ;206
    db 7,6,3,2,1,0,0,6, ;207
    db 7,6,4,0,0,0,0,3, ;208
    db 7,6,4,0,0,0,0,4, ;209
    db 7,6,4,1,0,0,0,4, ;210
    db 7,6,4,1,0,0,0,5, ;211
    db 7,6,4,2,0,0,0,4, ;212
    db 7,6,4,2,0,0,0,5, ;213
    db 7,6,4,2,1,0,0,5, ;214
    db 7,6,4,2,1,0,0,6, ;215
    db 7,6,4,3,0,0,0,4, ;216
    db 7,6,4,3,0,0,0,5, ;217
    db 7,6,4,3,1,0,0,5, ;218
    db 7,6,4,3,1,0,0,6, ;219
    db 7,6,4,3,2,0,0,5, ;220
    db 7,6,4,3,2,0,0,6, ;221
    db 7,6,4,3,2,1,0,6, ;222
    db 7,6,4,3,2,1,0,7, ;223
    db 7,6,5,0,0,0,0,3, ;224
    db 7,6,5,0,0,0,0,4, ;225
    db 7,6,5,1,0,0,0,4, ;226
    db 7,6,5,1,0,0,0,5, ;227
    db 7,6,5,2,0,0,0,4, ;228
    db 7,6,5,2,0,0,0,5, ;229
    db 7,6,5,2,1,0,0,5, ;230
    db 7,6,5,2,1,0,0,6, ;231
; byte_1pos_table (continued): entries 232..255, same 8-byte layout as above.
    db 7,6,5,3,0,0,0,4, ;232
    db 7,6,5,3,0,0,0,5, ;233
    db 7,6,5,3,1,0,0,5, ;234
    db 7,6,5,3,1,0,0,6, ;235
    db 7,6,5,3,2,0,0,5, ;236
    db 7,6,5,3,2,0,0,6, ;237
    db 7,6,5,3,2,1,0,6, ;238
    db 7,6,5,3,2,1,0,7, ;239
    db 7,6,5,4,0,0,0,4, ;240
    db 7,6,5,4,0,0,0,5, ;241
    db 7,6,5,4,1,0,0,5, ;242
    db 7,6,5,4,1,0,0,6, ;243
    db 7,6,5,4,2,0,0,5, ;244
    db 7,6,5,4,2,0,0,6, ;245
    db 7,6,5,4,2,1,0,6, ;246
    db 7,6,5,4,2,1,0,7, ;247
    db 7,6,5,4,3,0,0,5, ;248
    db 7,6,5,4,3,0,0,6, ;249
    db 7,6,5,4,3,1,0,6, ;250
    db 7,6,5,4,3,1,0,7, ;251
    db 7,6,5,4,3,2,0,6, ;252
    db 7,6,5,4,3,2,0,7, ;253
    db 7,6,5,4,3,2,1,7, ;254
    db 7,6,5,4,3,2,1,8, ;255

%endif ; X86_32

;***********************************************************************
; Code
;***********************************************************************
SECTION .text


%ifdef X86_32

;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; 32-bit x86 only (arguments are read from the stack through arg1..arg5).
;
; Scans a block of 16-bit coefficients from the highest index downwards.
;   coffLevel     in : 16 coefficients (32 bytes, 16-byte aligned, loaded with
;                      movdqa) unless endIdx == 3, in which case only the
;                      first 4 coefficients (8 bytes) are read.
;   Level         out: the non-zero coefficients, highest index first.
;   run           out: 16 bytes written with an aligned store (must be 16-byte
;                      aligned). run[i] = pos[i] - pos[i+1] - 1, where pos[i]
;                      is the index of the i-th stored coefficient and pos[]
;                      is taken as 0 past the last one.
;   total_coeffs  out: number of non-zero coefficients, stored as a full
;                      32-bit value.
;   return (eax)     : (index of the highest non-zero coefficient + 1)
;                      - *total_coeffs, i.e. the number of zero coefficients
;                      below the highest non-zero one.
;
; When every coefficient is zero, *total_coeffs is set to 0 and 0 is returned;
; Level and run are left untouched in that case.
;
; Register use: eax = coffLevel (later the return value), edi = Level cursor,
; ebx = table cursor, edx = non-zero mask / scratch, ecx and esi = counters.
; xmm0, xmm1, xmm3..xmm7 are clobbered.
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
    push        ebx
    push        edi
    push        esi
    %assign push_num 3
    INIT_X86_32_PIC ebp

    mov         eax,    arg1                    ; eax = coffLevel
    mov         edi,    arg3                    ; edi = Level (write cursor)
    mov         ebx,    arg5                    ; ebx = endIdx
    cmp         ebx,    3
    jne         .Level16
    pxor        xmm1,   xmm1                    ; only 4 coefficients: upper 12 are treated as zero
    movq        xmm0,   [eax]
    jmp         .Cal_begin
.Level16:
    movdqa      xmm0,   [eax]
    movdqa      xmm1,   [eax+16]
.Cal_begin:
    ; Build a 16-bit mask in edx with bit i set when coefficient i is non-zero.
    ; The words are saturated to bytes first, which preserves zero/non-zero.
    packsswb    xmm0,   xmm1
    movdqa      xmm4,   xmm0
    pxor        xmm3,   xmm3
    pcmpgtb     xmm0,   xmm3                    ; bytes > 0
    pcmpgtb     xmm3,   xmm4                    ; bytes < 0
    por         xmm0,   xmm3
    pmovmskb    edx,    xmm0
    test        edx,    edx
    jnz         .find_levels

    ; No non-zero coefficient at all: report zero coefficients / zero zeros
    ; instead of leaving *total_coeffs unwritten and eax holding a pointer.
    ; This must stay textually ahead of DEINIT_X86_32_PIC so that arg4 is
    ; expanded with the same push_num that is live at run time.
    mov         edx,    arg4                    ; edx = total_coeffs
    xor         eax,    eax
    mov         [edx],  eax
    jmp         .return

.find_levels:
    movdqa      xmm6,   [pic(sse2_b_1)]         ; mask that clears byte 7 (the count) of a table entry
    pcmpeqw     xmm7,   xmm7                    ; all bytes = -1

    ; --- upper eight coefficients (mask bits 8..15) ---
    movzx       ebx,    dh
    lea         ebx,    [pic(byte_1pos_table+8*ebx)]
    movq        xmm0,   [ebx]                   ; positions (descending) + count in byte 7
    pextrw      ecx,    xmm0, 3
    shr         ecx,    8                       ; ecx = number of non-zero values in the upper half
    mov         dh,     cl                      ; keep the count; the upper mask is no longer needed

.loopHighFind0:
    test        ecx,    ecx
    jz          .loopHighFind0End
    movzx       esi,    byte [ebx]
    ; Only entry 255 uses all eight slots, and its eighth slot holds the count
    ; (8) instead of position 0. Masking with 7 maps it back to position 0 and
    ; leaves every real position (0..7) unchanged, so no out-of-range index is
    ; ever formed.
    and         esi,    7
    movzx       esi,    word [eax+2*esi+16]     ; 16-bit load: never reads past the block
    mov         [edi],  si
    add         edi,    2
    inc         ebx
    dec         ecx
    jmp         .loopHighFind0
.loopHighFind0End:
    mov         cl,     dh                      ; ecx = upper-half count (ecx was 0 here)
    pand        xmm0,   xmm6                    ; drop the count byte, keep the positions

    ; --- lower eight coefficients (mask bits 0..7) ---
    and         edx,    0xff
    lea         ebx,    [pic(byte_1pos_table+8*edx)]
    movq        xmm1,   [ebx]
    pextrw      esi,    xmm1, 3
    or          esi,    0xff
    or          ecx,    0xff00
    and         ecx,    esi                     ; ch = lower-half count, cl = upper-half count
    shr         esi,    8                       ; esi = lower-half count (loop counter)
    pand        xmm1,   xmm6
.loopLowFind0:
    test        esi,    esi
    jz          .loopLowFind0End
    movzx       edx,    byte [ebx]
    and         edx,    7                       ; same treatment of entry 255 as in the upper half
    movzx       edx,    word [eax+2*edx]
    mov         [edi],  dx
    add         edi,    2
    inc         ebx
    dec         esi
    jmp         .loopLowFind0
.loopLowFind0End:
    mov         edx,    arg4                    ; edx = total_coeffs
    movzx       ebx,    cl                      ; ebx = upper-half count
    add         cl,     ch                      ; cl  = total number of non-zero coefficients
    movzx       esi,    cl
    mov         [edx],  esi                     ; total_coeffs is an int32_t: store all four bytes

    ; --- runs ---
    ; Concatenate the upper-half positions (+8) and the lower-half positions
    ; into one descending byte list in xmm0, zero padded.
    movq        xmm5,   [pic(sse2_b8)]
    paddb       xmm0,   xmm5                    ; upper-half positions -> 8..15 (unused slots become 8)
    pxor        xmm2,   xmm2
    pxor        xmm3,   xmm3
    mov         eax,    8
    sub         eax,    ebx
    shl         eax,    3                       ; eax = (8 - upper count) * 8 bits
    shl         ebx,    3                       ; ebx = upper count * 8 bits
    pinsrw      xmm2,   ebx, 0
    pinsrw      xmm3,   eax, 0
    psllq       xmm0,   xmm3                    ; shift up and back down to clear the
    psrlq       xmm0,   xmm3                    ;   unused slots above the upper count
    movdqa      xmm4,   xmm1
    psllq       xmm1,   xmm2                    ; lower positions placed right after the upper ones
    psrlq       xmm4,   xmm3                    ; lower positions that spill into the high qword
    punpcklqdq  xmm1,   xmm4
    por         xmm0,   xmm1

    pextrw      eax,    xmm0, 0
    and         eax,    0xff                    ; index of the highest non-zero coefficient
    inc         eax
    sub         al,     cl                      ; eax = return value
    movdqa      xmm1,   xmm0
    paddb       xmm1,   xmm7                    ; pos[i] - 1
    psrldq      xmm0,   1                       ; pos[i+1]
    psubb       xmm1,   xmm0                    ; pos[i] - 1 - pos[i+1]
    mov         ecx,    arg2                    ; ecx = run
    movdqa      [ecx],  xmm1
.return:
    DEINIT_X86_32_PIC
    pop         esi
    pop         edi
    pop         ebx
    ret
%endif ;%ifdef X86_32

;***********************************************************************
;int32_t CavlcParamCal_sse42(int16_t*coffLevel,
;                            uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; Scans the coefficients from the highest index downwards using a bitmask of
; the non-zero entries, handled four coefficients at a time through
; wels_cavlc_param_cal_shufb_lut and wels_cavlc_param_cal_run_lut.
;   Level         out: non-zero coefficients, highest index first (written
;                      8 bytes at a time with movlps).
;   run           out: first zeroed with a 16-byte aligned store (so it must
;                      be 16-byte aligned), then filled with zero-run bytes
;                      taken from wels_cavlc_param_cal_run_lut.
;   total_coeffs  out: number of non-zero coefficients (32-bit store).
;   return value     : 16 - bsf(mask) - popcnt(mask) on the byte-reversed
;                      non-zero mask, i.e. (index of the highest non-zero
;                      coefficient + 1) - *total_coeffs; 0 when every
;                      coefficient is zero.
; Uses pshufb and popcnt in addition to SSE2 instructions.
;
; All registers are referred to through %define aliases chosen per target
; (X86_32, WIN64, other 64-bit); the aliases are %undef'ed after the ret.
;***********************************************************************

WELS_EXTERN CavlcParamCal_sse42
%define i_endidxd dword arg5d

%ifdef X86_32
    push r3
    push r4
    push r5
    push r6
    %assign push_num 4
%ifdef X86_32_PICASM
    ; r0 is taken as the PIC base register below, so total_coeffs goes in r1.
    %define p_total_coeffs r1
%else
    %define p_total_coeffs r0
%endif
    %define r_tmp r1
    %define r_tmpd r1d
    %define r_tmpb r1b
    %define p_level r2
    %define p_coeff_level r3
    %define p_run r6
    %define r_mask r5
    %define r_maskd r5d
    %define p_shufb_lut pic(wels_cavlc_param_cal_shufb_lut)
    %define p_run_lut pic(wels_cavlc_param_cal_run_lut)
    mov p_coeff_level, arg1
    mov p_run, arg2
    mov p_level, arg3
    mov p_total_coeffs, arg4
%elifdef WIN64
    push rbx
    %assign push_num 1
    %define p_coeff_level r0
    %define p_run r1
    %define p_level r2
    %define p_total_coeffs r3
    %define r_mask rbx
    %define r_maskd ebx
    %define p_shufb_lut r5
    ; The run table is addressed relative to the shuffle table base register.
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
    ; Free up rcx/ecx because only cl is accepted as shift amount operand.
    mov r6, r0
    %undef p_coeff_level
    %define p_coeff_level r6
    %define r_tmp r0
    %define r_tmpd r0d
    %define r_tmpb r0b
%else
    %assign push_num 0
    %define p_coeff_level r0
    %define p_run r1
    %define p_level r2
    %define p_total_coeffs r3
    %define r_mask rax
    %define r_maskd eax
    %define p_shufb_lut r5
    %define i_total_zeros r6
    ; The run table is addressed relative to the shuffle table base register.
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%endif
    INIT_X86_32_PIC_NOPRESERVE r0

    ; Acquire a bitmask indicating which words are non-zero.
    ; Assume p_coeff_level is 16-byte-aligned and at least 32 bytes if endIdx > 3.
    ; Otherwise, assume 8 bytes available. Assume that input beyond endIdx is zero.
    ; Assumptions are taken from previous implementations.
    pxor xmm1, xmm1
    cmp i_endidxd, 3
    jg .load16
    movq xmm0, [p_coeff_level]
    packsswb xmm0, xmm1
    jmp .load_done
.load16:
    movdqa xmm0, [p_coeff_level]
    packsswb xmm0, [p_coeff_level + 16]
.load_done:
    movdqa [p_run], xmm1 ; Zero-initialize because we may read back implied zeros.
    pcmpeqb xmm0, xmm1 ; Bytes become -1 where the coefficient is zero.
    pshufb xmm0, [pic(wels_shufb_rev)] ; Reverse so that mask bit 0 corresponds to coefficient 15.
    pmovmskb r_maskd, xmm0
    xor r_maskd, 0FFFFh ; Invert: set bits now mark non-zero coefficients.
%undef i_endidxd
%define r_tmp2 r4
%define r_tmp2d r4d
    ; popcnt sets ZF when the mask is zero. The mov/push instructions between
    ; here and the "jz .done" below leave the flags untouched, so that branch
    ; still tests this result.
    popcnt r_tmp2d, r_maskd
    mov [p_total_coeffs], r_tmp2d
    ; Recycle p_total_coeffs.
%ifidni p_total_coeffs, rcx
    %define r_tmp rcx
    %define r_tmpd ecx
    %define r_tmpb cl
%else
    %xdefine i_total_zeros p_total_coeffs
%endif
%undef p_total_coeffs
%ifdef X86_32_PICASM
    ; No register left: keep the running return value on the stack.
    push r_tmp2
    %undef i_total_zeros
    %define i_total_zeros dword [esp]
%else
    mov i_total_zeros, r_tmp2
%endif
    jz .done ; All coefficients are zero: return value is the popcnt result, 0.
    bsf r_tmpd, r_maskd ; Find first set bit.
    ; r_tmp2 = -(popcnt + bsf - 16) = 16 - bsf - popcnt.
    lea r_tmp2, [r_tmp2 + r_tmp - 16]
    neg r_tmp2
    mov i_total_zeros, r_tmp2
    ; Skip trailing zeros.
    ; Restrict to multiples of 4 to retain alignment and avoid out-of-bound stores.
    and r_tmpd, -4
    shr r_maskd, r_tmpb
    ; Convert the number of skipped coefficients to bytes and move the source
    ; pointer back by that amount, so that [p_coeff_level + 24] below addresses
    ; the quadruple that holds the highest non-zero coefficient.
    add r_tmpd, r_tmpd
    sub p_coeff_level, r_tmp
    ; Handle first quadruple containing a non-zero value.
    mov r_tmp, r_mask
    and r_tmpd, 0Fh
    movq xmm0, [p_coeff_level + 24]
    movq xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb xmm0, xmm1
    mov r_tmp2d, [p_run_lut + 4 * r_tmp]
    shr r_tmp2d, 8 ; Skip initial zero run.
    movlps [p_level], xmm0 ; Store levels for the first quadruple.
    mov [p_run], r_tmp2d ; Store accompanying zero runs thus far.
    shr r_maskd, 4
    jz .done ; No non-zero coefficient left in the remaining quadruples.
.loop:
    ; Increment pointers.
    popcnt r_tmpd, r_tmpd ; Number of non-zero values handled.
    lea p_level, [p_level + 2 * r_tmp]
    add p_run, r_tmp
    ; Handle next quadruple.
    mov r_tmp, r_mask
    and r_tmpd, 0Fh
    movq xmm0, [p_coeff_level + 16]
    sub p_coeff_level, 8 ; Step down to the next lower quadruple for the following iteration.
    movq xmm1, [p_shufb_lut + 8 * r_tmp]
    pshufb xmm0, xmm1
    movzx r_tmp2d, byte [p_run - 1]
    add r_tmp2d, [p_run_lut + 4 * r_tmp] ; Add to previous run and get eventual new runs.
    movlps [p_level], xmm0 ; Store levels (potentially none).
    mov [p_run - 1], r_tmp2d ; Update previous run and store eventual new runs.
    shr r_maskd, 4
    jnz .loop
.done:
    ; Move the result into the return register unless it already lives there.
%ifnidni retrq, i_total_zeros
  %ifdef X86_32_PICASM
    pop retrq
  %else
    mov retrq, i_total_zeros
  %endif
%endif
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%elifdef WIN64
    pop rbx
%endif
    ret
%undef p_coeff_level
%undef p_run
%undef p_level
%undef i_total_zeros
%undef r_mask
%undef r_maskd
%undef r_tmp
%undef r_tmpd
%undef r_tmpb
%undef r_tmp2
%undef r_tmp2d
%undef p_shufb_lut
%undef p_run_lut