;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; Each macro gathers coefficient words from BLOCK using the natural-order
; index table at LUT (one int per coefficient) and inserts them lane by lane
; into X0/X1 with PINSRW.  N0 (and N1 where used) are cleared here so the
; callers can immediately use them as sign masks.  The partial loaders
; (LOAD15/LOAD7) guard each extra lane on LENEND, leaving unloaded lanes 0.

; Load 16 coefficients: LUT[0..7] -> X0 lanes 0..7, LUT[8..15] -> X1 lanes 0..7.
%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0, INT [LUT + 0*SIZEOF_INT]
    mov         T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT + 1*SIZEOF_INT]
    mov         T1, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0, INT [LUT + 2*SIZEOF_INT]
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0, INT [LUT + 3*SIZEOF_INT]
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0, INT [LUT + 4*SIZEOF_INT]
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0, INT [LUT + 5*SIZEOF_INT]
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0, INT [LUT + 6*SIZEOF_INT]
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0, INT [LUT + 7*SIZEOF_INT]
    mov         T1, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

; Load 8 + LENEND coefficients (used when the remaining length is 9..15):
; LUT[0..7] -> X0 unconditionally, LUT[8] -> X1 lane 0, then X1 lanes 1..6
; are filled only while LENEND (the low 3 bits of the length) permits.
; X1 is cleared first so untouched lanes read as zero.
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0, INT [LUT + 0*SIZEOF_INT]
    mov         T1, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    ; X1 lane k is loaded only if LENEND >= k+1; otherwise bail out early.
    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Load exactly 8 coefficients: LUT[0..7] -> X0 lanes 0..7.
%macro LOAD8 0
    pxor        N0, N0

    mov         T0, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

; Load 1..7 coefficients into X0 (used when the remaining length is < 8):
; LUT[0] -> lane 0 unconditionally, lanes 1..6 guarded on LENEND.
; X0 is cleared first so untouched lanes read as zero.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    ; X0 lane k is loaded only if LENEND >= k+1; otherwise bail out early.
    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Compute the 64-bit "zerobits" bitmap over the 64 words at VALUES:
; bit k is set iff VALUES[k] != 0.  Each value is compared against zero,
; the 16-bit compare results are packed to bytes, PMOVMSKB collects one
; bit per byte, and the two resulting 32-bit masks are inverted and stored
; at ZEROBITS.  NOTE: the final pcmpeqw uses xmm7 (the callers' ZERO
; register) as its destination, clobbering it -- the macro must therefore
; run after ZERO is no longer needed.  Clobbers eax, ecx, edx, esi, edi.
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 == ZERO; compares last 8 values

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         ecx, 16
    shl         esi, 16

    or          eax, ecx                ; eax = "is zero" bits for values 0..31
    or          edx, esi                ; edx = "is zero" bits for values 32..63

    not         eax                     ; invert: bit set => value != 0
    not         edx

    mov         edi, ZEROBITS

    mov         INT [edi], eax
    mov         INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO    xmm7
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax                     ; K and LENEND share eax (used in
%define LENEND  eax                     ; disjoint phases of the function)
%define LUT     ebx
%define T0      ecx
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define LEN     ebp

; Scratch dword reserved by "sub esp, 4" below, above the five pushed regs.
%define ZEROBITS  INT [esp + 5 * 4]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 4                       ; reserve ZEROBITS slot
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp

    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]      ; AL = Al (point-transform shift count)
    mov         T0, INT [eax + 28]
    mov         ZEROBITS, T0
    mov         LEN, INT [eax + 16]     ; LEN = Sl (number of coefficients)
    pxor        ZERO, ZERO
    mov         K, LEN
    and         K, -16
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOP16
; Main loop: for each group of 16, compute |coef| >> Al into VALUES[k] and
; the sign-adjusted value (abs ^ signmask) into VALUES[k + DCTSIZE2].
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0 = 0xFFFF where X0 < 0 (N0 was 0)
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; two's-complement abs: (x - 1) ^ -1 = -x
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; point transform: abs >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N0 = shifted value ^ sign mask
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
.ELOOP16:
    ; Handle the 0..15 remaining coefficients; LENEND = LEN & 7 selects how
    ; many beyond the last full group of 8.
    mov         LENEND, LEN
    and         LENEND, 7

    test        LEN, 8
    jz          .TRY7                   ; remainder < 8
    test        LEN, 7
    jz          .TRY8                   ; remainder == 8 exactly

    LOAD15                              ; remainder is 9..15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; Zero-fill VALUES up to DCTSIZE2 entries.  K = ceil(LEN/8) - DCTSIZE2/8
    ; (a non-positive count of 8-word groups still to clear).
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind to the start of the buffer

    REDUCE0                             ; write the 64-bit zerobits bitmap

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; NOTE(review): LENEND is not in this %undef list (it aliases eax like K);
; the refine function below re-%defines it, which NASM permits.
%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits

%define ZERO    xmm7
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax                     ; K and LENEND share eax (used in
%define LENEND  eax                     ; disjoint phases of the function)
%define LUT     ebx
%define T0      ecx
%define T0w     cx                      ; low word of T0, for 16-bit stores
%define T1      edx
%define BLOCK   esi
%define VALUES  edi
%define KK      ebp                     ; KK = k >> 3 (2 per 16-coefficient group)

; Scratch dwords reserved by "sub esp, 16" below, above the five pushed regs.
%define ZEROBITS  INT [esp + 5 * 4]
%define EOB       INT [esp + 5 * 4 + 4]
%define LEN       INT [esp + 5 * 4 + 8]

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    sub         esp, 16                      ; reserve ZEROBITS/EOB/LEN slots
    push        ebx
    push        ecx
;   push        edx                     ; need not be preserved
    push        esi
    push        edi
    push        ebp

    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; ONE = eight words of 1
    mov         BLOCK, INT [eax + 8]
    mov         LUT, INT [eax + 12]
    mov         VALUES, INT [eax + 24]
    movd        AL, INT [eax + 20]      ; AL = Al (point-transform shift count)
    mov         T0, INT [eax + 28]
    mov         K, INT [eax + 16]       ; K = Sl
    mov         INT [T0 + 2 * SIZEOF_INT], -1  ; pre-fill the sign-bit words
    mov         INT [T0 + 3 * SIZEOF_INT], -1  ; (bits[2..3]) with all ones
    mov         ZEROBITS, T0
    mov         LEN, K
    pxor        ZERO, ZERO
    and         K, -16
    mov         EOB, 0
    xor         KK, KK
    shr         K, 4                    ; K = number of full 16-coefficient groups
    jz          .ELOOPR16
; Main loop: for each group of 16, store |coef| >> Al into VALUES[k], record
; the (inverted) sign bits into the word array at ZEROBITS + 2*SIZEOF_INT,
; and track EOB, the position just past the last value that equals 1.
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0 = 0xFFFF where X0 < 0 (N0 was 0)
    pcmpgtw     N1, X1
    paddw       X0, N0                  ; two's-complement abs: (x - 1) ^ -1 = -x
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL                  ; point transform: abs >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; flag lanes whose shifted value == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {  (bsr sets ZF when mask == 0)
    lea         T1, [T1+KK*8]           ; KK*8 == k, the group's base index
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 2
    dec         K
    jnz         .BLOOPR16
.ELOOPR16:
    ; Handle the 0..15 remaining coefficients.
    mov         LENEND, LEN

    test        LENEND, 8
    jz          .TRYR7                  ; remainder < 8
    test        LENEND, 7
    jz          .TRYR8                  ; remainder == 8 exactly

    and         LENEND, 7
    LOAD15                              ; remainder is 9..15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only 8 lanes valid; high half from ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    and         LENEND, 7
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO                ; only 8 lanes valid; high half from ZERO
    packsswb    X0, ZERO
    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    mov         T1, ZEROBITS
    not         T0
    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    lea         T1, [T1+KK*8]
    mov         EOB, T1                 ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; Zero-fill VALUES up to DCTSIZE2 entries.  K = ceil(LEN/8) - DCTSIZE2/8
    ; (a non-positive count of 8-word groups still to clear).
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    sub         VALUES, DCTSIZE2*2      ; rewind to the start of the buffer

    REDUCE0                             ; write the 64-bit zerobits bitmap

    mov         eax, EOB                ; return EOB

    pop         ebp
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
    pop         ecx
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; NOTE(review): SIGN below has no matching %define in this file (%undef of an
; unknown macro is harmless in NASM); T0w is not undefined here either.
%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32