;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS 32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; Each macro gathers DCT coefficients (16-bit words) from BLOCK at the
; positions given by consecutive int entries of LUT (the natural-order
; index table).  X0 receives lanes for LUT entries 0-7, X1 (where used)
; lanes for entries 8-15.  N0/N1 are cleared here and later hold the
; per-lane sign masks computed by the callers.

; Load a full group of 16 coefficients: LUT[0..7] -> X0, LUT[8..15] -> X1.
; Clobbers T0/T1.  All 16 lanes of X0/X1 are written, so neither register
; needs pre-zeroing.
%macro LOAD16 0
        pxor    N0, N0
        pxor    N1, N1

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        mov     T1, INT [LUT + 8*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0
        pinsrw  X1, word [BLOCK + T1 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        mov     T1, INT [LUT + 9*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1
        pinsrw  X1, word [BLOCK + T1 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        mov     T1, INT [LUT + 10*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2
        pinsrw  X1, word [BLOCK + T1 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        mov     T1, INT [LUT + 11*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3
        pinsrw  X1, word [BLOCK + T1 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        mov     T1, INT [LUT + 12*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4
        pinsrw  X1, word [BLOCK + T1 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        mov     T1, INT [LUT + 13*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5
        pinsrw  X1, word [BLOCK + T1 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        mov     T1, INT [LUT + 14*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6
        pinsrw  X1, word [BLOCK + T1 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        mov     T1, INT [LUT + 15*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7
        pinsrw  X1, word [BLOCK + T1 * 2], 7
%endmacro

; Load a partial final group of 8 + LENEND coefficients (LENEND in 1..7):
; LUT[0..7] -> X0 unconditionally, then LUT[8..8+LENEND-1] -> the low lanes
; of X1.  X1 is pre-zeroed so the unloaded lanes read as zero.  Each guarded
; load falls through to %%.ELOAD15 as soon as LENEND is exhausted.
; Clobbers T0/T1.
%macro LOAD15 0
        pxor    N0, N0
        pxor    N1, N1
        pxor    X1, X1

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        mov     T1, INT [LUT + 8*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0
        pinsrw  X1, word [BLOCK + T1 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7

        cmp     LENEND, 2
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 9*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 1

        cmp     LENEND, 3
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 10*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 2

        cmp     LENEND, 4
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 11*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 3

        cmp     LENEND, 5
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 12*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 4

        cmp     LENEND, 6
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 13*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 5

        cmp     LENEND, 7
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 14*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Load exactly 8 coefficients: LUT[0..7] -> X0.  All lanes are written, so
; X0 needs no pre-zeroing.  Clobbers T0.
%macro LOAD8 0
        pxor    N0, N0

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7
%endmacro

; Load a partial group of LENEND coefficients (LENEND in 1..7) into the low
; lanes of X0.  X0 is pre-zeroed so unloaded lanes read as zero; guarded
; loads fall through to %%.ELOAD7 once LENEND is exhausted.  Clobbers T1.
%macro LOAD7 0
        pxor    N0, N0
        pxor    X0, X0

        mov     T1, INT [LUT + 0*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 0

        cmp     LENEND, 2
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 1

        cmp     LENEND, 3
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 2

        cmp     LENEND, 4
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 3

        cmp     LENEND, 5
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 4

        cmp     LENEND, 6
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 5

        cmp     LENEND, 7
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Build the 64-bit "zero bits" bitmap for the DCTSIZE2 (64) words stored at
; VALUES: after the final NOT, bit k of the result is set iff VALUES[k] != 0.
; The two 32-bit halves are stored at ZEROBITS[0] and ZEROBITS[1].
; Note xmm7 doubles as the ZERO comparand and the eighth compare result
; (ZERO == mem gives the same mask as mem == ZERO).
; Clobbers xmm0-xmm7, eax, ecx, edx, esi, edi.
%macro REDUCE0 0
        movdqa  xmm0, XMMWORD [VALUES + ( 0*2)]
        movdqa  xmm1, XMMWORD [VALUES + ( 8*2)]
        movdqa  xmm2, XMMWORD [VALUES + (16*2)]
        movdqa  xmm3, XMMWORD [VALUES + (24*2)]
        movdqa  xmm4, XMMWORD [VALUES + (32*2)]
        movdqa  xmm5, XMMWORD [VALUES + (40*2)]
        movdqa  xmm6, XMMWORD [VALUES + (48*2)]

        ; 0xFFFF in each word lane that equals zero
        pcmpeqw xmm0, ZERO
        pcmpeqw xmm1, ZERO
        pcmpeqw xmm2, ZERO
        pcmpeqw xmm3, ZERO
        pcmpeqw xmm4, ZERO
        pcmpeqw xmm5, ZERO
        pcmpeqw xmm6, ZERO
        pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 (== ZERO) vs last 8 words

        ; squeeze the word masks down to byte masks so pmovmskb can
        ; collect one bit per coefficient
        packsswb xmm0, xmm1
        packsswb xmm2, xmm3
        packsswb xmm4, xmm5
        packsswb xmm6, xmm7

        pmovmskb eax, xmm0              ; bits 0-15:  coefficients 0-15 zero?
        pmovmskb ecx, xmm2              ; bits 0-15:  coefficients 16-31 zero?
        pmovmskb edx, xmm4              ; bits 0-15:  coefficients 32-47 zero?
        pmovmskb esi, xmm6              ; bits 0-15:  coefficients 48-63 zero?

        shl     ecx, 16
        shl     esi, 16

        or      eax, ecx                ; low dword: "is zero" bits 0-31
        or      edx, esi                ; high dword: "is zero" bits 32-63

        not     eax                     ; invert: set bit = nonzero coefficient
        not     edx

        mov     edi, ZEROBITS

        mov     INT [edi], eax
        mov     INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits
;
; For each of the Sl coefficients selected by the natural-order LUT, this
; stores |coef| >> Al at values[k] and (|coef| >> Al) ^ signmask at
; values[k + DCTSIZE2] (i.e. the bitwise complement of the shifted magnitude
; when coef was negative, the magnitude itself otherwise).  Unused entries up
; to DCTSIZE2 are zero-filled, then REDUCE0 writes the nonzero-coefficient
; bitmap to *zerobits.

; Register roles (K and LENEND intentionally share eax -- they are live in
; disjoint phases):
%define ZERO    xmm7                    ; all-zero constant
%define X0      xmm0                    ; coefficients 0-7 of current group
%define X1      xmm1                    ; coefficients 8-15 of current group
%define N0      xmm2                    ; sign mask for X0
%define N1      xmm3                    ; sign mask for X1
%define AL      xmm4                    ; shift count Al (in low qword)
%define K       eax                     ; group counter
%define LENEND  eax                     ; Sl & 7 (partial-group length)
%define LUT     ebx                     ; natural-order index table cursor
%define T0      ecx                     ; scratch
%define T1      edx                     ; scratch
%define BLOCK   esi                     ; source coefficient block
%define VALUES  edi                     ; output cursor
%define LEN     ebp                     ; Sl

%define ZEROBITS  INT [esp + 5 * 4]     ; spilled zerobits pointer

        align 32
        GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
        push    ebp
        mov     eax, esp                ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
        mov     [esp], eax
        mov     ebp, esp                ; ebp = aligned ebp
        sub     esp, 4                  ; one stack slot (ZEROBITS)
        push    ebx
        push    ecx
;       push    edx                     ; need not be preserved
        push    esi
        push    edi
        push    ebp

        ; eax still points at the original frame, so arguments are at [eax + n]
        mov     BLOCK, INT [eax + 8]
        mov     LUT, INT [eax + 12]
        mov     VALUES, INT [eax + 24]
        movd    AL, INT [eax + 20]
        mov     T0, INT [eax + 28]
        mov     ZEROBITS, T0
        mov     LEN, INT [eax + 16]
        pxor    ZERO, ZERO
        mov     K, LEN
        and     K, -16
        shr     K, 4                    ; K = number of full 16-coefficient groups
        jz      .ELOOP16
.BLOOP16:
        LOAD16
        pcmpgtw N0, X0                  ; N0 = 0xFFFF per lane where X0 < 0
        pcmpgtw N1, X1
        paddw   X0, N0                  ; abs: (x + mask) ^ mask == -x for
        paddw   X1, N1                  ;   negative lanes, x otherwise
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL                  ; scale magnitudes down by Al bits
        psrlw   X1, AL
        pxor    N0, X0                  ; N0 = shifted value ^ sign mask
        pxor    N1, X1
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
        add     VALUES, 16*2
        add     LUT, 16*SIZEOF_INT
        dec     K
        jnz     .BLOOP16
        test    LEN, 15                 ; no leftover coefficients?
        je      .PADDING
.ELOOP16:
        mov     LENEND, LEN
        and     LENEND, 7

        ; dispatch on the leftover count: 8+LENEND, exactly 8, or 1..7
        test    LEN, 8
        jz      .TRY7
        test    LEN, 7
        jz      .TRY8

        LOAD15
        pcmpgtw N0, X0
        pcmpgtw N1, X1
        paddw   X0, N0
        paddw   X1, N1
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL
        psrlw   X1, AL
        pxor    N0, X0
        pxor    N1, X1
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
        add     VALUES, 16*2
        jmp     .PADDING
.TRY8:
        LOAD8
        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        pxor    N0, X0
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        add     VALUES, 8*2
        jmp     .PADDING
.TRY7:
        LOAD7
        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        pxor    N0, X0
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        add     VALUES, 8*2
.PADDING:
        ; zero-fill values[] from the next 8-aligned position up to DCTSIZE2.
        ; K = ceil(LEN/8) - 8 is <= 0; the loop counts it back up to zero.
        mov     K, LEN
        add     K, 7
        and     K, -8
        shr     K, 3
        sub     K, DCTSIZE2/8
        jz      .EPADDING
        align 16
.ZEROLOOP:
        movdqa  XMMWORD [VALUES + 0], ZERO
        add     VALUES, 8*2
        inc     K
        jnz     .ZEROLOOP
.EPADDING:
        sub     VALUES, DCTSIZE2*2      ; rewind to the start of values[]

        REDUCE0

        pop     ebp
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
        pop     ecx
        pop     ebx
        mov     esp, ebp                ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *bits
;
; Stores |coef| >> Al at absvalues[k] (zero-filled up to DCTSIZE2), writes the
; nonzero bitmap to bits[0..1] via REDUCE0, writes inverted per-group sign
; masks into the byte array starting at bits[2] (pre-filled with all-ones for
; groups never written), and returns in eax the index of the last group
; position whose shifted magnitude equals 1 (the EOB candidate), or 0.

; Register roles (K and LENEND share eax; they are live in disjoint phases):
%define ZERO    xmm7                    ; all-zero constant
%define ONE     xmm5                    ; 0x0001 in every word lane
%define X0      xmm0                    ; coefficients 0-7 of current group
%define X1      xmm1                    ; coefficients 8-15 of current group
%define N0      xmm2                    ; sign mask for X0
%define N1      xmm3                    ; sign mask for X1
%define AL      xmm4                    ; shift count Al (in low qword)
%define K       eax                     ; group counter
%define LENEND  eax                     ; Sl & 7 (partial-group length)
%define LUT     ebx                     ; natural-order index table cursor
%define T0      ecx                     ; scratch
%define T0w     cx                      ; low word of T0 (sign-mask store)
%define T1      edx                     ; scratch
%define BLOCK   esi                     ; source coefficient block
%define VALUES  edi                     ; output cursor
%define KK      ebp                     ; sign-mask byte offset; KK*8 = k

%define ZEROBITS  INT [esp + 5 * 4]     ; spilled bits pointer
%define EOB       INT [esp + 5 * 4 + 4] ; spilled EOB result
%define LEN       INT [esp + 5 * 4 + 8] ; spilled Sl

        align 32
        GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
        push    ebp
        mov     eax, esp                ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
        mov     [esp], eax
        mov     ebp, esp                ; ebp = aligned ebp
        sub     esp, 16                 ; stack slots (ZEROBITS/EOB/LEN + pad)
        push    ebx
        push    ecx
;       push    edx                     ; need not be preserved
        push    esi
        push    edi
        push    ebp

        pcmpeqw ONE, ONE                ; build constant 1 in every word lane
        psrlw   ONE, 15
        mov     BLOCK, INT [eax + 8]
        mov     LUT, INT [eax + 12]
        mov     VALUES, INT [eax + 24]
        movd    AL, INT [eax + 20]
        mov     T0, INT [eax + 28]
        mov     K, INT [eax + 16]
        ; pre-fill both sign-mask dwords (bits[2], bits[3]) with all-ones so
        ; groups that are never written match the inverted-sign convention
        mov     INT [T0 + 2 * SIZEOF_INT], -1
        mov     INT [T0 + 3 * SIZEOF_INT], -1
        mov     ZEROBITS, T0
        mov     LEN, K
        pxor    ZERO, ZERO
        and     K, -16
        mov     EOB, 0
        xor     KK, KK
        shr     K, 4                    ; K = number of full 16-coefficient groups
        jz      .ELOOPR16
.BLOOPR16:
        LOAD16
        pcmpgtw N0, X0                  ; N0 = 0xFFFF per lane where X0 < 0
        pcmpgtw N1, X1
        paddw   X0, N0                  ; abs via (x + mask) ^ mask
        paddw   X1, N1
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL                  ; scale magnitudes down by Al bits
        psrlw   X1, AL
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        pcmpeqw X0, ONE                 ; lanes whose shifted magnitude == 1
        pcmpeqw X1, ONE
        packsswb N0, N1                 ; word masks -> byte masks for pmovmskb
        packsswb X0, X1
        pmovmskb T0, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
        mov     T1, ZEROBITS
        not     T0                      ; sign bits are stored inverted
        mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
        pmovmskb T1, X0                 ; idx = _mm_movemask_epi8(x1);
        bsr     T1, T1                  ; idx = 16 - (__builtin_clz(idx)>>1);
        jz      .CONTINUER16            ; if (idx) {  (bsr sets ZF on zero mask)
        lea     T1, [T1+KK*8]           ; KK*8 == k, base index of this group
        mov     EOB, T1                 ; EOB = k + idx;
.CONTINUER16:
        add     VALUES, 16*2
        add     LUT, 16*SIZEOF_INT
        add     KK, 2                   ; 2 sign-mask bytes per 16 coefficients
        dec     K
        jnz     .BLOOPR16
.ELOOPR16:
        mov     LENEND, LEN

        ; dispatch on the leftover count: 8+LENEND, exactly 8, or 1..7
        test    LENEND, 8
        jz      .TRYR7
        test    LENEND, 7
        jz      .TRYR8

        and     LENEND, 7
        LOAD15
        pcmpgtw N0, X0
        pcmpgtw N1, X1
        paddw   X0, N0
        paddw   X1, N1
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL
        psrlw   X1, AL
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        pcmpeqw X0, ONE
        pcmpeqw X1, ONE
        packsswb N0, N1
        packsswb X0, X1
        pmovmskb T0, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
        mov     T1, ZEROBITS
        not     T0
        mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
        pmovmskb T1, X0                 ; idx = _mm_movemask_epi8(x1);
        bsr     T1, T1                  ; idx = 16 - (__builtin_clz(idx)>>1);
        jz      .CONTINUER15            ; if (idx) {
        lea     T1, [T1+KK*8]
        mov     EOB, T1                 ; EOB = k + idx;
.CONTINUER15:
        add     VALUES, 16*2
        jmp     .PADDINGR
.TRYR8:
        LOAD8

        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        pcmpeqw X0, ONE
        packsswb N0, ZERO               ; high 8 bytes of mask are zero
        packsswb X0, ZERO
        pmovmskb T0, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
        mov     T1, ZEROBITS
        not     T0
        mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
        pmovmskb T1, X0                 ; idx = _mm_movemask_epi8(x1);
        bsr     T1, T1                  ; idx = 16 - (__builtin_clz(idx)>>1);
        jz      .CONTINUER8             ; if (idx) {
        lea     T1, [T1+KK*8]
        mov     EOB, T1                 ; EOB = k + idx;
.CONTINUER8:
        add     VALUES, 8*2
        jmp     .PADDINGR
.TRYR7:
        and     LENEND, 7
        LOAD7

        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        pcmpeqw X0, ONE
        packsswb N0, ZERO
        packsswb X0, ZERO
        pmovmskb T0, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
        mov     T1, ZEROBITS
        not     T0
        mov     word [T1 + 2 * SIZEOF_INT + KK], T0w
        pmovmskb T1, X0                 ; idx = _mm_movemask_epi8(x1);
        bsr     T1, T1                  ; idx = 16 - (__builtin_clz(idx)>>1);
        jz      .CONTINUER7             ; if (idx) {
        lea     T1, [T1+KK*8]
        mov     EOB, T1                 ; EOB = k + idx;
.CONTINUER7:
        add     VALUES, 8*2
.PADDINGR:
        ; zero-fill absvalues[] up to DCTSIZE2, as in the first-pass function
        mov     K, LEN
        add     K, 7
        and     K, -8
        shr     K, 3
        sub     K, DCTSIZE2/8
        jz      .EPADDINGR
        align 16
.ZEROLOOPR:
        movdqa  XMMWORD [VALUES + 0], ZERO
        add     VALUES, 8*2
        inc     K
        jnz     .ZEROLOOPR
.EPADDINGR:
        sub     VALUES, DCTSIZE2*2      ; rewind to the start of absvalues[]

        REDUCE0

        mov     eax, EOB                ; return value

        pop     ebp
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
        pop     ecx
        pop     ebx
        mov     esp, ebp                ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN                             ; NOTE(review): SIGN is never %define'd
                                        ; above -- looks like a harmless
                                        ; leftover; confirm against history
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align 32