;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; Register roles (bound via %define by the functions below, before expansion):
;   BLOCK  - pointer to the JCOEF (16-bit) coefficient block
;   LUT    - pointer to the natural-order index table (one int per entry)
;   X0/X1  - receive the gathered coefficients (words 0-7 / 8-15)
;   N0/N1  - cleared here; callers reuse them as sign masks
;   T0/T1  - integer scratch registers (clobbered)
;   LENEND - count of trailing coefficients mod 8 (LOAD15/LOAD7 only)

; LOAD16: gather 16 coefficients BLOCK[LUT[0..15]] into X0 (0-7) / X1 (8-15).
; The two gather chains are interleaved to overlap the dependent load pairs.
%macro LOAD16 0
        pxor    N0, N0
        pxor    N1, N1

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        mov     T1, INT [LUT + 8*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0
        pinsrw  X1, word [BLOCK + T1 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        mov     T1, INT [LUT + 9*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1
        pinsrw  X1, word [BLOCK + T1 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        mov     T1, INT [LUT + 10*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2
        pinsrw  X1, word [BLOCK + T1 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        mov     T1, INT [LUT + 11*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3
        pinsrw  X1, word [BLOCK + T1 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        mov     T1, INT [LUT + 12*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4
        pinsrw  X1, word [BLOCK + T1 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        mov     T1, INT [LUT + 13*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5
        pinsrw  X1, word [BLOCK + T1 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        mov     T1, INT [LUT + 14*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6
        pinsrw  X1, word [BLOCK + T1 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        mov     T1, INT [LUT + 15*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7
        pinsrw  X1, word [BLOCK + T1 * 2], 7
%endmacro

; LOAD15: gather a trailing group of 9-15 coefficients.  X0 words 0-7 and
; X1 word 0 (coefficient 8) are always loaded; X1 words 1-6 are loaded only
; while LENEND (coefficients beyond the first 8) admits them.  X1 is
; pre-zeroed so any slot not loaded reads as 0.  %%.ELOAD15 is a NASM
; macro-local label, so each expansion gets its own copy.
%macro LOAD15 0
        pxor    N0, N0
        pxor    N1, N1
        pxor    X1, X1

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        mov     T1, INT [LUT + 8*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0
        pinsrw  X1, word [BLOCK + T1 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7

        cmp     LENEND, 2
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 9*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 1

        cmp     LENEND, 3
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 10*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 2

        cmp     LENEND, 4
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 11*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 3

        cmp     LENEND, 5
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 12*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 4

        cmp     LENEND, 6
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 13*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 5

        cmp     LENEND, 7
        jl      %%.ELOAD15
        mov     T1, INT [LUT + 14*SIZEOF_INT]
        pinsrw  X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; LOAD8: gather exactly 8 coefficients BLOCK[LUT[0..7]] into X0.
; X0 is fully overwritten, so it is not pre-zeroed.
%macro LOAD8 0
        pxor    N0, N0

        mov     T0, INT [LUT + 0*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 0

        mov     T0, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 1

        mov     T0, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 2

        mov     T0, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 3

        mov     T0, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 4

        mov     T0, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 5

        mov     T0, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 6

        mov     T0, INT [LUT + 7*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T0 * 2], 7
%endmacro

; LOAD7: gather a trailing group of 1-7 coefficients into pre-zeroed X0.
; Word 0 is loaded unconditionally; words 1-6 are guarded by LENEND.
; NOTE(review): the unconditional first load assumes at least one
; coefficient remains (Sl >= 1) -- presumably guaranteed by the caller in
; jcphuff.c; confirm before reusing these macros elsewhere.
%macro LOAD7 0
        pxor    N0, N0
        pxor    X0, X0

        mov     T1, INT [LUT + 0*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 0

        cmp     LENEND, 2
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 1*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 1

        cmp     LENEND, 3
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 2*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 2

        cmp     LENEND, 4
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 3*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 3

        cmp     LENEND, 5
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 4*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 4

        cmp     LENEND, 6
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 5*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 5

        cmp     LENEND, 7
        jl      %%.ELOAD7
        mov     T1, INT [LUT + 6*SIZEOF_INT]
        pinsrw  X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; REDUCE0: build the 64-bit "zerobits" map for the 64 words at VALUES.
; Each word is compared for equality with zero; the per-word masks are
; packed to bytes (packsswb preserves the all-ones/all-zeros masks) and
; their sign bits collected with pmovmskb into two 32-bit masks.  The
; masks are inverted so that a 1 bit means "value is nonzero", then
; stored to the two ints at ZEROBITS.  Requires ZERO (xmm7) == 0 on
; entry; clobbers xmm0-xmm7, eax, ecx, edx, esi, edi, and flags.
%macro REDUCE0 0
        movdqa  xmm0, XMMWORD [VALUES + ( 0*2)]
        movdqa  xmm1, XMMWORD [VALUES + ( 8*2)]
        movdqa  xmm2, XMMWORD [VALUES + (16*2)]
        movdqa  xmm3, XMMWORD [VALUES + (24*2)]
        movdqa  xmm4, XMMWORD [VALUES + (32*2)]
        movdqa  xmm5, XMMWORD [VALUES + (40*2)]
        movdqa  xmm6, XMMWORD [VALUES + (48*2)]

        pcmpeqw xmm0, ZERO
        pcmpeqw xmm1, ZERO
        pcmpeqw xmm2, ZERO
        pcmpeqw xmm3, ZERO
        pcmpeqw xmm4, ZERO
        pcmpeqw xmm5, ZERO
        pcmpeqw xmm6, ZERO
        ; xmm7 is ZERO itself; pcmpeqw is commutative, so compare the last
        ; eight values directly against the zero register (saves a load,
        ; but note this destroys ZERO).
        pcmpeqw xmm7, XMMWORD [VALUES + (56*2)]

        packsswb xmm0, xmm1
        packsswb xmm2, xmm3
        packsswb xmm4, xmm5
        packsswb xmm6, xmm7

        pmovmskb eax, xmm0
        pmovmskb ecx, xmm2
        pmovmskb edx, xmm4
        pmovmskb esi, xmm6

        ; Merge the four 16-bit masks into two 32-bit halves.
        shl     ecx, 16
        shl     esi, 16

        or      eax, ecx
        or      edx, esi

        ; Invert: 1 bit <=> nonzero coefficient.
        not     eax
        not     edx

        mov     edi, ZEROBITS

        mov     INT [edi], eax
        mov     INT [edi+SIZEOF_INT], edx
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; Arguments are reached through eax, which the prologue points at the saved
; ebp (so eax + 4 is the return address and eax + 8 the first argument):
;
; eax + 8 = const JCOEF *block
; eax + 12 = const int *jpeg_natural_order_start
; eax + 16 = int Sl
; eax + 20 = int Al
; eax + 24 = JCOEF *values
; eax + 28 = size_t *zerobits

%define ZERO xmm7                       ; all-zero constant (until REDUCE0)
%define X0 xmm0                         ; coefficients 0-7 of current group
%define X1 xmm1                         ; coefficients 8-15 of current group
%define N0 xmm2                         ; sign mask / complemented values, 0-7
%define N1 xmm3                         ; sign mask / complemented values, 8-15
%define AL xmm4                         ; Al (point-transform shift count)
%define K eax                           ; loop counter (aliases LENEND)
%define LENEND eax                      ; Sl & 7 in the tail code (aliases K)
%define LUT ebx                         ; natural-order table cursor
%define T0 ecx                          ; scratch
%define T1 edx                          ; scratch
%define BLOCK esi                       ; coefficient block pointer
%define VALUES edi                      ; output cursor
%define LEN ebp                         ; Sl (total coefficient count)

; The 4-byte slot reserved by "sub esp, 4" below, addressed past the five
; pushed registers (ebx/ecx/esi/edi/ebp = 5 * 4 bytes).
%define ZEROBITS INT [esp + 5 * 4]

        align 32
        GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
        ; Prologue: build a 16-byte-aligned frame (movdqa needs aligned
        ; spills) while keeping a path back to the caller's esp.
        push    ebp
        mov     eax, esp                ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
        mov     [esp], eax
        mov     ebp, esp                ; ebp = aligned ebp
        sub     esp, 4                  ; reserve the ZEROBITS slot
        push    ebx
        push    ecx
;       push    edx                     ; need not be preserved
        push    esi
        push    edi
        push    ebp

        mov     BLOCK, INT [eax + 8]
        mov     LUT, INT [eax + 12]
        mov     VALUES, INT [eax + 24]
        movd    AL, INT [eax + 20]
        mov     T0, INT [eax + 28]
        mov     ZEROBITS, T0
        mov     LEN, INT [eax + 16]
        pxor    ZERO, ZERO
        mov     K, LEN
        and     K, -16
        shr     K, 4                    ; K = number of full 16-coef groups
        jz      .ELOOP16
; Main loop: for each group of 16, emit abs(coef) >> Al to values[k] and
; the sign-complemented copy (value for coef >= 0, ~value for coef < 0)
; to values[k + DCTSIZE2], matching the scalar code in jcphuff.c.
.BLOOP16:
        LOAD16
        pcmpgtw N0, X0                  ; N = 0xFFFF where coef < 0
        pcmpgtw N1, X1
        paddw   X0, N0                  ; X = (X + N) ^ N  ==  abs(X)
        paddw   X1, N1
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL                  ; point transform: abs(coef) >> Al
        psrlw   X1, AL
        pxor    N0, X0                  ; N = X for coef >= 0, ~X for coef < 0
        pxor    N1, X1
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
        add     VALUES, 16*2
        add     LUT, 16*SIZEOF_INT
        dec     K
        jnz     .BLOOP16
        test    LEN, 15                 ; any tail coefficients left?
        je      .PADDING
; Tail: dispatch on the remainder (Sl & 15):
;   bit 3 clear       -> 1-7 left  -> LOAD7
;   bit 3 set, low 0  -> exactly 8 -> LOAD8
;   otherwise         -> 9-15 left -> LOAD15
.ELOOP16:
        mov     LENEND, LEN
        and     LENEND, 7

        test    LEN, 8
        jz      .TRY7
        test    LEN, 7
        jz      .TRY8

        LOAD15
        pcmpgtw N0, X0
        pcmpgtw N1, X1
        paddw   X0, N0
        paddw   X1, N1
        pxor    X0, N0
        pxor    X1, N1
        psrlw   X0, AL
        psrlw   X1, AL
        pxor    N0, X0
        pxor    N1, X1
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (8) * 2], X1
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        movdqa  XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
        add     VALUES, 16*2
        jmp     .PADDING
.TRY8:
        LOAD8
        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        pxor    N0, X0
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        add     VALUES, 8*2
        jmp     .PADDING
.TRY7:
        LOAD7
        pcmpgtw N0, X0
        paddw   X0, N0
        pxor    X0, N0
        psrlw   X0, AL
        pxor    N0, X0
        movdqa  XMMWORD [VALUES + (0) * 2], X0
        movdqa  XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
        add     VALUES, 8*2
; Zero-pad values[] out to DCTSIZE2 entries so REDUCE0 can scan a fixed 64.
; K becomes -(number of 8-word groups still to clear); count up to zero.
.PADDING:
        mov     K, LEN
        add     K, 7
        and     K, -8
        shr     K, 3                    ; K = ceil(Sl / 8)
        sub     K, DCTSIZE2/8
        jz      .EPADDING
        align 16
.ZEROLOOP:
        movdqa  XMMWORD [VALUES + 0], ZERO
        add     VALUES, 8*2
        inc     K
        jnz     .ZEROLOOP
.EPADDING:
        sub     VALUES, DCTSIZE2*2      ; rewind to the start of values[]

        REDUCE0                         ; write the zerobits bitmap

        ; Epilogue: restore registers, then unwind the aligned frame.
        pop     ebp
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
        pop     ecx
        pop     ebx
        mov     esp, ebp                ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T1
%undef BLOCK
%undef VALUES
%undef LEN
422; 423; Prepare data for jsimd_encode_mcu_AC_refine(). 424; 425; GLOBAL(int) 426; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block, 427; const int *jpeg_natural_order_start, 428; int Sl, int Al, JCOEF *absvalues, 429; size_t *bits) 430; 431; eax + 8 = const JCOEF *block 432; eax + 12 = const int *jpeg_natural_order_start 433; eax + 16 = int Sl 434; eax + 20 = int Al 435; eax + 24 = JCOEF *values 436; eax + 28 = size_t *bits 437 438%define ZERO xmm7 439%define ONE xmm5 440%define X0 xmm0 441%define X1 xmm1 442%define N0 xmm2 443%define N1 xmm3 444%define AL xmm4 445%define K eax 446%define LENEND eax 447%define LUT ebx 448%define T0 ecx 449%define T0w cx 450%define T1 edx 451%define BLOCK esi 452%define VALUES edi 453%define KK ebp 454 455%define ZEROBITS INT [esp + 5 * 4] 456%define EOB INT [esp + 5 * 4 + 4] 457%define LEN INT [esp + 5 * 4 + 8] 458 459 align 32 460 GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2) 461 462EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2): 463 push ebp 464 mov eax, esp ; eax = original ebp 465 sub esp, byte 4 466 and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits 467 mov [esp], eax 468 mov ebp, esp ; ebp = aligned ebp 469 sub esp, 16 470 push ebx 471 push ecx 472; push edx ; need not be preserved 473 push esi 474 push edi 475 push ebp 476 477 pcmpeqw ONE, ONE 478 psrlw ONE, 15 479 mov BLOCK, INT [eax + 8] 480 mov LUT, INT [eax + 12] 481 mov VALUES, INT [eax + 24] 482 movd AL, INT [eax + 20] 483 mov T0, INT [eax + 28] 484 mov K, INT [eax + 16] 485 mov INT [T0 + 2 * SIZEOF_INT], -1 486 mov INT [T0 + 3 * SIZEOF_INT], -1 487 mov ZEROBITS, T0 488 mov LEN, K 489 pxor ZERO, ZERO 490 and K, -16 491 mov EOB, 0 492 xor KK, KK 493 shr K, 4 494 jz .ELOOPR16 495.BLOOPR16: 496 LOAD16 497 pcmpgtw N0, X0 498 pcmpgtw N1, X1 499 paddw X0, N0 500 paddw X1, N1 501 pxor X0, N0 502 pxor X1, N1 503 psrlw X0, AL 504 psrlw X1, AL 505 movdqa XMMWORD [VALUES + (0) * 2], X0 506 movdqa XMMWORD [VALUES + (8) * 2], X1 507 pcmpeqw X0, ONE 508 
pcmpeqw X1, ONE 509 packsswb N0, N1 510 packsswb X0, X1 511 pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); 512 mov T1, ZEROBITS 513 not T0 514 mov word [T1 + 2 * SIZEOF_INT + KK], T0w 515 pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); 516 bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); 517 jz .CONTINUER16 ; if (idx) { 518 lea T1, [T1+KK*8] 519 mov EOB, T1 ; EOB = k + idx; 520.CONTINUER16: 521 add VALUES, 16*2 522 add LUT, 16*SIZEOF_INT 523 add KK, 2 524 dec K 525 jnz .BLOOPR16 526 test LEN, 15 527 je .PADDINGR 528.ELOOPR16: 529 mov LENEND, LEN 530 531 test LENEND, 8 532 jz .TRYR7 533 test LENEND, 7 534 jz .TRYR8 535 536 and LENEND, 7 537 LOAD15 538 pcmpgtw N0, X0 539 pcmpgtw N1, X1 540 paddw X0, N0 541 paddw X1, N1 542 pxor X0, N0 543 pxor X1, N1 544 psrlw X0, AL 545 psrlw X1, AL 546 movdqa XMMWORD [VALUES + (0) * 2], X0 547 movdqa XMMWORD [VALUES + (8) * 2], X1 548 pcmpeqw X0, ONE 549 pcmpeqw X1, ONE 550 packsswb N0, N1 551 packsswb X0, X1 552 pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); 553 mov T1, ZEROBITS 554 not T0 555 mov word [T1 + 2 * SIZEOF_INT + KK], T0w 556 pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); 557 bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); 558 jz .CONTINUER15 ; if (idx) { 559 lea T1, [T1+KK*8] 560 mov EOB, T1 ; EOB = k + idx; 561.CONTINUER15: 562 add VALUES, 16*2 563 jmp .PADDINGR 564.TRYR8: 565 LOAD8 566 567 pcmpgtw N0, X0 568 paddw X0, N0 569 pxor X0, N0 570 psrlw X0, AL 571 movdqa XMMWORD [VALUES + (0) * 2], X0 572 pcmpeqw X0, ONE 573 packsswb N0, ZERO 574 packsswb X0, ZERO 575 pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); 576 mov T1, ZEROBITS 577 not T0 578 mov word [T1 + 2 * SIZEOF_INT + KK], T0w 579 pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); 580 bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); 581 jz .CONTINUER8 ; if (idx) { 582 lea T1, [T1+KK*8] 583 mov EOB, T1 ; EOB = k + idx; 584.CONTINUER8: 585 add VALUES, 8*2 586 jmp .PADDINGR 587.TRYR7: 588 and LENEND, 7 589 
LOAD7 590 591 pcmpgtw N0, X0 592 paddw X0, N0 593 pxor X0, N0 594 psrlw X0, AL 595 movdqa XMMWORD [VALUES + (0) * 2], X0 596 pcmpeqw X0, ONE 597 packsswb N0, ZERO 598 packsswb X0, ZERO 599 pmovmskb T0, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg); 600 mov T1, ZEROBITS 601 not T0 602 mov word [T1 + 2 * SIZEOF_INT + KK], T0w 603 pmovmskb T1, X0 ; idx = _mm_movemask_epi8(x1); 604 bsr T1, T1 ; idx = 16 - (__builtin_clz(idx)>>1); 605 jz .CONTINUER7 ; if (idx) { 606 lea T1, [T1+KK*8] 607 mov EOB, T1 ; EOB = k + idx; 608.CONTINUER7: 609 add VALUES, 8*2 610.PADDINGR: 611 mov K, LEN 612 add K, 7 613 and K, -8 614 shr K, 3 615 sub K, DCTSIZE2/8 616 jz .EPADDINGR 617 align 16 618.ZEROLOOPR: 619 movdqa XMMWORD [VALUES + 0], ZERO 620 add VALUES, 8*2 621 inc K 622 jnz .ZEROLOOPR 623.EPADDINGR: 624 sub VALUES, DCTSIZE2*2 625 626 REDUCE0 627 628 mov eax, EOB 629 630 pop ebp 631 pop edi 632 pop esi 633; pop edx ; need not be preserved 634 pop ecx 635 pop ebx 636 mov esp, ebp ; esp <- aligned ebp 637 pop esp ; esp <- original ebp 638 pop ebp 639 ret 640 641%undef ZERO 642%undef ONE 643%undef X0 644%undef X1 645%undef N0 646%undef N1 647%undef AL 648%undef K 649%undef KK 650%undef EOB 651%undef SIGN 652%undef LUT 653%undef T0 654%undef T1 655%undef BLOCK 656%undef VALUES 657%undef LEN 658%undef LENEND 659 660; For some reason, the OS X linker does not honor the request to align the 661; segment unless we do this. 662 align 32 663