;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS 64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; Each macro gathers JCOEF (16-bit) coefficients from BLOCK using the
; 32-bit index table at LUT (jpeg_natural_order_start) and leaves them in
; X0 (and X1 for the 15/16-element variants).  N0 (and N1) are zeroed so
; the callers can use them as sign-mask scratch.  T0/T1 (32-bit views
; T0d/T1d) are clobbered.  LENEND (= Sl & 7, set up by the callers) gates
; the tail loads in LOAD15/LOAD7.

; Load 16 coefficients: LUT[0..7] -> X0 words 0..7, LUT[8..15] -> X1.
%macro LOAD16 0
    pxor N0, N0
    pxor N1, N1

    mov T0d, INT [LUT + 0*SIZEOF_INT]
    mov T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 0
    pinsrw X1, word [BLOCK + T1 * 2], 0

    mov T0d, INT [LUT + 1*SIZEOF_INT]
    mov T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 1
    pinsrw X1, word [BLOCK + T1 * 2], 1

    mov T0d, INT [LUT + 2*SIZEOF_INT]
    mov T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 2
    pinsrw X1, word [BLOCK + T1 * 2], 2

    mov T0d, INT [LUT + 3*SIZEOF_INT]
    mov T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 3
    pinsrw X1, word [BLOCK + T1 * 2], 3

    mov T0d, INT [LUT + 4*SIZEOF_INT]
    mov T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 4
    pinsrw X1, word [BLOCK + T1 * 2], 4

    mov T0d, INT [LUT + 5*SIZEOF_INT]
    mov T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 5
    pinsrw X1, word [BLOCK + T1 * 2], 5

    mov T0d, INT [LUT + 6*SIZEOF_INT]
    mov T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 6
    pinsrw X1, word [BLOCK + T1 * 2], 6

    mov T0d, INT [LUT + 7*SIZEOF_INT]
    mov T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 7
    pinsrw X1, word [BLOCK + T1 * 2], 7
%endmacro

; Load 9..15 coefficients: LUT[0..8] unconditionally (callers only reach
; this when (Sl & 15) > 8, so at least 9 remain), then LUT[9+] into X1
; words 1..6 while LENEND allows.  Unloaded X1 words stay zero.
%macro LOAD15 0
    pxor N0, N0
    pxor N1, N1
    pxor X1, X1

    mov T0d, INT [LUT + 0*SIZEOF_INT]
    mov T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 0
    pinsrw X1, word [BLOCK + T1 * 2], 0

    mov T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 1

    mov T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 2

    mov T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 3

    mov T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 4

    mov T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 5

    mov T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 6

    mov T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 7

    ; Tail: load element 8+i only while LENEND (= Sl & 7) >= i+1.
    cmp LENEND, 2
    jl %%.ELOAD15
    mov T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 1

    cmp LENEND, 3
    jl %%.ELOAD15
    mov T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 2

    cmp LENEND, 4
    jl %%.ELOAD15
    mov T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 3

    cmp LENEND, 5
    jl %%.ELOAD15
    mov T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 4

    cmp LENEND, 6
    jl %%.ELOAD15
    mov T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 5

    cmp LENEND, 7
    jl %%.ELOAD15
    mov T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Load exactly 8 coefficients: LUT[0..7] -> X0 words 0..7.
%macro LOAD8 0
    pxor N0, N0

    mov T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 0

    mov T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 1

    mov T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 2

    mov T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 3

    mov T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 4

    mov T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 5

    mov T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 6

    mov T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T0 * 2], 7
%endmacro

; Load up to 7 coefficients into X0; unloaded words stay zero.
; NOTE(review): word 0 is loaded before any LENEND check, so it is read
; even when LENEND == 0 (Sl a multiple of 16) -- presumably safe because
; the natural-order table is padded; confirm against jcphuff.c.
%macro LOAD7 0
    pxor N0, N0
    pxor X0, X0

    mov T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 0

    cmp LENEND, 2
    jl %%.ELOAD7
    mov T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 1

    cmp LENEND, 3
    jl %%.ELOAD7
    mov T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 2

    cmp LENEND, 4
    jl %%.ELOAD7
    mov T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 3

    cmp LENEND, 5
    jl %%.ELOAD7
    mov T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 4

    cmp LENEND, 6
    jl %%.ELOAD7
    mov T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 5

    cmp LENEND, 7
    jl %%.ELOAD7
    mov T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Build the 64-bit "zerobits" map of the DCTSIZE2 words at VALUES and
; store it at [r15]: after the final NOT, bit k is set iff VALUES[k] != 0.
; Each pcmpeqw-vs-ZERO turns a zero word into 0xFFFF; packsswb collapses
; words to bytes and pmovmskb collapses bytes to bits (one bit per
; coefficient).  Clobbers rax, rcx, rdx, rsi, xmm0-xmm7, flags.
%macro REDUCE0 0
    movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa xmm2, XMMWORD [VALUES + (16*2)]
    movdqa xmm3, XMMWORD [VALUES + (24*2)]
    movdqa xmm4, XMMWORD [VALUES + (32*2)]
    movdqa xmm5, XMMWORD [VALUES + (40*2)]
    movdqa xmm6, XMMWORD [VALUES + (48*2)]
    movdqa xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw xmm0, ZERO
    pcmpeqw xmm1, ZERO
    pcmpeqw xmm2, ZERO
    pcmpeqw xmm3, ZERO
    pcmpeqw xmm4, ZERO
    pcmpeqw xmm5, ZERO
    pcmpeqw xmm6, ZERO
    pcmpeqw xmm7, ZERO

    packsswb xmm0, xmm1
    packsswb xmm2, xmm3
    packsswb xmm4, xmm5
    packsswb xmm6, xmm7

    pmovmskb eax, xmm0
    pmovmskb ecx, xmm2
    pmovmskb edx, xmm4
    pmovmskb esi, xmm6

    ; Merge the four 16-bit masks into one 64-bit word, low bits first.
    shl rcx, 16
    shl rdx, 32
    shl rsi, 48

    or rax, rcx
    or rdx, rsi
    or rax, rdx

    not rax                             ; set bits now mark NONZERO values

    mov MMWORD [r15], rax               ; *zerobits = rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits

%define ZERO xmm9
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4                         ; NB: shadows the x86 "al" register name
%define K eax
%define LUT r11
%define T0 rcx
%define T0d ecx
%define T1 rdx
%define T1d edx
%define BLOCK r10
%define VALUES r14
%define LEN r12d
%define LENEND r13d

    align 32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push rbp
    mov rax, rsp                        ; rax = original rbp
    sub rsp, byte 4
    and rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov [rsp], rax
    mov rbp, rsp                        ; rbp = aligned rbp
    lea rsp, [rbp - 16]
    collect_args 6

    ; Preserve xmm9 (ZERO): callee-saved in the Windows x64 ABI.
    movdqa XMMWORD [rbp - 16], ZERO

    movd AL, r13d                       ; AL (xmm4) = Al, the point-transform shift
    pxor ZERO, ZERO
    mov K, LEN
    mov LENEND, LEN
    and K, -16                          ; K = Sl rounded down to a multiple of 16
    and LENEND, 7                       ; LENEND = Sl & 7, gates LOAD15/LOAD7 tails
    shr K, 4                            ; K = number of full 16-coefficient groups
    jz .ELOOP16
.BLOOP16:
    LOAD16
    ; abs + point transform: N = (X < 0) ? -1 : 0; X = (X + N) ^ N = |X|;
    ; X >>= Al; then N ^= X (the encoder's form for negative coefficients,
    ; see jcphuff.c).
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw X0, N0
    paddw X1, N1
    pxor X0, N0
    pxor X1, N1
    psrlw X0, AL
    psrlw X1, AL
    pxor N0, X0
    pxor N1, X1
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (8) * 2], X1
    movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add VALUES, 16*2
    add LUT, 16*SIZEOF_INT
    dec K
    jnz .BLOOP16
.ELOOP16:
    ; Dispatch on the remaining Sl & 15 coefficients:
    ;   bit 3 clear        -> <= 7 remain  -> LOAD7
    ;   bit 3 set, low 3 = 0 -> exactly 8  -> LOAD8
    ;   otherwise          -> 9..15 remain -> LOAD15
    test LEN, 8
    jz .TRY7
    test LEN, 7
    jz .TRY8

    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw X0, N0
    paddw X1, N1
    pxor X0, N0
    pxor X1, N1
    psrlw X0, AL
    psrlw X1, AL
    pxor N0, X0
    pxor N1, X1
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (8) * 2], X1
    movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add VALUES, 16*2
    jmp .PADDING
.TRY8:
    LOAD8
    pcmpgtw N0, X0
    paddw X0, N0
    pxor X0, N0
    psrlw X0, AL
    pxor N0, X0
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add VALUES, 8*2
    jmp .PADDING
.TRY7:
    LOAD7
    pcmpgtw N0, X0
    paddw X0, N0
    pxor X0, N0
    psrlw X0, AL
    pxor N0, X0
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add VALUES, 8*2
.PADDING:
    ; Zero-fill values[] up to DCTSIZE2 entries.
    ; K = ceil(Sl / 8) - DCTSIZE2/8 (negative group count, incremented to 0).
    mov K, LEN
    add K, 7
    and K, -8
    shr K, 3
    sub K, DCTSIZE2/8
    jz .EPADDING
    align 16
.ZEROLOOP:
    movdqa XMMWORD [VALUES + 0], ZERO
    add VALUES, 8*2
    inc K
    jnz .ZEROLOOP
.EPADDING:
    sub VALUES, DCTSIZE2*2              ; rewind VALUES to the start of values[]

    REDUCE0

    movdqa ZERO, XMMWORD [rbp - 16]     ; restore callee-saved xmm9
    uncollect_args 6
    mov rsp, rbp                        ; rsp <- aligned rbp
    pop rsp                             ; rsp <- original rbp
    pop rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
;
; Returns EOB (index just past the last coefficient whose absolute value,
; after the Al shift, equals 1) in eax.  Writes zerobits to bits[0] and the
; sign bitmap to bits[1].

%define ZERO xmm9
%define ONE xmm5
%define X0 xmm0
%define X1 xmm1
%define N0 xmm2
%define N1 xmm3
%define AL xmm4                         ; NB: shadows the x86 "al" register name
%define K eax
%define KK r9d                          ; running coefficient index k
%define EOB r8d
%define SIGN rdi                        ; 64-bit sign-bit accumulator
%define LUT r11
%define T0 rcx
%define T0d ecx
%define T1 rdx
%define T1d edx
%define BLOCK r10
%define VALUES r14
%define LEN r12d
%define LENEND r13d

    align 32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push rbp
    mov rax, rsp                        ; rax = original rbp
    sub rsp, byte 4
    and rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov [rsp], rax
    mov rbp, rsp                        ; rbp = aligned rbp
    lea rsp, [rbp - 16]
    collect_args 6

    ; Preserve xmm9 (ZERO): callee-saved in the Windows x64 ABI.
    movdqa XMMWORD [rbp - 16], ZERO

    xor SIGN, SIGN
    xor EOB, EOB
    xor KK, KK
    movd AL, r13d                       ; AL (xmm4) = Al, the point-transform shift
    pxor ZERO, ZERO
    pcmpeqw ONE, ONE
    psrlw ONE, 15                       ; ONE = eight words of 1
    mov K, LEN
    mov LENEND, LEN
    and K, -16
    and LENEND, 7                       ; LENEND = Sl & 7, gates LOAD15/LOAD7 tails
    shr K, 4                            ; K = number of full 16-coefficient groups
    jz .ELOOPR16
.BLOOPR16:
    LOAD16
    ; N = sign masks; X = |X| >> Al, stored to absvalues[].
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw X0, N0
    paddw X1, N1
    pxor X0, N0
    pxor X1, N1
    psrlw X0, AL
    psrlw X1, AL
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE                     ; mark words whose shifted magnitude == 1
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0d, N0                    ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                    ; idx = _mm_movemask_epi8(x1);
    ; Accumulate this group's 16 sign bits at the top of SIGN, shifting
    ; earlier groups toward the low end.
    shr SIGN, 16                        ; make room for sizebits
    shl T0, 48
    or SIGN, T0
    ; bsr sets ZF when its source is 0, so jz skips when no magnitude == 1.
    bsr T1d, T1d                        ; idx = 16 - (__builtin_clz(idx)>>1);
    jz .CONTINUER16                     ; if (idx) {
    mov EOB, KK
    add EOB, T1d                        ; EOB = k + idx;
.CONTINUER16:
    add VALUES, 16*2
    add LUT, 16*SIZEOF_INT
    add KK, 16
    dec K
    jnz .BLOOPR16
.ELOOPR16:
    ; Dispatch on the remaining Sl & 15 coefficients (same scheme as the
    ; first-pass function): <8 -> LOADR7, ==8 -> LOADR8, 9..15 -> LOAD15.
    test LEN, 8
    jz .TRYR7
    test LEN, 7
    jz .TRYR8

    LOAD15
    pcmpgtw N0, X0
    pcmpgtw N1, X1
    paddw X0, N0
    paddw X1, N1
    pxor X0, N0
    pxor X1, N1
    psrlw X0, AL
    psrlw X1, AL
    movdqa XMMWORD [VALUES + (0) * 2], X0
    movdqa XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw X0, ONE
    pcmpeqw X1, ONE
    packsswb N0, N1
    packsswb X0, X1
    pmovmskb T0d, N0                    ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                    ; idx = _mm_movemask_epi8(x1);
    shr SIGN, 16                        ; make room for sizebits
    shl T0, 48
    or SIGN, T0
    bsr T1d, T1d                        ; idx = 16 - (__builtin_clz(idx)>>1);
    jz .CONTINUER15                     ; if (idx) {
    mov EOB, KK
    add EOB, T1d                        ; EOB = k + idx;
.CONTINUER15:
    add VALUES, 16*2
    jmp .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw N0, X0
    paddw X0, N0
    pxor X0, N0
    psrlw X0, AL
    movdqa XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0d, N0                    ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                    ; idx = _mm_movemask_epi8(x1);
    ; Only 8 coefficients in this group, so shift SIGN by 8 and insert at bit 56.
    shr SIGN, 8                         ; make room for sizebits
    shl T0, 56
    or SIGN, T0
    bsr T1d, T1d                        ; idx = 16 - (__builtin_clz(idx)>>1);
    jz .CONTINUER8                      ; if (idx) {
    mov EOB, KK
    add EOB, T1d                        ; EOB = k + idx;
.CONTINUER8:
    add VALUES, 8*2
    jmp .PADDINGR
.TRYR7:
    LOAD7

    pcmpgtw N0, X0
    paddw X0, N0
    pxor X0, N0
    psrlw X0, AL
    movdqa XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw X0, ONE
    packsswb N0, ZERO
    packsswb X0, ZERO
    pmovmskb T0d, N0                    ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb T1d, X0                    ; idx = _mm_movemask_epi8(x1);
    shr SIGN, 8                         ; make room for sizebits
    shl T0, 56
    or SIGN, T0
    bsr T1d, T1d                        ; idx = 16 - (__builtin_clz(idx)>>1);
    jz .CONTINUER7                      ; if (idx) {
    mov EOB, KK
    add EOB, T1d                        ; EOB = k + idx;
.CONTINUER7:
    add VALUES, 8*2
.PADDINGR:
    ; Zero-fill absvalues[] up to DCTSIZE2 entries; each padded group of 8
    ; also shifts SIGN so the sign bits land in their final positions.
    mov K, LEN
    add K, 7
    and K, -8
    shr K, 3
    sub K, DCTSIZE2/8
    jz .EPADDINGR
    align 16
.ZEROLOOPR:
    movdqa XMMWORD [VALUES + 0], ZERO
    shr SIGN, 8
    add VALUES, 8*2
    inc K
    jnz .ZEROLOOPR
.EPADDINGR:
    ; Invert to the bit convention consumed by encode_mcu_AC_refine()
    ; (see jcphuff.c) and store as bits[1].
    not SIGN
    sub VALUES, DCTSIZE2*2              ; rewind VALUES to the start of absvalues[]
    mov MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0                             ; bits[0] = zerobits

    mov eax, EOB                        ; return EOB
    movdqa ZERO, XMMWORD [rbp - 16]     ; restore callee-saved xmm9
    uncollect_args 6
    mov rsp, rbp                        ; rsp <- aligned rbp
    pop rsp                             ; rsp <- original rbp
    pop rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 32