;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()

%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3
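
; The LOADn macros gather the next n coefficients from BLOCK in natural
; scan order, using LUT (jpeg_natural_order_start) as the index table.
; Roughly, in C terms (illustrative sketch only; the reference
; implementation is in jcphuff.c):
;
;   for (i = 0; i < n; i++)
;     x[i] = block[jpeg_natural_order_start[i]];
;
; LOAD15 and LOAD7 are the partial-tail variants: only the first
; 8 + LENEND (resp. LENEND) word slots are loaded, where LENEND = Sl & 7,
; and the remaining slots stay zero.
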
    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0

    mov         T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0
    pmovmskb    ecx, xmm2
    pmovmskb    edx, xmm4
    pmovmskb    esi, xmm6

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax

    mov         MMWORD [r15], rax
%endmacro
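
; REDUCE0 collapses the 64 words at VALUES into a 64-bit bitmap with
; bit k set iff values[k] != 0, and stores it through r15 (the
; zerobits/bits argument).  A rough C equivalent (illustrative sketch
; only):
;
;   size_t zb = 0;
;   for (k = 0; k < DCTSIZE2; k++)
;     if (values[k] != 0)
;       zb |= (size_t)1 << k;
;   /* zb is stored through r15 */
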
;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    movd        AL, r13d
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7
    test        LEN, 7
    jz          .TRY8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                     ; rsp <- aligned rbp
    pop         rsp                          ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND
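
; In both prepare routines, the absolute value is computed branchlessly.
; With mask = (coef < 0) ? -1 : 0 (pcmpgtw against zero), in C terms
; (illustrative sketch; the reference implementation is in jcphuff.c):
;
;   abs = (coef + mask) ^ mask;   /* |coef| */
;   value = abs >> Al;            /* point-transformed magnitude */
;
; The refine routine below additionally packs the per-coefficient sign
; masks into SIGN (inverted at the end, so a set bit indicates a
; non-negative coefficient) and tracks EOB, the index just past the
; last coefficient whose point-transformed magnitude is 1; EOB is
; returned in eax.
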
;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *absvalues
; r15 = size_t *bits

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO

    xor         SIGN, SIGN
    xor         EOB, EOB
    xor         KK, KK
    movd        AL, r13d
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0             ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0             ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16            ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d            ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16        ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d            ;   EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15
    je          .PADDINGR
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0             ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0             ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16            ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d            ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15        ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d            ;   EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0             ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0             ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8             ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d            ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8         ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d            ;   EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
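
; .TRYR7 handles a final group of 1-7 coefficients (Sl & 15 < 8),
; mirroring .TRYR8 but with only one 8-word vector in play.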
.TRYR7:
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0             ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0             ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8             ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d            ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7         ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d            ;   EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3
    sub         K, DCTSIZE2/8
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN
    sub         VALUES, DCTSIZE2*2
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN

    REDUCE0

    mov         eax, EOB
    movdqa      ZERO, XMMWORD [rbp - 16]
    uncollect_args 6
    mov         rsp, rbp                     ; rsp <- aligned rbp
    pop         rsp                          ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32