;
; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
; (64-bit SSE2)
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding.  See jcphuff.c for more details.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
; jsimd_encode_mcu_AC_refine_prepare_sse2()
;
; LOAD16 gathers 16 coefficient words from BLOCK, indexed indirectly through
; the LUT (jpeg_natural_order_start), into X0 (words 0-7) and X1 (words
; 8-15).  LOAD15/LOAD8/LOAD7 are the partial-length tails; LENEND (= Sl & 7)
; bounds how many words of the conditional half are valid.  N0/N1 are
; cleared here so callers can use them as sign accumulators.

%macro LOAD16 0
    pxor        N0, N0
    pxor        N1, N1

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6
    pinsrw      X1, word [BLOCK + T1 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    mov         T1d, INT [LUT + 15*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
    pinsrw      X1, word [BLOCK + T1 * 2], 7
%endmacro

; Load 8 + (LENEND - 1) words: words 0-7 unconditionally into X0, words
; 9-14 into X1 only while LENEND admits them (word 8 is always loaded).
%macro LOAD15 0
    pxor        N0, N0
    pxor        N1, N1
    pxor        X1, X1                  ; lanes not loaded below must stay 0

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    mov         T1d, INT [LUT + 8*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0
    pinsrw      X1, word [BLOCK + T1 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7

    cmp         LENEND, 2
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 9*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD15
    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    pinsrw      X1, word [BLOCK + T1 * 2], 6
%%.ELOAD15:
%endmacro

; Load words 0-7 into X0 unconditionally.
%macro LOAD8 0
    pxor        N0, N0

    mov         T0d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 0

    mov         T0d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 1

    mov         T0d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 2

    mov         T0d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 3

    mov         T0d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 4

    mov         T0d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 5

    mov         T0d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 6

    mov         T0d, INT [LUT + 7*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T0 * 2], 7
%endmacro

; Load word 0 plus words 1-6 while LENEND admits them, into X0.
%macro LOAD7 0
    pxor        N0, N0
    pxor        X0, X0                  ; lanes not loaded below must stay 0

    mov         T1d, INT [LUT + 0*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 0

    cmp         LENEND, 2
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 1*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 1

    cmp         LENEND, 3
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 2*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 2

    cmp         LENEND, 4
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 3*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 3

    cmp         LENEND, 5
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 4*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 4

    cmp         LENEND, 6
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 5*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 5

    cmp         LENEND, 7
    jl          %%.ELOAD7
    mov         T1d, INT [LUT + 6*SIZEOF_INT]
    pinsrw      X0, word [BLOCK + T1 * 2], 6
%%.ELOAD7:
%endmacro

; Compare all 64 output words in VALUES against zero and fold the results
; into a 64-bit bitmap stored at [r15]; after the final NOT, bit k is set
; iff values[k] != 0.
%macro REDUCE0 0
    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    movdqa      xmm7, XMMWORD [VALUES + (56*2)]

    pcmpeqw     xmm0, ZERO
    pcmpeqw     xmm1, ZERO
    pcmpeqw     xmm2, ZERO
    pcmpeqw     xmm3, ZERO
    pcmpeqw     xmm4, ZERO
    pcmpeqw     xmm5, ZERO
    pcmpeqw     xmm6, ZERO
    pcmpeqw     xmm7, ZERO

    packsswb    xmm0, xmm1
    packsswb    xmm2, xmm3
    packsswb    xmm4, xmm5
    packsswb    xmm6, xmm7

    pmovmskb    eax, xmm0               ; one bit per word, words 0-15
    pmovmskb    ecx, xmm2               ; words 16-31
    pmovmskb    edx, xmm4               ; words 32-47
    pmovmskb    esi, xmm6               ; words 48-63

    shl         rcx, 16
    shl         rdx, 32
    shl         rsi, 48

    or          rax, rcx
    or          rdx, rsi
    or          rax, rdx

    not         rax                     ; pcmpeqw set bits for zeros; invert

    mov         MMWORD [r15], rax
%endmacro

;
; Prepare data for jsimd_encode_mcu_AC_first().
;
; GLOBAL(void)
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *zerobits

%define ZERO    xmm9
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4                    ; NB: shadows the AL register name
%define K       eax
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    push        rbp
    mov         rax, rsp                ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO  ; spill xmm9 (callee-saved on Win64)

    movd        AL, r13d                ; AL = Al (shift count)
    pxor        ZERO, ZERO
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16                  ; K = Sl & ~15
    and         LENEND, 7               ; LENEND = Sl & 7
    shr         K, 4                    ; K = number of full 16-word groups
    jz          .ELOOP16
.BLOOP16:
    LOAD16
    ; abs(): x += sign; x ^= sign  (two's-complement negate via -1 mask)
    pcmpgtw     N0, X0                  ; N0 = (x < 0) ? -1 : 0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0                  ; X0 = abs(coef)
    pxor        X1, N1
    psrlw       X0, AL                  ; X0 = abs(coef) >> Al
    psrlw       X1, AL
    pxor        N0, X0                  ; N0 = shifted value, complemented
    pxor        N1, X1                  ;      for negative coefficients
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    dec         K
    jnz         .BLOOP16
    test        LEN, 15                 ; Sl a multiple of 16: no tail to load
    je          .PADDING
.ELOOP16:
    test        LEN, 8
    jz          .TRY7
    test        LEN, 7
    jz          .TRY8

    ; 9..15 remaining words
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    pxor        N0, X0
    pxor        N1, X1
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    add         VALUES, 16*2
    jmp         .PADDING
.TRY8:
    ; exactly 8 remaining words
    LOAD8
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
    jmp         .PADDING
.TRY7:
    ; 1..7 remaining words
    LOAD7
    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    pxor        N0, X0
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    add         VALUES, 8*2
.PADDING:
    ; zero-fill the remaining 8-word groups up to DCTSIZE2 words
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3                    ; K = number of 8-word groups written
    sub         K, DCTSIZE2/8           ; K = -(groups still to zero)
    jz          .EPADDING
    align       16
.ZEROLOOP:
    movdqa      XMMWORD [VALUES + 0], ZERO
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOP
.EPADDING:
    sub         VALUES, DCTSIZE2*2      ; rewind to start of values[]

    REDUCE0

    movdqa      ZERO, XMMWORD [rbp - 16]  ; restore xmm9
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

;
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; GLOBAL(int)
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r12 = int Sl
; r13 = int Al
; r14 = JCOEF *values
; r15 = size_t *bits
;
; Returns EOB (in eax): index just past the last absvalue == 1, or 0.
; Stores the zero bitmap at bits[0] (via REDUCE0) and the sign bitmap at
; bits[1].

%define ZERO    xmm9
%define ONE     xmm5
%define X0      xmm0
%define X1      xmm1
%define N0      xmm2
%define N1      xmm3
%define AL      xmm4                    ; NB: shadows the AL register name
%define K       eax
%define KK      r9d
%define EOB     r8d
%define SIGN    rdi
%define LUT     r11
%define T0      rcx
%define T0d     ecx
%define T1      rdx
%define T1d     edx
%define BLOCK   r10
%define VALUES  r14
%define LEN     r12d
%define LENEND  r13d

    align       32
    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)

EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    push        rbp
    mov         rax, rsp                ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [rbp - 16]
    collect_args 6

    movdqa      XMMWORD [rbp - 16], ZERO  ; spill xmm9 (callee-saved on Win64)

    xor         SIGN, SIGN              ; sign-bit accumulator
    xor         EOB, EOB
    xor         KK, KK                  ; KK = running coefficient index k
    movd        AL, r13d                ; AL = Al (shift count)
    pxor        ZERO, ZERO
    pcmpeqw     ONE, ONE
    psrlw       ONE, 15                 ; ONE = 0x0001 in every word
    mov         K, LEN
    mov         LENEND, LEN
    and         K, -16
    and         LENEND, 7
    shr         K, 4                    ; K = number of full 16-word groups
    jz          .ELOOPR16
.BLOOPR16:
    LOAD16
    pcmpgtw     N0, X0                  ; N0 = (x < 0) ? -1 : 0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0                  ; X0 = abs(coef)
    pxor        X1, N1
    psrlw       X0, AL                  ; X0 = abs(coef) >> Al
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE                 ; mark words whose absvalue == 1
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER16            ; if (idx) {  (bsr sets ZF on zero src)
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER16:
    add         VALUES, 16*2
    add         LUT, 16*SIZEOF_INT
    add         KK, 16
    dec         K
    jnz         .BLOOPR16
    test        LEN, 15                 ; bugfix: when Sl is a nonzero multiple
    je          .PADDINGR               ; of 16 there is no tail; without this
                                        ; check .TRYR7 would read past the LUT
                                        ; and desync VALUES/SIGN (mirrors
                                        ; jsimd_encode_mcu_AC_first_prepare_sse2)
.ELOOPR16:
    test        LEN, 8
    jz          .TRYR7
    test        LEN, 7
    jz          .TRYR8

    ; 9..15 remaining words
    LOAD15
    pcmpgtw     N0, X0
    pcmpgtw     N1, X1
    paddw       X0, N0
    paddw       X1, N1
    pxor        X0, N0
    pxor        X1, N1
    psrlw       X0, AL
    psrlw       X1, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    movdqa      XMMWORD [VALUES + (8) * 2], X1
    pcmpeqw     X0, ONE
    pcmpeqw     X1, ONE
    packsswb    N0, N1
    packsswb    X0, X1
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 16                ; make room for sizebits
    shl         T0, 48
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER15            ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER15:
    add         VALUES, 16*2
    jmp         .PADDINGR
.TRYR8:
    ; exactly 8 remaining words
    LOAD8

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER8             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER8:
    add         VALUES, 8*2
    jmp         .PADDINGR
.TRYR7:
    ; 1..7 remaining words
    LOAD7

    pcmpgtw     N0, X0
    paddw       X0, N0
    pxor        X0, N0
    psrlw       X0, AL
    movdqa      XMMWORD [VALUES + (0) * 2], X0
    pcmpeqw     X0, ONE
    packsswb    N0, ZERO
    packsswb    X0, ZERO
    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
    shr         SIGN, 8                 ; make room for sizebits
    shl         T0, 56
    or          SIGN, T0
    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
    jz          .CONTINUER7             ; if (idx) {
    mov         EOB, KK
    add         EOB, T1d                ; EOB = k + idx;
.CONTINUER7:
    add         VALUES, 8*2
.PADDINGR:
    ; zero-fill remaining 8-word groups; keep SIGN shifted in step
    mov         K, LEN
    add         K, 7
    and         K, -8
    shr         K, 3                    ; K = number of 8-word groups written
    sub         K, DCTSIZE2/8           ; K = -(groups still to zero)
    jz          .EPADDINGR
    align       16
.ZEROLOOPR:
    movdqa      XMMWORD [VALUES + 0], ZERO
    shr         SIGN, 8
    add         VALUES, 8*2
    inc         K
    jnz         .ZEROLOOPR
.EPADDINGR:
    not         SIGN                    ; bit set => non-negative coefficient
    sub         VALUES, DCTSIZE2*2      ; rewind to start of absvalues[]
    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN  ; bits[1] = sign bitmap

    REDUCE0                             ; bits[0] = zero bitmap

    mov         eax, EOB
    movdqa      ZERO, XMMWORD [rbp - 16]  ; restore xmm9
    uncollect_args 6
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

%undef ZERO
%undef ONE
%undef X0
%undef X1
%undef N0
%undef N1
%undef AL
%undef K
%undef KK
%undef EOB
%undef SIGN
%undef LUT
%undef T0
%undef T0d
%undef T1
%undef T1d
%undef BLOCK
%undef VALUES
%undef LEN
%undef LENEND

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32