;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.

%include "jsimdext.inc"

; Field offsets of the encoder state that the routine below reads and writes
; (only cur.put_buffer.simd and cur.free_bits are accessed in this file).
struc working_state
.next_output_byte:   resp 1     ; => next byte to write in buffer
.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
.cur.free_bits       resd 1     ; # of bits available in it
.cur.last_dc_val     resd 4     ; last DC coef for each component
.cinfo:              resp 1     ; dump_buffer needs access to this
endstruc

; Field offsets of a derived Huffman table (codes followed by code lengths).
struc c_derived_tbl
.ehufco:             resd 256   ; code for each symbol
.ehufsi:             resb 256   ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
               dd 0x000f, 0x001f, 0x003f, 0x007f
               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff

    alignz      32

; Bit-count lookup table, laid out symmetrically around the jpeg_nbits_table
; label: the 32768 bytes before the label mirror the 32768 bytes after it, so
; that entry [-k - 1] (i.e. [~k]) holds the same value as entry [k].  This
; lets the table be indexed directly with a signed value.
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
jpeg_nbits_table:
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15

    alignz      32

; NBITS(x):     address of nbits table entry x, relative to nbits_base (a
;               register alias defined further down, loaded with the address
;               of jpeg_nbits_table).
; MASK_BITS(x): address of jpeg_mask_bits[x], reached through the same base
;               register by adding the assembly-time distance between the two
;               tables.
%define NBITS(x)      nbits_base + x
%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; Shorthand used to describe SIMD operations:
; wN:     xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:     xmmN treated as 16 unsigned 8-bit values
; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - the label to which to jump when the macro completes
; %2 (optional) - extra instructions to execute after nbits has been set
;
; Upon completion, free_bits will be set to 64 minus the number of remaining
; bits from code, and put_buffer will contain code (the remaining bits are its
; low-order bits).  temp and code will be clobbered.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.

%macro EMIT_QWORD 1-2
    ; Precondition: every byte of xmm1 is 0xFF (pcmpeqb below uses it to find
    ; 0xFF bytes in the qword that is about to be written).
    add         nbitsb, free_bitsb      ; nbits += free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    mov         tempd, code             ; temp = code;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    shr         tempd, nbitsb           ; temp >>= nbits;
    or          tempq, put_buffer       ; temp |= put_buffer;
    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    bswap       tempq                   ; temp = htonl(temp);
    mov         put_buffer, codeq       ; put_buffer = code;
    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    %2
    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
                                        ; (speculative; will be overwritten if
                                        ; code contains any 0xFF bytes)
    add         free_bitsb, 64          ; free_bits += 64;
    add         bufferp, 8              ; buffer += 8;
    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    jz          %1                      ;     return;
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempd, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    jmp         %1                      ; return;
%endmacro

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; The return value (rax) is the buffer pointer advanced past the bytes that
; were written.  state->cur.put_buffer.simd and state->cur.free_bits are
; updated before returning.
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.  The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.)  This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
;
; The reordered coefficients are stored to the scratch array t with movaps,
; which requires 16-byte alignment.  With the ABI-mandated rsp % 16 == 8 at
; entry, the odd number of 8-byte pushes in each prologue (5 on WIN64, 3
; otherwise) leaves rsp 16-byte aligned; t is then rsp minus
; DCTSIZE2 * SIZEOF_WORD (presumably 128 bytes; the constants come from
; jsimdext.inc -- confirm there).
;
; Initial register allocation
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - block --> free_bits
; rsi - nbits_base
; rdi - t
; rbp - code
; r8  - dctbl --> code_temp
; r9  - actbl
; r10 - state
; r11 - index
; r12 - put_buffer

%define buffer       rax
%ifdef WIN64
%define bufferp      rax
%else
%define bufferp      raxp
%endif
%define tempq        rbx
%define tempd        ebx
%define tempb        bl
%define temph        bh
%define nbitsq       rcx
%define nbits        ecx
%define nbitsb       cl
%define block        rdx
%define nbits_base   rsi
%define t            rdi
%define td           edi
%define codeq        rbp
%define code         ebp
%define dctbl        r8
%define actbl        r9
%define state        r10
%define index        r11
%define indexd       r11d
%define put_buffer   r12
%define put_bufferd  r12d

; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63

    align       32
    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)

EXTN(jsimd_huff_encode_one_block_sse2):

%ifdef WIN64

; rcx = working_state *state
; rdx = JOCTET *buffer
; r8 = JCOEFPTR block
; r9 = int last_dc_val
; stack = c_derived_tbl *dctbl, c_derived_tbl *actbl
;         (above the return address and the 4 * 8 bytes of shadow space; they
;         are loaded from [rsp+6*8+4*8] and [rsp+6*8+5*8] after the five
;         pushes below)

                                                         ;X: X = code stream
    mov         buffer, rdx
    mov         block, r8
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]  ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    push        rbp
    movdqa      xmm0, xmm3                               ;A: w0 = xx 01 02 03 04 05 06 07
    push        rsi
    push        rdi
    push        r12
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]  ;B: w1 = 08 09 10 11 12 13 14 15
    mov         state, rcx
    movsx       code, word [block]                       ;Z:     code = block[0];
    pxor        xmm4, xmm4                               ;A: w4[i] = 0;
    sub         code, r9d                                ;Z:     code -= last_dc_val;
    mov         dctbl, POINTER [rsp+6*8+4*8]
    mov         actbl, POINTER [rsp+6*8+5*8]
    punpckldq   xmm0, xmm1                               ;A: w0 = xx 01 08 09 02 03 10 11
    lea         nbits_base, [rel jpeg_nbits_table]
    add         rsp, -DCTSIZE2 * SIZEOF_WORD
    mov         t, rsp

%else

; rdi = working_state *state
; rsi = JOCTET *buffer
; rdx = JCOEFPTR block
; rcx = int last_dc_val
; r8 = c_derived_tbl *dctbl
; r9 = c_derived_tbl *actbl

                                                         ;X: X = code stream
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]  ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    push        rbp
    movdqa      xmm0, xmm3                               ;A: w0 = xx 01 02 03 04 05 06 07
    push        r12
    mov         state, rdi
    mov         buffer, rsi
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]  ;B: w1 = 08 09 10 11 12 13 14 15
    movsx       codeq, word [block]                      ;Z:     code = block[0];
    lea         nbits_base, [rel jpeg_nbits_table]
    pxor        xmm4, xmm4                               ;A: w4[i] = 0;
    sub         codeq, rcx                               ;Z:     code -= last_dc_val;
    punpckldq   xmm0, xmm1                               ;A: w0 = xx 01 08 09 02 03 10 11
    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]        ; use red zone for t_

%endif

    pshuflw     xmm0, xmm0, 11001001b                    ;A: w0 = 01 08 xx 09 02 03 10 11
    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
    punpckhdq   xmm3, xmm1                               ;D: w3 = 04 05 12 13 06 07 14 15
    punpcklqdq  xmm1, xmm3                               ;B: w1 = 08 09 10 11 04 05 12 13
    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
                                                         ;A:      (Row 0, offset 1)
    pcmpgtw     xmm4, xmm0                               ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    paddw       xmm0, xmm4                               ;A: w0[i] += w4[i];
    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0      ;A: t[i] = w0[i];

    movq        xmm2, qword [block + 24 * SIZEOF_WORD]   ;B: w2 = 24 25 26 27 -- -- -- --
    pshuflw     xmm2, xmm2, 11011000b                    ;B: w2 = 24 26 25 27 -- -- -- --
    pslldq      xmm1, 1 * SIZEOF_WORD                    ;B: w1 = -- 08 09 10 11 04 05 12
    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
    movsd       xmm1, xmm2                               ;B: w1 = 24 26 25 27 11 04 05 12
    punpcklqdq  xmm2, xmm5                               ;C: w2 = 24 26 25 27 48 49 50 51
    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
    pxor        xmm4, xmm4                               ;A: w4[i] = 0;
    psrldq      xmm3, 2 * SIZEOF_WORD                    ;D: w3 = 12 13 06 07 14 15 -- --
    pcmpeqw     xmm0, xmm4                               ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
                                                         ;        (Row 1, offset 1)
    pcmpgtw     xmm4, xmm1                               ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    paddw       xmm1, xmm4                               ;B: w1[i] += w4[i];
    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1      ;B: t[i+8] = w1[i];
    pxor        xmm4, xmm4                               ;B: w4[i] = 0;
    pcmpeqw     xmm1, xmm4                               ;B: w1[i] = (w1[i] == 0 ? -1 : 0);

    packsswb    xmm0, xmm1                               ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
                                                         ;    w/ signed saturation

    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
                                                         ;        (Row 3, offset 1)
    pcmpgtw     xmm4, xmm3                               ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm4                               ;D: w3[i] += w4[i];
    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3     ;D: t[i+24] = w3[i];
    pxor        xmm4, xmm4                               ;D: w4[i] = 0;
    pcmpeqw     xmm3, xmm4                               ;D: w3[i] = (w3[i] == 0 ? -1 : 0);

    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
    cmp         code, 1 << 31                            ;Z:     Set CF if code < 0x80000000,
                                                         ;Z:     i.e. if code is positive
    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
    adc         code, -1                                 ;Z:     code += -1 + (code >= 0 ? 1 : 0);
    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
    movsxd      codeq, code                              ;Z:     sign extend code
    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
                                                         ;        (Row 2, offset 1)
    pcmpgtw     xmm4, xmm2                               ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    paddw       xmm2, xmm4                               ;C: w2[i] += w4[i];
    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2     ;C: t[i+16] = w2[i];
    pxor        xmm4, xmm4                               ;C: w4[i] = 0;
    pcmpeqw     xmm2, xmm4                               ;C: w2[i] = (w2[i] == 0 ? -1 : 0);

    packsswb    xmm2, xmm3                               ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
                                                         ;    w/ signed saturation

    movzx       nbitsq, byte [NBITS(codeq)]              ;Z:     nbits = JPEG_NBITS(code);
    movdqa      xmm3, xmm5                               ;H: w3 = 48 49 50 51 52 53 54 55
    pmovmskb    tempd, xmm2                              ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
    pmovmskb    put_bufferd, xmm0                        ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
    punpckhdq   xmm3, xmm0                               ;H: w3 = 52 53 60 61 54 55 62 63
    shl         tempd, 16                                ;Z:     temp <<= 16;
    psrldq      xmm3, 1 * SIZEOF_WORD                    ;H: w3 = 53 60 61 54 55 62 63 --
    pxor        xmm2, xmm2                               ;H: w2[i] = 0;
    or          put_bufferd, tempd                       ;Z:     put_buffer |= temp;
    pshuflw     xmm3, xmm3, 00111001b                    ;H: w3 = 60 61 54 53 55 62 63 --
    movq        xmm1, qword [block + 44 * SIZEOF_WORD]   ;G: w1 = 44 45 46 47 -- -- -- --
    unpcklps    xmm5, xmm0                               ;E: w5 = 48 49 56 57 50 51 58 59
    pxor        xmm0, xmm0                               ;H: w0[i] = 0;
    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
                                                         ;        (Row 7, offset 1)
    pcmpgtw     xmm2, xmm3                               ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm2                               ;H: w3[i] += w2[i];
    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3     ;H: t[i+56] = w3[i];
    movq        xmm4, qword [block + 36 * SIZEOF_WORD]   ;G: w4 = 36 37 38 39 -- -- -- --
    pcmpeqw     xmm3, xmm0                               ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    punpckldq   xmm4, xmm1                               ;G: w4 = 36 37 44 45 38 39 46 47
    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
                                                         ;Z:     temp = dctbl->ehufco[nbits];
    movdqa      xmm1, xmm4                               ;F: w1 = 36 37 44 45 38 39 46 47
    psrldq      xmm4, 1 * SIZEOF_WORD                    ;G: w4 = 37 44 45 38 39 46 47 --
    shufpd      xmm1, xmm5, 10b                          ;F: w1 = 36 37 44 45 50 51 58 59
    and         code, dword [MASK_BITS(nbitsq)]          ;Z:     code &= (1 << nbits) - 1;
    pshufhw     xmm4, xmm4, 11010011b                    ;G: w4 = 37 44 45 38 -- 39 46 --
    pslldq      xmm1, 1 * SIZEOF_WORD                    ;F: w1 = -- 36 37 44 45 50 51 58
    shl         tempq, nbitsb                            ;Z:     temp <<= nbits;
    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
    pshufd      xmm1, xmm1, 11011000b                    ;F: w1 = -- 36 45 50 37 44 51 58
    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
    or          code, tempd                              ;Z:     code |= temp;
    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]   ;F: w1 = 20 21 22 23 37 44 51 58
    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
    pshuflw     xmm1, xmm1, 01110010b                    ;F: w1 = 22 20 23 21 37 44 51 58
    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
                                                         ;        (Row 6, offset 1)
    pxor        xmm2, xmm2                               ;G: w2[i] = 0;
    pcmpgtw     xmm0, xmm4                               ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
    paddw       xmm4, xmm0                               ;G: w4[i] += w0[i];
    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4     ;G: t[48+i] = w4[i];
    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
                                                         ;        (Row 5, offset 1)
    pcmpeqw     xmm4, xmm2                               ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59

    packsswb    xmm4, xmm3                               ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
                                                         ;    w/ signed saturation

    pxor        xmm0, xmm0                               ;F: w0[i] = 0;
    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
    pcmpgtw     xmm2, xmm1                               ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    pmovmskb    tempd, xmm4                              ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
    paddw       xmm1, xmm2                               ;F: w1[i] += w2[i];
    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1     ;F: t[40+i] = w1[i];
    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
                                                         ;        (Row 4, offset 1)
; block is no longer needed from here on; rdx is reused as free_bits.
%undef block
%define free_bitsq  rdx
%define free_bitsd  edx
%define free_bitsb  dl
    pcmpeqw     xmm1, xmm0                               ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    shl         tempq, 48                                ;Z:     temp <<= 48;
    pxor        xmm2, xmm2                               ;E: w2[i] = 0;
    pcmpgtw     xmm0, xmm5                               ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    paddw       xmm5, xmm0                               ;E: w5[i] += w0[i];
    or          tempq, put_buffer                        ;Z:     temp |= put_buffer;
    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5     ;E: t[32+i] = w5[i];
    lea         t, [dword t - 2]                         ;Z:     t = &t[-1];
    pcmpeqw     xmm5, xmm2                               ;E: w5[i] = (w5[i] == 0 ? -1 : 0);

    packsswb    xmm5, xmm1                               ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
                                                         ;    w/ signed saturation

    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
                                                         ;Z:     nbits += dctbl->ehufsi[nbits];
; dctbl is no longer needed from here on; r8 is reused as code_temp.
%undef dctbl
%define code_temp  r8d
    pmovmskb    indexd, xmm5                             ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
    mov         free_bitsd, [state+working_state.cur.free_bits]
                                                         ;Z:     free_bits = state->cur.free_bits;
    pcmpeqw     xmm1, xmm1                               ;Z:     b1[i] = 0xFF;
    shl         index, 32                                ;Z:     index <<= 32;
    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
                                                         ;Z:     put_buffer = state->cur.put_buffer.simd;
    or          index, tempq                             ;Z:     index |= temp;
    not         index                                    ;Z:     index = ~index;
    sub         free_bitsb, nbitsb                       ;Z:     if ((free_bits -= nbits) >= 0)
    jnl         .ENTRY_SKIP_EMIT_CODE                    ;Z:         goto .ENTRY_SKIP_EMIT_CODE;
    align       16
.EMIT_CODE:                                              ;Z:     .EMIT_CODE:
    EMIT_QWORD  .BLOOP_COND                              ;Z:     insert code, flush buffer, goto .BLOOP_COND

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.BRLOOP:                                ; do {
    lea         code_temp, [nbitsq - 16]  ; code_temp = nbits - 16;
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
                                        ; nbits = actbl->ehufsi[0xf0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
                                        ; code = actbl->ehufco[0xf0];
    sub         free_bitsb, nbitsb      ; if ((free_bits -= nbits) <= 0)
    jle         .EMIT_BRLOOP_CODE       ;     goto .EMIT_BRLOOP_CODE;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbits, code_temp        ; nbits = code_temp;
    or          put_buffer, codeq       ; put_buffer |= code;
    cmp         nbits, 16               ; if (nbits <= 16)
    jle         .ERLOOP                 ;     break;
    jmp         .BRLOOP                 ; } while (1);

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
    times 5     nop
.ENTRY_SKIP_EMIT_CODE:                  ; .ENTRY_SKIP_EMIT_CODE:
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    or          put_buffer, codeq       ; put_buffer |= code;
.BLOOP_COND:                            ; .BLOOP_COND:
    test        index, index            ; if (index != 0)
    jz          .ELOOP                  ; {
.BLOOP:                                 ;   do {
    xor         nbits, nbits            ;     nbits = 0;  /* kill tzcnt input dependency */
    tzcnt       nbitsq, index           ;     nbits = # of trailing 0 bits in index
    inc         nbits                   ;     ++nbits;
    lea         t, [t + nbitsq * 2]     ;     t = &t[nbits];
    shr         index, nbitsb           ;     index >>= nbits;
.EMIT_BRLOOP_CODE_END:                  ; .EMIT_BRLOOP_CODE_END:
    cmp         nbits, 16               ;     if (nbits > 16)
    jg          .BRLOOP                 ;       goto .BRLOOP;
.ERLOOP:                                ; .ERLOOP:
    movsx       codeq, word [t]         ;     code = *t;
    lea         tempd, [nbitsq * 2]     ;     temp = nbits * 2;
    movzx       nbits, byte [NBITS(codeq)]  ; nbits = JPEG_NBITS(code);
    lea         tempd, [nbitsq + tempq * 8]  ; temp = temp * 8 + nbits;
    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
                                        ;     code_temp = actbl->ehufco[temp-16];
    shl         code_temp, nbitsb       ;     code_temp <<= nbits;
    and         code, dword [MASK_BITS(nbitsq)]  ; code &= (1 << nbits) - 1;
    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
                                        ;     nbits += actbl->ehufsi[temp-16];
    or          code, code_temp         ;     code |= code_temp;
    sub         free_bitsb, nbitsb      ;     if ((free_bits -= nbits) <= 0)
    jle         .EMIT_CODE              ;       goto .EMIT_CODE;
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
    test        index, index
    jnz         .BLOOP                  ;   } while (index != 0);
.ELOOP:                                 ; }  /* index != 0 */
    sub         td, esp                 ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
%ifdef WIN64
    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD  ; if (t != 62)
%else
    cmp         td, -2 * SIZEOF_WORD    ; if (t != -2)
%endif
    je          .EFN                    ; {
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
                                        ;   nbits = actbl->ehufsi[0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ; code = actbl->ehufco[0];
    sub         free_bitsb, nbitsb      ;   if ((free_bits -= nbits) <= 0)
    jg          .EFN_SKIP_EMIT_CODE     ;   {
    EMIT_QWORD  .EFN                    ;     insert code, flush buffer
    align       16
.EFN_SKIP_EMIT_CODE:                    ;   } else {
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
.EFN:                                   ; } }
    mov         [state + working_state.cur.put_buffer.simd], put_buffer
                                        ; state->cur.put_buffer.simd = put_buffer;
    mov         byte [state + working_state.cur.free_bits], free_bitsb
                                        ; state->cur.free_bits = free_bits;
%ifdef WIN64
    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
    pop         r12
    pop         rdi
    pop         rsi
    pop         rbp
    pop         rbx
%else
    pop         r12
    pop         rbp
    pop         rbx
%endif
    ret

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_BRLOOP_CODE:
    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
                                        ; insert code, flush buffer,
                                        ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32