;
; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform).  The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

EXTN(jconst_idct_float_sse2):

PD_1_414        times 4  dd 1.414213562373095048801689
PD_1_847        times 4  dd 1.847759065022573512256366
PD_1_082        times 4  dd 1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
PD_RNDINT_MAGIC times 4  dd 100663296.0  ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [workspace]
    collect_args 4
    push        rbx

    ; ---- Pass 1: process columns from input, store into work array.
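    ; NOTE: The column pass below follows the same flowgraph as jidctflt.c
    ; (the AA&N-style float IDCT), with dequantization folded in as a
    ; multiplication by the float multiplier table addressed through rdx.
    ; The PD_* constants above are the usual AA&N butterfly multipliers:
    ; sqrt(2), 2*cos(pi/8), 2*(cos(pi/8) - cos(3*pi/8)), and
    ; -2*(cos(pi/8) + cos(3*pi/8)).  Each trip through .columnloop handles
    ; four columns at once (rcx = DCTSIZE/4), converts the 16-bit
    ; coefficients to floats, runs the even/odd butterflies, and stores the
    ; results (already transposed) into the on-stack workspace so that
    ; Pass 2 can consume them as rows.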

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr
    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rcx, DCTSIZE/4          ; ctr
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                   ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)   ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                   ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
%endif
.columnDCT:

    ; -- Even part

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                   ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                   ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)   ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)   ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                   ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                   ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                   ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                   ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)   ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)   ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                   ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                   ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

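    ; The next block completes the even-part butterfly exactly as in
    ; jidctflt.c:  tmp0 = tmp10 + tmp13, tmp3 = tmp10 - tmp13,
    ; tmp1 = tmp11 + tmp12, tmp2 = tmp11 - tmp12.  tmp2 and tmp3 are
    ; spilled to wk(0)/wk(1) because the odd part needs xmm0-xmm5.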
    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                   ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                   ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)   ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)   ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                   ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                   ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                   ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                   ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)   ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)   ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                   ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                   ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

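    ; Phase 2 of the 4x4 transpose: phase 1 above interleaved the rows
    ; pairwise with unpcklps/unpckhps; the unpcklps2/unpckhps2 macros
    ; (shufps with 0x44/0xEE, defined at the top of this file) now combine
    ; the 64-bit halves so that each register holds one full column of the
    ; block, e.g. (00 10 20 30), ready to be stored as a workspace row.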
    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    add         rsi, byte 4*SIZEOF_JCOEF            ; coef_block
    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE  ; quantptr
    add         rdi, 4*DCTSIZE*SIZEOF_FAST_FLOAT    ; wsptr
    dec         rcx                                 ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         rax, [original_rbp]
    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d
    mov         rcx, DCTSIZE/4          ; ctr
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [rel PD_1_414]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

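    ; In the row pass the final stage also converts to samples: the odd/even
    ; results are combined into data0..data7, then each value is rounded and
    ; descaled by 8 using the PD_RNDINT_MAGIC trick: adding 1.5 * 2^26
    ; leaves roundint(value/8) in the low-order mantissa bits, which the
    ; pand/pslld/por sequence below extracts as 16-bit integers.  The words
    ; are then packed to bytes with signed saturation, and CENTERJSAMPLE
    ; (normally 128) is added to recenter them in the unsigned sample range.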
    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    packsswb    xmm6, xmm3              ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0              ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1              ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1              ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6              ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4              ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4              ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    pshufd      xmm5, xmm6, 0x4E        ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E        ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

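    ; Store the results: the low quadwords of xmm6, xmm5, xmm7, and xmm3 now
    ; hold output rows 0, 1, 2, and 3 of this group of four, so each
    ; .rowloop iteration writes four 8-sample rows.  The row pointers come
    ; from output_buf (rdi), and rax holds output_col, the starting column
    ; within each row.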
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         rdi, byte 4*SIZEOF_JSAMPROW
    dec         rcx                            ; ctr
    jnz         near .rowloop

    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32