;
; jidctflt.asm - floating-point IDCT (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; Syntax: NASM (Intel operand order: op dst, src).  Target: 32-bit x86
; (see "BITS 32" below) with SSE and SSE2 instructions.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Helper macros: 64-bit-granular interleaves of two packed-single registers,
; built on shufps.  Both overwrite %1 and leave %2 untouched.

; Immediate 0x44 = selectors (0,1,0,1): dwords 0-1 of %1 followed by
; dwords 0-1 of %2.
%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    shufps      %1, %2, 0x44
%endmacro

; Immediate 0xEE = selectors (2,3,2,3): dwords 2-3 of %1 followed by
; dwords 2-3 of %2.
%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    shufps      %1, %2, 0xEE
%endmacro

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_sse2)

; Constant table used by the IDCT routine in this file.  Every entry is
; one full 16-byte vector (4 floats or 16 bytes) so that it can be used
; directly as a packed memory operand.
EXTN(jconst_idct_float_sse2):

; Packed-single multipliers.  Numerically: 1.414... = sqrt(2),
; 1.847... = 2*cos(pi/8), 1.082... = 2*(cos(pi/8) - cos(3*pi/8)),
; -2.613... = -2*(cos(pi/8) + cos(3*pi/8)).
PD_1_414        times 4  dd  1.414213562373095048801689
PD_1_847        times 4  dd  1.847759065022573512256366
PD_1_082        times 4  dd  1.082392200292393968799446
PD_M2_613       times 4  dd -2.613125929752753055713286
; 100663296.0 = 1.5 * 2^26.  Adding it to a float forces the unit in the
; last place to 8, which is what the float->int rounding step relies on.
PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
; Byte-wise bias added to the packed signed output bytes.
; CENTERJSAMPLE is defined outside this file.
PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Dequantizes the coefficient block (multiplying by the dct_table entries),
; runs a two-pass floating-point inverse DCT and stores the resulting
; samples to output_buf[0..DCTSIZE-1] starting at column output_col.
;
; In:     all four arguments are read from the stack, at offsets 8/12/16/20
;         from the frame base (saved ebp at +0, return address at +4).
; Out:    nothing is returned in a register; result is written through
;         output_buf.
; Saved:  ebx, esi, edi, ebp (pushed/popped below).
; Clobb:  eax, ecx, edx, xmm0-xmm7, flags.
; Stack:  below an SIZEOF_XMMWORD-aligned ebp the routine reserves
;         wk[WK_NUM] (two xmmword spill slots) and, below those,
;         workspace[DCTSIZE2] of FAST_FLOAT for the pass-1 results.
; NOTE(review): get_GOT / GOTOFF / pushpic / poppic, the *BLOCK addressing
;         macros and alignx come from the included .inc files; their exact
;         expansions are not visible here.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_sse2)

EXTN(jsimd_idct_float_sse2):
    ; -- Prologue: build an aligned frame.  The unaligned frame base (the
    ;    address of the saved ebp) is kept in eax and also stored at
    ;    [aligned ebp + 0] so that it can be reloaded later via original_ebp.
    push        ebp
    mov         eax, esp                    ; eax = original ebp
    sub         esp, byte 4                 ; room for the saved frame base
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.
    ;
    ; Each iteration handles 4 columns (DCTSIZE/4 iterations in total).
    ; eax still holds the unaligned frame base set in the prologue, which
    ; is why the reload below is commented out.
    ; NOTE(review): this relies on get_GOT not clobbering eax - confirm in
    ; jsimdext.inc.

;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Quick reject: a dword from each of rows 1 and 2 is OR-ed together;
    ; any set bit means the AC terms are not all zero.
    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; Full test: OR together 64 bits from each of rows 1..7.
    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm2
    por         xmm3, xmm4
    por         xmm5, xmm6
    por         xmm1, xmm3
    por         xmm5, xmm7
    por         xmm1, xmm5
    ; Signed saturation keeps every non-zero word non-zero, so the four
    ; low words collapse into the low dword, which is then tested.
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero
    ;
    ; Only the dequantized row-0 term is needed: it is broadcast across
    ; all eight workspace slots belonging to each of the four columns.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

    ; Sign-extend words to dwords: duplicate each word, then shift the
    ; dword right arithmetically by one word width.
    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm1, xmm0
    movaps      xmm2, xmm0
    movaps      xmm3, xmm0

    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part
    ;
    ; Load rows 0/2/4/6 (4 coefficients each), sign-extend to dwords,
    ; convert to float and dequantize.

    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)

    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)

    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    ; tmp2/tmp3 are spilled: all eight xmm registers are needed for the
    ; odd part.  tmp0/tmp1 stay live in xmm6/xmm7.
    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part
    ;
    ; Same load / sign-extend / convert / dequantize sequence for
    ; rows 1/3/5/7.

    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)

    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)

    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage
    ;
    ; Combine even and odd halves into data0..data7, transposing 4x4
    ; tiles on the way so each workspace vector holds one column's values.

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    subps       xmm2, xmm3              ; xmm2=tmp5

    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)

    ; Swap: reload the spilled tmp2/tmp3 and reuse the same two slots
    ; for the half-transposed rows 6/7.
    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3

    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm0, xmm7
    movaps      xmm3, xmm5
    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)

    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)

    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)

    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)

    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

.nextcolumn:
    ; Advance by 4 input columns / 4 quantizer columns / 4 workspace rows.
    ; The edi step has no "byte" size override, unlike the other two.
    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block
    ;
    ; esi has advanced by 8 coefficients here, so the addresses below
    ; start DCTSIZE2 coefficients past the start of the current block.

    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.
    ;
    ; Each iteration produces 4 output rows of 8 samples.  eax is first
    ; the reloaded frame base, then is overwritten with output_col.

    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]
    mov         ecx, DCTSIZE/4                     ; ctr
    alignx      16, 7
.rowloop:

    ; -- Even part

    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm0
    movaps      xmm5, xmm1
    subps       xmm0, xmm2              ; xmm0=tmp11
    subps       xmm1, xmm3
    addps       xmm4, xmm2              ; xmm4=tmp10
    addps       xmm5, xmm3              ; xmm5=tmp13

    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    subps       xmm1, xmm5              ; xmm1=tmp12

    movaps      xmm6, xmm4
    movaps      xmm7, xmm0
    subps       xmm4, xmm5              ; xmm4=tmp3
    subps       xmm0, xmm1              ; xmm0=tmp2
    addps       xmm6, xmm5              ; xmm6=tmp0
    addps       xmm7, xmm1              ; xmm7=tmp1

    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    movaps      XMMWORD [wk(0)], xmm0   ; tmp2

    ; -- Odd part

    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movaps      xmm4, xmm2
    movaps      xmm0, xmm5
    addps       xmm2, xmm1              ; xmm2=z11
    addps       xmm5, xmm3              ; xmm5=z13
    subps       xmm4, xmm1              ; xmm4=z12
    subps       xmm0, xmm3              ; xmm0=z10

    movaps      xmm1, xmm2
    subps       xmm2, xmm5
    addps       xmm1, xmm5              ; xmm1=tmp7

    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11

    movaps      xmm3, xmm0
    addps       xmm0, xmm4
    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    addps       xmm3, xmm0              ; xmm3=tmp12
    subps       xmm4, xmm0              ; xmm4=tmp10

    ; -- Final output stage

    subps       xmm3, xmm1              ; xmm3=tmp6
    movaps      xmm5, xmm6
    movaps      xmm0, xmm7
    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    subps       xmm2, xmm3              ; xmm2=tmp5

    ; Float -> integer with divide-by-8 and rounding, without a convert
    ; instruction: PD_RNDINT_MAGIC is 1.5 * 2^26, where one float ULP is 8,
    ; so after the add the low word of each dword holds round(data/8)
    ; (two's complement) for in-range values.  The "**" lanes below are
    ; the remaining high bits, removed by the mask or shifted out.
    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm3, xmm3
    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm6, xmm1  ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    addps       xmm7, xmm1  ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    addps       xmm0, xmm1  ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    addps       xmm5, xmm1  ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

    ; Interleave pairs of results as words: even sample kept in the low
    ; word (mask), odd sample moved to the high word (shift), then merged.
    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)

    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3

    addps       xmm4, xmm2              ; xmm4=tmp4
    movaps      xmm7, xmm1
    movaps      xmm5, xmm3
    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)

    ; Same magic-number rounding and word interleave for data2..data5;
    ; the constant and the mask are rebuilt because xmm1/xmm3 were reused.
    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
    pcmpeqd     xmm4, xmm4
    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

    addps       xmm3, xmm2  ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    addps       xmm7, xmm2  ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    addps       xmm1, xmm2  ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    addps       xmm5, xmm2  ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)

    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

    ; Narrow words to bytes with signed saturation, then add the
    ; CENTERJSAMPLE bias to every byte (paddb wraps modulo 256).
    packsswb    xmm6, xmm3  ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    packsswb    xmm1, xmm0  ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    paddb       xmm6, xmm2
    paddb       xmm1, xmm2

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm1  ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm1  ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

    movdqa      xmm7, xmm6              ; transpose coefficients(phase 3)
    punpckldq   xmm6, xmm4  ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm7, xmm4  ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

    ; 0x4E swaps the two 64-bit halves, bringing the second row of each
    ; pair into the low quadword so that movq can store it.
    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

    ; ebx is borrowed as a second row pointer for the stores, so the GOT
    ; address it holds is saved around them.
    pushpic     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

    poppic      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    ; -- Epilogue: restore callee-saved registers in reverse push order,
    ;    then unwind the aligned frame via the frame base saved at [ebp].
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32